Skip to content

Commit

Permalink
Merge #32733
Browse files Browse the repository at this point in the history
32733: kv, sql: replace TestTraceFromErrorReplica r=andreimatei a=andreimatei

That test was teminally confused and confusing. It was written to
prevent a regression from a time where the DistSender used to have a bug
causing it to not ingest snowball trace info for errors for which it
performed retries (e.g. NotLeaseHolderError). That bug was repaired by
pushing the ingesting logic from the DistSender down to the gRPC
transport. The test I wrote at the time was written at a very high level - using 
the SQL interface. If you look at it with generousity it might
have seemed like a good idea at the time, but particuarly with the
advent of DistSQL, the test became way confused.
It worked by using ManualReplication (which meant that, consciensiously
or not, the test was getting a single range). It then created a table
and controlled its leaseholder, and then traced a select from a node
that was not aware of this leaseholder, and asserted it was seeing a
NLHE in the trace.
The intent was from the error to come from a Scan. However, at least
since DistSQL happened, the error was coming from the planning process
(creating the physical plan), from a range resolving operation (because
the node doing the select became unaware of the leases for the meta
ranges too incidentally), not from a TableReader. Although arguably the
test was still proving what it wanted to prove, it was confusing af.

Also, the test is starting to hopelessly break with a different thing I
have in the works where clusters get more than one system range even
when used with ManualReplication.

The patch replaces the test with a unit test for the gRPC transport.

Release note: None

Co-authored-by: Andrei Matei <[email protected]>
  • Loading branch information
craig[bot] and andreimatei committed Dec 3, 2018
2 parents a3bf5c8 + 3470058 commit 7174fdd
Show file tree
Hide file tree
Showing 2 changed files with 75 additions and 59 deletions.
75 changes: 75 additions & 0 deletions pkg/kv/transport_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,18 @@
package kv

import (
"context"
"fmt"
"testing"

"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/testutils"
"github.com/cockroachdb/cockroach/pkg/util/caller"
"github.com/cockroachdb/cockroach/pkg/util/leaktest"
"github.com/cockroachdb/cockroach/pkg/util/log"
"github.com/cockroachdb/cockroach/pkg/util/tracing"
opentracing "github.com/opentracing/opentracing-go"
"google.golang.org/grpc"
)

func TestTransportMoveToFront(t *testing.T) {
Expand Down Expand Up @@ -97,3 +104,71 @@ func TestTransportMoveToFront(t *testing.T) {
t.Fatalf("expected client index 1; got %d", gt.clientIndex)
}
}

// TestSpanImport tests that the gRPC transport ingests trace information that
// came from gRPC responses (through the "snowball tracing" mechanism).
func TestSpanImport(t *testing.T) {
defer leaktest.AfterTest(t)()
ctx := context.Background()
metrics := makeDistSenderMetrics()
gt := grpcTransport{
opts: SendOptions{
metrics: &metrics,
},
}
server := mockInternalClient{}
// Let's spice things up and simulate an error from the server.
expectedErr := "my expected error"
server.pErr = roachpb.NewErrorf(expectedErr)

recCtx, getRec, cancel := tracing.ContextWithRecordingSpan(ctx, "test")
defer cancel()

server.tr = opentracing.SpanFromContext(recCtx).Tracer().(*tracing.Tracer)

br, err := gt.sendBatch(recCtx, roachpb.NodeID(1), &server, roachpb.BatchRequest{})
if err != nil {
t.Fatal(err)
}
if !testutils.IsPError(br.Error, expectedErr) {
t.Fatalf("expected err: %s, got: %q", expectedErr, br.Error)
}
expectedMsg := "mockInternalClient processing batch"
if tracing.FindMsgInRecording(getRec(), expectedMsg) == -1 {
t.Fatalf("didn't find expected message in trace: %s", expectedMsg)
}
}

// mockInternalClient is an implementation of roachpb.InternalClient.
// It simulates aspects of how the Node normally handles tracing in gRPC calls.
type mockInternalClient struct {
tr *tracing.Tracer
pErr *roachpb.Error
}

var _ roachpb.InternalClient = &mockInternalClient{}

// Batch is part of the roachpb.InternalClient interface.
func (m *mockInternalClient) Batch(
ctx context.Context, in *roachpb.BatchRequest, opts ...grpc.CallOption,
) (*roachpb.BatchResponse, error) {
sp := m.tr.StartRootSpan("mock", nil /* logTags */, tracing.RecordableSpan)
defer sp.Finish()
tracing.StartRecording(sp, tracing.SnowballRecording)
ctx = opentracing.ContextWithSpan(ctx, sp)

log.Eventf(ctx, "mockInternalClient processing batch")
br := &roachpb.BatchResponse{}
br.Error = m.pErr
if rec := tracing.GetRecording(sp); rec != nil {
br.CollectedSpans = append(br.CollectedSpans, rec...)
}
return br, nil
}

// RangeFeed is part of the roachpb.InternalClient interface.
func (m *mockInternalClient) RangeFeed(
ctx context.Context, in *roachpb.RangeFeedRequest, opts ...grpc.CallOption,
) (roachpb.Internal_RangeFeedClient, error) {
return nil, fmt.Errorf("unsupported RangeFeed call")
}
59 changes: 0 additions & 59 deletions pkg/sql/trace_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -554,62 +554,3 @@ func TestKVTraceDistSQL(t *testing.T) {
t.Fatal(err)
}
}

// Test that spans are collected from RPC that returned (structured) errors, in
// addition to the spans from the successful retry of the RPC.
func TestTraceFromErrorReplica(t *testing.T) {
defer leaktest.AfterTest(t)()

// We're going to set up a scenario in which a node is not aware of the
// leaseholder for a range: the replicas are on n1,n2,n3 with the lease on n2.
// A query originating from n4 will initially contact n1, which will redirect
// to n2. We test that we have collected the spans from n1.

const numNodes = 4
cluster := serverutils.StartTestCluster(t, numNodes, base.TestClusterArgs{
ReplicationMode: base.ReplicationManual,
ServerArgs: base.TestServerArgs{
UseDatabase: "test",
},
})
defer cluster.Stopper().Stop(context.TODO())
clusterDB := cluster.ServerConn(0)
n4 := cluster.ServerConn(3)

if _, err := clusterDB.Exec(`
CREATE DATABASE test;
CREATE TABLE test.foo (id INT PRIMARY KEY);
`); err != nil {
t.Fatal(err)
}

// Do a read on the table through n4 so that it populates its lease cache.
if _, err := n4.Exec("SELECT * FROM foo"); err != nil {
t.Fatal(err)
}

// Replicate foo on n1,n2,n3 with the lease on n2.
if _, err := clusterDB.Exec(`
ALTER TABLE test.foo EXPERIMENTAL_RELOCATE VALUES (ARRAY[2,1,3], 1);
`); err != nil {
t.Fatal(err)
}

// Query through n4 and look for a redirect log message from n1.
if _, err := n4.Exec("SET tracing = on; SELECT * FROM foo; SET tracing = off"); err != nil {
t.Fatal(err)
}
rows, err := n4.Query(
`SELECT tag, message
FROM [SHOW TRACE FOR SESSION]
WHERE tag LIKE '%n1%' AND
message LIKE '%NotLeaseHolderError%'
`)
if err != nil {
t.Fatal(err)
}
defer rows.Close()
if !rows.Next() {
t.Fatal("no redirect message from n1 found")
}
}

0 comments on commit 7174fdd

Please sign in to comment.