Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix negative WaitGroup counter issue #712

Merged
merged 3 commits into from
Jan 20, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions pulsar/consumer_partition.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ import (
pb "github.com/apache/pulsar-client-go/pulsar/internal/pulsar_proto"
"github.com/apache/pulsar-client-go/pulsar/log"

"go.uber.org/atomic"
uAtomic "go.uber.org/atomic"
)

var (
Expand Down Expand Up @@ -110,10 +110,10 @@ type partitionConsumer struct {

// this is needed for sending ConsumerMessage on the messageCh
parentConsumer Consumer
state atomic.Int32
state uAtomic.Int32
options *partitionConsumerOpts

conn atomic.Value
conn uAtomic.Value

topic string
name string
Expand Down
61 changes: 32 additions & 29 deletions pulsar/producer_partition.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ import (
pb "github.com/apache/pulsar-client-go/pulsar/internal/pulsar_proto"
"github.com/apache/pulsar-client-go/pulsar/log"

ua "go.uber.org/atomic"
uAtomic "go.uber.org/atomic"
)

type producerState int32
Expand All @@ -62,12 +62,12 @@ var (
var errTopicNotFount = "TopicNotFound"

type partitionProducer struct {
state ua.Int32
state uAtomic.Int32
client *client
topic string
log log.Logger

conn atomic.Value
conn uAtomic.Value

options *ProducerOptions
producerName string
Expand Down Expand Up @@ -696,7 +696,7 @@ func (p *partitionProducer) internalFlush(fr *flushRequest) {

pi, ok := p.pendingQueue.PeekLast().(*pendingItem)
if !ok {
fr.waitGroup.Done()
close(fr.doneCh)
return
}

Expand All @@ -709,35 +709,39 @@ func (p *partitionProducer) internalFlush(fr *flushRequest) {
// The last item in the queue has been completed while we were
// looking at it. It's safe at this point to assume that every
// message enqueued before Flush() was called are now persisted
fr.waitGroup.Done()
close(fr.doneCh)
return
}

sendReq := &sendRequest{
msg: nil,
callback: func(id MessageID, message *ProducerMessage, e error) {
fr.err = e
fr.waitGroup.Done()
close(fr.doneCh)
},
}

pi.sendRequests = append(pi.sendRequests, sendReq)
}

func (p *partitionProducer) Send(ctx context.Context, msg *ProducerMessage) (MessageID, error) {
wg := sync.WaitGroup{}
wg.Add(1)

var err error
var msgID MessageID

// use atomic bool to avoid race
isDone := uAtomic.NewBool(false)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is isDone needed? Each call to this function will get its own channel instance.

Copy link
Member Author

@wolfstudy wolfstudy Jan 19, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, as the description of #711 explains, the callback of send is triggered multiple times, which produces the following error:

panic: sync: negative WaitGroup counter
goroutine 2204 [running]:
sync.(*WaitGroup).Add(0x18dbfc0, 0xc0020d8d40)
sync/waitgroup.go:74 +0x105
sync.(*WaitGroup).Done(...)
sync/waitgroup.go:99
github.com/apache/pulsar-client-go/pulsar.(*partitionProducer).Send.func1({0x1965558, 0xc01a1ab1d0}, 0xc01a841160, {0x0, 0x0})
github.com/apache/[email protected]/pulsar/producer_partition.go:722 +0x7e
github.com/apache/pulsar-client-go/pulsar.(*partitionProducer).ReceivedSendReceipt(0xc001017b00, 0xc017280480)
github.com/apache/[email protected]/pulsar/producer_partition.go:828 +0x7ef
github.com/apache/pulsar-client-go/pulsar/internal.(*connection).handleSendReceipt(0xc001188420, 0xc017280480)
github.com/apache/[email protected]/pulsar/internal/connection.go:673 +0xe8
github.com/apache/pulsar-client-go/pulsar/internal.(*connection).internalReceivedCommand(0xc001188420, 0xc006b31680, {0x0, 0x0})
github.com/apache/[email protected]/pulsar/internal/connection.go:558 +0x14a
github.com/apache/pulsar-client-go/pulsar/internal.(*connection).run(0xc001188420)
github.com/apache/[email protected]/pulsar/internal/connection.go:415 +0x3ba
github.com/apache/pulsar-client-go/pulsar/internal.(*connection).start.func1()
github.com/apache/[email protected]/pulsar/internal/connection.go:227 +0x65
created by github.com/apache/pulsar-client-go/pulsar/internal.(*connection).start
github.com/apache/[email protected]/pulsar/internal/connection.go:223 +0x75

So in the callback of send, we introduce the atomic variable of isDone to ensure that at any time, for a send request, its callback will only be called once

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this callback is still called more than once; isDone only records whether the callback has already run, it does not prevent the extra invocations themselves.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, use CAS to avoid this

doneCh := make(chan struct{})

p.internalSendAsync(ctx, msg, func(ID MessageID, message *ProducerMessage, e error) {
err = e
msgID = ID
wg.Done()
if isDone.CAS(false, true) {
err = e
msgID = ID
close(doneCh)
}
}, true)

wg.Wait()
// wait for send request to finish
<-doneCh
return msgID, err
}

Expand Down Expand Up @@ -849,7 +853,7 @@ func (p *partitionProducer) ReceivedSendReceipt(response *pb.CommandSendReceipt)
}

func (p *partitionProducer) internalClose(req *closeProducer) {
defer req.waitGroup.Done()
defer close(req.doneCh)
if !p.casProducerState(producerReady, producerClosing) {
return
}
Expand Down Expand Up @@ -884,14 +888,15 @@ func (p *partitionProducer) LastSequenceID() int64 {
}

// Flush posts a flushRequest to the producer's event loop and blocks
// until every message enqueued before this call has been persisted.
//
// It uses a done channel instead of a WaitGroup: the event loop closes
// doneCh exactly once when the flush completes, so a callback that fires
// more than once cannot trigger the "negative WaitGroup counter" panic
// the previous implementation was vulnerable to (see #711).
func (p *partitionProducer) Flush() error {
	flushReq := &flushRequest{
		doneCh: make(chan struct{}),
		err:    nil,
	}
	p.eventsChan <- flushReq

	// wait for the flush request to complete
	<-flushReq.doneCh
	return flushReq.err
}

func (p *partitionProducer) getProducerState() producerState {
Expand All @@ -914,13 +919,11 @@ func (p *partitionProducer) Close() {
return
}

wg := sync.WaitGroup{}
wg.Add(1)

cp := &closeProducer{&wg}
cp := &closeProducer{doneCh: make(chan struct{})}
p.eventsChan <- cp

wg.Wait()
// wait for close producer request to complete
<-cp.doneCh
}

type sendRequest struct {
Expand All @@ -932,12 +935,12 @@ type sendRequest struct {
}

// closeProducer is the event-loop request to close the producer.
// The event loop closes doneCh once the close has been processed;
// callers block on the channel instead of a WaitGroup so that a
// duplicated completion cannot panic with a negative counter.
type closeProducer struct {
	doneCh chan struct{}
}

// flushRequest is the event-loop request to flush pending messages.
// doneCh is closed exactly once when the flush finishes; err carries
// any failure observed while persisting the pending queue.
type flushRequest struct {
	doneCh chan struct{}
	err    error
}

func (i *pendingItem) Complete() {
Expand Down