Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion common/deliverclient/blocksprovider/bft_deliverer.go
Original file line number Diff line number Diff line change
Expand Up @@ -323,7 +323,9 @@ func (d *BFTDeliverer) Stop() {

d.stopFlag = true
close(d.DoneC)
d.blockReceiver.Stop()
if d.blockReceiver != nil {
d.blockReceiver.Stop()
}
}

func (d *BFTDeliverer) FetchBlocks(source *orderers.Endpoint) {
Expand Down
4 changes: 3 additions & 1 deletion common/deliverclient/blocksprovider/deliverer.go
Original file line number Diff line number Diff line change
Expand Up @@ -244,7 +244,9 @@ func (d *Deliverer) Stop() {

d.stopFlag = true
close(d.DoneC)
d.blockReceiver.Stop()
if d.blockReceiver != nil {
d.blockReceiver.Stop()
}
d.Logger.Info("Deliverer stopped")
}

Expand Down
51 changes: 51 additions & 0 deletions common/deliverclient/blocksprovider/stop_before_connect_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
/*
Copyright IBM Corp. All Rights Reserved.

SPDX-License-Identifier: Apache-2.0
*/

package blocksprovider_test

import (
"testing"

"github.com/hyperledger/fabric-lib-go/common/flogging"
"github.com/hyperledger/fabric/common/deliverclient/blocksprovider"
"github.com/stretchr/testify/require"
)

// TestDelivererStopBeforeConnect verifies that Stop() is safe to call before
// DeliverBlocks() has established a connection to an orderer, i.e. while
// blockReceiver is still nil.
//
// Before the nil guard was added, Stop() dereferenced d.blockReceiver
// unconditionally and panicked on this path. Production reaches it whenever a
// peer renounces gossip leadership (gossip/service/gossip_service.go calls
// StopDeliverForChannel) while the orderer is unreachable — common during
// network partitions or orderer maintenance.
func TestDelivererStopBeforeConnect(t *testing.T) {
	deliverer := &blocksprovider.Deliverer{
		ChannelID: "test-channel",
		DoneC:     make(chan struct{}),
		Logger:    flogging.MustGetLogger("blocksprovider.test"),
	}

	// Stop() must be a no-op on the receiver side when nothing was started.
	require.NotPanics(t, deliverer.Stop)
}

// TestBFTDelivererStopBeforeConnect covers the BFT flavour of the same bug:
// BFTDeliverer.Stop() in bft_deliverer.go had the identical unconditional
// d.blockReceiver.Stop() call with no nil guard.
func TestBFTDelivererStopBeforeConnect(t *testing.T) {
	bftDeliverer := &blocksprovider.BFTDeliverer{
		ChannelID: "test-channel",
		DoneC:     make(chan struct{}),
		Logger:    flogging.MustGetLogger("BFTDeliverer.test"),
	}

	// Stopping before any block receiver exists must not panic.
	require.NotPanics(t, bftDeliverer.Stop)
}
60 changes: 60 additions & 0 deletions diff.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
diff --git a/gossip/discovery/discovery_test.go b/gossip/discovery/discovery_test.go
index 01b80fc4f..d200cc83b 100644
--- a/gossip/discovery/discovery_test.go
+++ b/gossip/discovery/discovery_test.go
@@ -1975,3 +1975,55 @@ func TestHandleAliveMessage_RelearnsMemberAfterConcurrentPurge(t *testing.T) {
require.True(t, inID2Member, "member should be present in id2Member after re-learning")
require.True(t, inAliveLastTS, "member should be present in aliveLastTS after re-learning")
}
+
+func TestLearnExistingMembersNilMemberPanic(t *testing.T) {
+ // 1) Initialize a discovery instance (use existing helpers like createDiscoveryInstanceWithNoGossip).
+ inst := createDiscoveryInstanceWithNoGossip(10000, "testInst", nil)
+ defer inst.Stop()
+
+ // Access the underlying implementation
+ discImpl := inst.discoveryImpl()
+
+ // 2) Prepare a PKIid and endpoint.
+ pkiID := common.PKIidType("test-pki-id")
+ endpoint := "localhost:1234"
+
+ // 3) Under lock, insert an entry into aliveLastTS for that PKIid.
+ // 4) Do NOT insert the corresponding entry into id2Member (simulate it being purged).
+ discImpl.lock.Lock()
+ discImpl.aliveLastTS[string(pkiID)] = &timestamp{
+ incTime: time.Now(),
+ seqNum: 1,
+ lastSeen: time.Now(),
+ }
+ discImpl.lock.Unlock()
+
+ // 5) Build a valid AliveMessage and wrap it with protoext.NoopSign.
+ aliveMsg := &proto.GossipMessage{
+ Tag: proto.GossipMessage_EMPTY,
+ Content: &proto.GossipMessage_AliveMsg{
+ AliveMsg: &proto.AliveMessage{
+ Membership: &proto.Member{
+ PkiId: pkiID,
+ Endpoint: endpoint,
+ },
+ Timestamp: &proto.PeerTime{
+ IncNum: uint64(time.Now().UnixNano()),
+ SeqNum: 2,
+ },
+ },
+ },
+ }
+ signedMsg, err := protoext.NoopSign(aliveMsg)
+ require.NoError(t, err)
+
+ // 6) Invoke the relevant flow directly.
+ // We call learnExistingMembers() instead of handleAliveMessage() because handleAliveMessage
+ // checks if the member is in id2Member under a read-lock and drops the message if not.
+ // In the exact concurrency flaw, the member is valid during handleAliveMessage's read lock,
+ // but gets purged before learnExistingMembers acquires its write lock. Calling learnExistingMembers
+ // directly avoids flaky timing races while perfectly recreating the problematic inconsistent state.
+ require.NotPanics(t, func() {
+ discImpl.learnExistingMembers([]*protoext.SignedGossipMessage{signedMsg})
+ }, "learnExistingMembers should not panic when member is nil in id2Member")
+}
65 changes: 65 additions & 0 deletions diff_v2.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
diff --git a/gossip/discovery/discovery_test.go b/gossip/discovery/discovery_test.go
index 01b80fc4f..34d0f968b 100644
--- a/gossip/discovery/discovery_test.go
+++ b/gossip/discovery/discovery_test.go
@@ -1975,3 +1975,60 @@ func TestHandleAliveMessage_RelearnsMemberAfterConcurrentPurge(t *testing.T) {
require.True(t, inID2Member, "member should be present in id2Member after re-learning")
require.True(t, inAliveLastTS, "member should be present in aliveLastTS after re-learning")
}
+
+func TestLearnExistingMembers_NilMemberAfterConcurrentPurge(t *testing.T) {
+ // 1) Initialize a discovery instance (use existing helpers like createDiscoveryInstanceWithNoGossip).
+ inst := createDiscoveryInstanceWithNoGossip(10000, "testInst", nil)
+ defer inst.Stop()
+
+ // Access the underlying implementation
+ d := inst.discoveryImpl()
+
+ // 2) Prepare a PKIid and endpoint.
+ pkiID := common.PKIidType("test-pki-id")
+ endpoint := "localhost:1234"
+
+ // 3) Under lock, insert an entry into aliveLastTS for that PKIid.
+ // 4) Do NOT insert the corresponding entry into id2Member (simulate it being purged).
+ d.lock.Lock()
+ d.aliveLastTS[string(pkiID)] = &timestamp{
+ incTime: time.Now(),
+ seqNum: 1,
+ lastSeen: time.Now(),
+ }
+ d.lock.Unlock()
+
+ // 5) Build a valid AliveMessage and wrap it with protoext.NoopSign.
+ aliveMsg := &proto.GossipMessage{
+ Tag: proto.GossipMessage_EMPTY,
+ Content: &proto.GossipMessage_AliveMsg{
+ AliveMsg: &proto.AliveMessage{
+ Membership: &proto.Member{
+ PkiId: pkiID,
+ Endpoint: endpoint,
+ },
+ Timestamp: &proto.PeerTime{
+ IncNum: uint64(time.Now().UnixNano()),
+ SeqNum: 2,
+ },
+ },
+ },
+ }
+ signedMsg, err := protoext.NoopSign(aliveMsg)
+ require.NoError(t, err)
+
+ // We invoke learnExistingMembers() directly to deterministically reproduce
+ // the inconsistent state where the member is present in aliveLastTS but
+ // missing from id2Member.
+ //
+ // In the real flow, handleAliveMessage first reads state under a read lock,
+ // and then learnExistingMembers acquires a write lock. A concurrent purge
+ // can remove the member between these two steps, leading to a nil access.
+ //
+ // Reproducing this via the full handleAliveMessage path would require a
+ // timing-dependent race, so we simulate the exact post-condition directly
+ // to keep the test deterministic and reliable.
+ require.NotPanics(t, func() {
+ d.learnExistingMembers([]*protoext.SignedGossipMessage{signedMsg})
+ }, "learnExistingMembers should not panic when member is nil in id2Member")
+}
4 changes: 4 additions & 0 deletions gossip/discovery/discovery_impl.go
Original file line number Diff line number Diff line change
Expand Up @@ -843,6 +843,10 @@ func (d *gossipDiscoveryImpl) learnExistingMembers(aliveArr []*protoext.SignedGo

// update member's data
member := d.id2Member[string(am.Membership.PkiId)]
if member == nil {
// The member was concurrently purged from id2Member after the alive
// message was accepted; skip it to avoid a nil pointer dereference.
continue
}
member.Endpoint = am.Membership.Endpoint
member.Metadata = am.Membership.Metadata
member.InternalEndpoint = internalEndpoint
Expand Down
52 changes: 52 additions & 0 deletions gossip/discovery/discovery_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1975,3 +1975,55 @@ func TestHandleAliveMessage_RelearnsMemberAfterConcurrentPurge(t *testing.T) {
require.True(t, inID2Member, "member should be present in id2Member after re-learning")
require.True(t, inAliveLastTS, "member should be present in aliveLastTS after re-learning")
}

// TestLearnExistingMembers_NilMemberAfterConcurrentPurge reproduces the
// inconsistent state left behind by a concurrent purge — a peer present in
// aliveLastTS but missing from id2Member — and asserts that
// learnExistingMembers tolerates it without panicking.
func TestLearnExistingMembers_NilMemberAfterConcurrentPurge(t *testing.T) {
	inst := createDiscoveryInstanceWithNoGossip(0, "testNilMemberAfterPurgeInst", nil)
	defer func() {
		inst.Stop()
		// NOTE(review): this sleep presumably lets background goroutines wind
		// down after Stop(); confirm it is actually required.
		time.Sleep(50 * time.Millisecond)
	}()

	impl := inst.discoveryImpl()

	purgedID := common.PKIidType("test-pki-id")
	purgedEndpoint := "localhost:1234"

	// Deterministically recreate the post-condition of the real race.
	// In the live flow, handleAliveMessage reads state under RLock and
	// learnExistingMembers later re-acquires the write lock; a purge in
	// between removes the member from id2Member while its aliveLastTS entry
	// survives. Driving this through handleAliveMessage would require a
	// timing-dependent race, so we install the state directly.
	impl.lock.Lock()
	impl.aliveLastTS[string(purgedID)] = &timestamp{
		incTime:  time.Now(),
		seqNum:   1,
		lastSeen: time.Now(),
	}
	impl.lock.Unlock()

	// A well-formed alive message for the purged peer.
	msg := &proto.GossipMessage{
		Tag: proto.GossipMessage_EMPTY,
		Content: &proto.GossipMessage_AliveMsg{
			AliveMsg: &proto.AliveMessage{
				Membership: &proto.Member{
					PkiId:    purgedID,
					Endpoint: purgedEndpoint,
				},
				Timestamp: &proto.PeerTime{
					IncNum: uint64(time.Now().UnixNano()),
					SeqNum: 2,
				},
			},
		},
	}

	signed, err := protoext.NoopSign(msg)
	require.NoError(t, err)

	require.NotPanics(t, func() {
		impl.learnExistingMembers([]*protoext.SignedGossipMessage{signed})
	}, "learnExistingMembers should not panic when a member is missing from id2Member")
}
34 changes: 34 additions & 0 deletions test_output.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
=== RUN TestLearnExistingMembersNilMemberPanic
discovery_test.go:2026:
Error Trace: /home/aditya/fabric/gossip/discovery/discovery_test.go:2026
Error: func (assert.PanicTestFunc)(0xba7a00) should not panic
Panic value: runtime error: invalid memory address or nil pointer dereference
Panic stack: goroutine 40 [running]:
runtime/debug.Stack()
/home/aditya/go/pkg/mod/golang.org/toolchain@v0.0.1-go1.26.1.linux-amd64/src/runtime/debug/stack.go:26 +0x5e
github.com/stretchr/testify/assert.didPanic.func1()
/home/aditya/fabric/vendor/github.com/stretchr/testify/assert/assertions.go:1234 +0x67
panic({0xcecc20?, 0x15dddc0?})
/home/aditya/go/pkg/mod/golang.org/toolchain@v0.0.1-go1.26.1.linux-amd64/src/runtime/panic.go:860 +0x13a
github.com/hyperledger/fabric/gossip/discovery.(*gossipDiscoveryImpl).learnExistingMembers(0x1066f128c420, {0x1066f1392378, 0x1, 0x1})
/home/aditya/fabric/gossip/discovery/discovery_impl.go:846 +0x34c
github.com/hyperledger/fabric/gossip/discovery.TestLearnExistingMembersNilMemberPanic.func1()
/home/aditya/fabric/gossip/discovery/discovery_test.go:2027 +0x65
github.com/stretchr/testify/assert.didPanic(0x15e0ea0?)
/home/aditya/fabric/vendor/github.com/stretchr/testify/assert/assertions.go:1239 +0x70
github.com/stretchr/testify/assert.NotPanics({0xe52900, 0x1066f157ab48}, 0x1066f12fa360, {0x1066f12f1f48, 0x1, 0x1})
/home/aditya/fabric/vendor/github.com/stretchr/testify/assert/assertions.go:1310 +0x7e
github.com/stretchr/testify/require.NotPanics({0xe58588, 0x1066f157ab48}, 0x1066f12fa360, {0x1066f12f1f48, 0x1, 0x1})
/home/aditya/fabric/vendor/github.com/stretchr/testify/require/require.go:1669 +0xa6
github.com/hyperledger/fabric/gossip/discovery.TestLearnExistingMembersNilMemberPanic(0x1066f157ab48)
/home/aditya/fabric/gossip/discovery/discovery_test.go:2026 +0x4f3
testing.tRunner(0x1066f157ab48, 0xe47c10)
/home/aditya/go/pkg/mod/golang.org/toolchain@v0.0.1-go1.26.1.linux-amd64/src/testing/testing.go:2036 +0xea
created by testing.(*T).Run in goroutine 1
/home/aditya/go/pkg/mod/golang.org/toolchain@v0.0.1-go1.26.1.linux-amd64/src/testing/testing.go:2101 +0x4c5
Test: TestLearnExistingMembersNilMemberPanic
Messages: learnExistingMembers should not panic when member is nil in id2Member
--- FAIL: TestLearnExistingMembersNilMemberPanic (0.00s)
FAIL
FAIL github.com/hyperledger/fabric/gossip/discovery 0.028s
FAIL