Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 8 additions & 4 deletions gossip/discovery/discovery_impl.go
Original file line number Diff line number Diff line change
Expand Up @@ -505,7 +505,11 @@

d.lock.RLock()
_, known := d.id2Member[string(pkiID)]
<<<<<<< fix-gossip-discovery-toctou-race

Check failure on line 508 in gossip/discovery/discovery_impl.go

View workflow job for this annotation

GitHub Actions / Basic Checks

expected statement, found '<<'
lastAliveTS, isAlive := d.aliveLastTS[string(pkiID)]
=======
_, isAlive := d.aliveLastTS[string(pkiID)]
>>>>>>> main
lastDeadTS, isDead := d.deadLastTS[string(pkiID)]
d.lock.RUnlock()

Expand Down Expand Up @@ -535,10 +539,6 @@
return
}

d.lock.RLock()
lastAliveTS, isAlive := d.aliveLastTS[string(pkiID)]
d.lock.RUnlock()

if isAlive {
if before(lastAliveTS, ts) {
d.learnExistingMembers([]*protoext.SignedGossipMessage{m})
Expand Down Expand Up @@ -843,6 +843,10 @@

// update member's data
member := d.id2Member[string(am.Membership.PkiId)]
if member == nil {
d.logger.Debugf("Member with PkiId %x was purged during alive message processing, skipping update", am.Membership.PkiId)
continue
}
member.Endpoint = am.Membership.Endpoint
member.Metadata = am.Membership.Metadata
member.InternalEndpoint = internalEndpoint
Expand Down
39 changes: 39 additions & 0 deletions gossip/discovery/discovery_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1928,7 +1928,32 @@
return int(port)
}


func TestLearnExistingMembers_NilMemberAfterConcurrentPurge(t *testing.T) {
inst := createDiscoveryInstanceWithNoGossip(33900, "d0", nil)
defer inst.Stop()

d := inst.discoveryImpl()

memberPKIid := common.PKIidType("purged-member-pkiid")
memberEndpoint := "localhost:9999"

// Simulate the TOCTOU race fixed in PR #5397:
// handleAliveMessage sees isAlive=true under one lock, then a concurrent
// purge() removes the member from all maps before learnExistingMembers
// acquires the write lock. The result is a member present in aliveLastTS
// at decision time but absent from id2Member when learnExistingMembers runs.
d.lock.Lock()
d.aliveLastTS[string(memberPKIid)] = &timestamp{
incTime: time.Now(),
seqNum: 1,
lastSeen: time.Now(),
}
// Intentionally NOT adding to id2Member to reproduce the nil dereference:
// before the fix, learnExistingMembers accessed member.Endpoint without
// a nil guard, causing a panic.
d.lock.Unlock()
func TestHandleAliveMessage_RelearnsMemberAfterConcurrentPurge(t *testing.T) {

Check failure on line 1956 in gossip/discovery/discovery_test.go

View workflow job for this annotation

GitHub Actions / Basic Checks

expected '(', found TestHandleAliveMessage_RelearnsMemberAfterConcurrentPurge
inst := createDiscoveryInstanceWithNoGossip(33700, "d0", []string{})
defer inst.Stop()

Expand All @@ -1954,8 +1979,22 @@
},
Timestamp: &proto.PeerTime{
IncNum: uint64(time.Now().UnixNano()),
SeqNum: 2,
},
},
},
}
signedMsg, err := protoext.NoopSign(aliveMsg)
require.NoError(t, err)

// Before the fix: panics with nil pointer dereference on member.Endpoint
// because id2Member[memberPKIid] is nil (member was concurrently purged).
// After the fix: nil member detected, update skipped, no panic.
require.NotPanics(t, func() {
d.learnExistingMembers([]*protoext.SignedGossipMessage{signedMsg})
})
SeqNum: 1,

Check failure on line 1996 in gossip/discovery/discovery_test.go

View workflow job for this annotation

GitHub Actions / Basic Checks

expected 1 expression
},

Check failure on line 1997 in gossip/discovery/discovery_test.go

View workflow job for this annotation

GitHub Actions / Basic Checks

expected operand, found '}'
},
},
}
Expand All @@ -1974,4 +2013,4 @@

require.True(t, inID2Member, "member should be present in id2Member after re-learning")
require.True(t, inAliveLastTS, "member should be present in aliveLastTS after re-learning")
}

Check failure on line 2016 in gossip/discovery/discovery_test.go

View workflow job for this annotation

GitHub Actions / Basic Checks

expected ';', found 'EOF'
Loading