Skip to content

Commit d39a124

Browse files
authored
Merge branch 'master' into add-ipv6-transport-tests
2 parents 1e25baf + 3ecae66 commit d39a124

File tree

7 files changed

+595
-51
lines changed

7 files changed

+595
-51
lines changed

p2p/discovery/mdns/mdns_test.go

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
package mdns
22

33
import (
4+
"os"
5+
"runtime"
46
"sync"
57
"testing"
68
"time"
@@ -47,6 +49,10 @@ func (n *notif) GetPeers() []peer.AddrInfo {
4749
}
4850

4951
func TestOtherDiscovery(t *testing.T) {
52+
if runtime.GOOS != "linux" && os.Getenv("CI") != "" {
53+
t.Skip("this test is flaky on CI outside of linux")
54+
}
55+
5056
const n = 4
5157

5258
notifs := make([]*notif, n)
@@ -91,8 +97,8 @@ func TestOtherDiscovery(t *testing.T) {
9197
}
9298
return true
9399
},
94-
25*time.Second,
95-
5*time.Millisecond,
100+
5*time.Second,
101+
100*time.Millisecond,
96102
"expected peers to find each other",
97103
)
98104
}

p2p/host/basic/natmgr.go

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@ import (
77
"net/netip"
88
"strconv"
99
"sync"
10-
"time"
1110

1211
"github.com/libp2p/go-libp2p/core/network"
1312
inat "github.com/libp2p/go-libp2p/p2p/net/nat"
@@ -105,7 +104,7 @@ func (nmgr *natManager) background(ctx context.Context) {
105104
}
106105
}()
107106

108-
discoverCtx, cancel := context.WithTimeout(ctx, 10*time.Second)
107+
discoverCtx, cancel := context.WithTimeout(ctx, inat.DiscoveryTimeout)
109108
defer cancel()
110109
natInstance, err := discoverNAT(discoverCtx)
111110
if err != nil {

p2p/net/nat/nat.go

Lines changed: 157 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import (
55
"errors"
66
"fmt"
77
"net/netip"
8+
"strings"
89
"sync"
910
"sync/atomic"
1011
"time"
@@ -26,6 +27,21 @@ const MappingDuration = time.Minute
2627
// CacheTime is the time a mapping will cache an external address for
2728
const CacheTime = 15 * time.Second
2829

30+
// DiscoveryTimeout is the maximum time to wait for NAT discovery.
31+
// This is based on the underlying UPnP and NAT-PMP/PCP protocols:
32+
// - SSDP (UPnP discovery) waits 5 seconds for responses
33+
// - NAT-PMP uses exponential backoff starting at 250ms, up to 9 retries
34+
// (total ~32 seconds if exhausted, but typically responds in 1-2 seconds)
35+
// - PCP follows similar timing to NAT-PMP
36+
// - 10 seconds covers common cases while failing fast when no NAT exists
37+
const DiscoveryTimeout = 10 * time.Second
38+
39+
// rediscoveryThreshold is the number of consecutive connection failures
40+
// before triggering NAT rediscovery. We ignore first few failures to
41+
// distinguish between transient network issues and persistent router
42+
// problems like restarts or port changes that require finding the NAT device again.
43+
const rediscoveryThreshold = 3
44+
2945
type entry struct {
3046
protocol string
3147
port int
@@ -40,18 +56,15 @@ func DiscoverNAT(ctx context.Context) (*NAT, error) {
4056
if err != nil {
4157
return nil, err
4258
}
43-
var extAddr netip.Addr
44-
extIP, err := natInstance.GetExternalAddress()
45-
if err == nil {
46-
extAddr, _ = netip.AddrFromSlice(extIP)
47-
}
59+
60+
extAddr := getExternalAddress(natInstance)
4861

4962
// Log the device addr.
5063
addr, err := natInstance.GetDeviceAddress()
5164
if err != nil {
52-
log.Debug("DiscoverGateway address error", "err", err)
65+
log.Warn("DiscoverGateway address error", "err", err)
5366
} else {
54-
log.Debug("DiscoverGateway address", "address", addr)
67+
log.Info("DiscoverGateway address", "address", addr)
5568
}
5669

5770
ctx, cancel := context.WithCancel(context.Background())
@@ -74,19 +87,35 @@ func DiscoverNAT(ctx context.Context) (*NAT, error) {
7487
// NATs (Network Address Translators). It is a long-running
7588
// service that will periodically renew port mappings,
7689
// and keep an up-to-date list of all the external addresses.
90+
//
91+
// Locking strategy:
92+
// - natmu: Protects nat instance and rediscovery state (nat, consecutiveFailures, rediscovering)
93+
// - mappingmu: Protects port mappings table and closed flag (mappings, closed)
94+
// - Lock ordering: When both locks are needed, always acquire mappingmu before natmu
95+
// to prevent deadlocks
96+
// - We use separate mutexes because the NAT instance may change (e.g., when router
97+
// restarts and UPnP port changes), but the port mappings must persist and be
98+
// re-applied across all instances. This separation allows the mappings table to
99+
// remain stable while the underlying NAT device changes.
77100
type NAT struct {
78101
natmu sync.Mutex
79102
nat nat.NAT
103+
104+
// Track connection failures for auto-rediscovery
105+
consecutiveFailures int // protected by natmu
106+
rediscovering bool // protected by natmu
107+
80108
// External IP of the NAT. Will be renewed periodically (every CacheTime).
81109
extAddr atomic.Pointer[netip.Addr]
82110

83111
refCount sync.WaitGroup
84112
ctx context.Context
85113
ctxCancel context.CancelFunc
86114

87-
mappingmu sync.RWMutex // guards mappings
88-
closed bool
89-
mappings map[entry]int
115+
// Port mappings that should persist across NAT instance changes
116+
mappingmu sync.RWMutex
117+
closed bool // protected by mappingmu
118+
mappings map[entry]int // protected by mappingmu
90119
}
91120

92121
// Close shuts down all port mappings. NAT can no longer be used.
@@ -134,13 +163,21 @@ func (nat *NAT) AddMapping(ctx context.Context, protocol string, port int) error
134163
return errors.New("closed")
135164
}
136165

166+
// Check if the mapping already exists to avoid duplicate work
167+
e := entry{protocol: protocol, port: port}
168+
if _, exists := nat.mappings[e]; exists {
169+
return nil
170+
}
171+
172+
log.Info("Starting maintenance of port mapping", "protocol", protocol, "port", port)
173+
137174
// do it once synchronously, so first mapping is done right away, and before exiting,
138175
// allowing users -- in the optimistic case -- to use results right after.
139176
extPort := nat.establishMapping(ctx, protocol, port)
140177
// Don't validate the mapping here, we refresh the mappings based on this map.
141178
// We can try getting a port again in case it succeeds. In the worst case,
142179
// this is one extra LAN request every few minutes.
143-
nat.mappings[entry{protocol: protocol, port: port}] = extPort
180+
nat.mappings[e] = extPort
144181
return nil
145182
}
146183

@@ -149,11 +186,14 @@ func (nat *NAT) AddMapping(ctx context.Context, protocol string, port int) error
149186
func (nat *NAT) RemoveMapping(ctx context.Context, protocol string, port int) error {
150187
nat.mappingmu.Lock()
151188
defer nat.mappingmu.Unlock()
189+
nat.natmu.Lock()
190+
defer nat.natmu.Unlock()
152191

153192
switch protocol {
154193
case "tcp", "udp":
155194
e := entry{protocol: protocol, port: port}
156195
if _, ok := nat.mappings[e]; ok {
196+
log.Info("Stopping maintenance of port mapping", "protocol", protocol, "port", port)
157197
delete(nat.mappings, e)
158198
return nat.nat.DeletePortMapping(ctx, protocol, port)
159199
}
@@ -164,6 +204,12 @@ func (nat *NAT) RemoveMapping(ctx context.Context, protocol string, port int) er
164204
}
165205

166206
func (nat *NAT) background() {
207+
// Renew port mappings every 20 seconds (1/3 of 60s lifetime).
208+
// - NAT-PMP RFC 6886 recommends renewing at 50% of lifetime
209+
// - We use 33% for added safety against silent lifetime reductions
210+
// NOTE: This aggressive 60s/20s pattern may be outdated for modern routers
211+
// but provides quick cleanup and fast failure detection for our rediscovery.
212+
// TODO: Research longer durations (e.g. 30min/10min) to reduce router load
167213
const mappingUpdate = MappingDuration / 3
168214

169215
now := time.Now()
@@ -202,8 +248,11 @@ func (nat *NAT) background() {
202248
nextMappingUpdate = time.Now().Add(mappingUpdate)
203249
}
204250
if now.After(nextAddrUpdate) {
205-
var extAddr netip.Addr
251+
nat.natmu.Lock()
206252
extIP, err := nat.nat.GetExternalAddress()
253+
nat.natmu.Unlock()
254+
255+
var extAddr netip.Addr
207256
if err == nil {
208257
extAddr, _ = netip.AddrFromSlice(extIP)
209258
}
@@ -213,13 +262,16 @@ func (nat *NAT) background() {
213262
t.Reset(time.Until(minTime(nextAddrUpdate, nextMappingUpdate)))
214263
case <-nat.ctx.Done():
215264
nat.mappingmu.Lock()
265+
defer nat.mappingmu.Unlock()
266+
nat.natmu.Lock()
267+
defer nat.natmu.Unlock()
268+
216269
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
217270
defer cancel()
218271
for e := range nat.mappings {
219-
delete(nat.mappings, e)
220272
nat.nat.DeletePortMapping(ctx, e.protocol, e.port)
221273
}
222-
nat.mappingmu.Unlock()
274+
clear(nat.mappings)
223275
return
224276
}
225277
}
@@ -229,28 +281,95 @@ func (nat *NAT) establishMapping(ctx context.Context, protocol string, internalP
229281
log.Debug("Attempting port map", "protocol", protocol, "internal_port", internalPort)
230282
const comment = "libp2p"
231283

284+
// Try to establish the mapping with both NAT calls under the same lock
232285
nat.natmu.Lock()
286+
defer nat.natmu.Unlock()
287+
233288
var err error
234289
externalPort, err = nat.nat.AddPortMapping(ctx, protocol, internalPort, comment, MappingDuration)
235290
if err != nil {
236291
// Some hardware does not support mappings with timeout, so try that
237292
externalPort, err = nat.nat.AddPortMapping(ctx, protocol, internalPort, comment, 0)
238293
}
239-
nat.natmu.Unlock()
240294

241-
if err != nil || externalPort == 0 {
242-
if err != nil {
243-
log.Warn("NAT port mapping failed", "protocol", protocol, "internal_port", internalPort, "err", err)
295+
// Handle success
296+
if err == nil && externalPort != 0 {
297+
nat.consecutiveFailures = 0
298+
log.Debug("NAT port mapping established", "protocol", protocol, "internal_port", internalPort, "external_port", externalPort)
299+
return externalPort
300+
}
301+
302+
// Handle failures
303+
if err != nil {
304+
log.Warn("NAT port mapping failed", "protocol", protocol, "internal_port", internalPort, "err", err)
305+
306+
// Check if this is a connection error that might indicate router restart
307+
// See: https://github.com/libp2p/go-libp2p/issues/3224#issuecomment-2866844723
308+
// Note: We use string matching because goupnp doesn't preserve error chains (uses %v instead of %w)
309+
if strings.Contains(err.Error(), "connection refused") {
310+
nat.consecutiveFailures++
311+
if nat.consecutiveFailures >= rediscoveryThreshold && !nat.rediscovering {
312+
nat.rediscovering = true
313+
// Spawn in goroutine to avoid blocking the caller while we
314+
// perform network discovery, which can take up to 30 seconds.
315+
// The rediscovering flag prevents multiple concurrent attempts.
316+
go nat.rediscoverNAT()
317+
}
244318
} else {
245-
log.Warn("NAT port mapping failed", "protocol", protocol, "internal_port", internalPort, "external_port", 0)
319+
// Reset counter for non-connection errors (transient failures)
320+
nat.consecutiveFailures = 0
246321
}
247-
// we do not close if the mapping failed,
248-
// because it may work again next time.
249322
return 0
250323
}
251324

252-
log.Debug("NAT Mapping", "external_port", externalPort, "internal_port", internalPort, "protocol", protocol)
253-
return externalPort
325+
// externalPort is 0 but no error was returned
326+
log.Warn("NAT port mapping failed", "protocol", protocol, "internal_port", internalPort, "external_port", 0)
327+
return 0
328+
}
329+
330+
// rediscoverNAT attempts to rediscover the NAT device after connection failures
331+
func (nat *NAT) rediscoverNAT() {
332+
log.Info("NAT rediscovery triggered due to repeated connection failures")
333+
334+
ctx, cancel := context.WithTimeout(nat.ctx, DiscoveryTimeout)
335+
defer cancel()
336+
337+
newNATInstance, err := discoverGateway(ctx)
338+
if err != nil {
339+
log.Warn("NAT rediscovery failed", "err", err)
340+
nat.natmu.Lock()
341+
defer nat.natmu.Unlock()
342+
nat.rediscovering = false
343+
return
344+
}
345+
346+
extAddr := getExternalAddress(newNATInstance)
347+
348+
// Replace the NAT instance
349+
// No cleanup of the old instance needed because:
350+
// - Router restart has already wiped all mappings
351+
// - Old UPnP endpoint is dead (connection refused)
352+
// - If router didn't actually restart (false positive), any stale mappings
353+
// on the router expire naturally (60 second UPnP timeout)
354+
nat.natmu.Lock()
355+
nat.nat = newNATInstance
356+
nat.extAddr.Store(&extAddr)
357+
nat.consecutiveFailures = 0
358+
nat.rediscovering = false
359+
nat.natmu.Unlock()
360+
361+
// Re-establish all existing mappings on the new NAT instance
362+
nat.mappingmu.Lock()
363+
for e := range nat.mappings {
364+
extPort := nat.establishMapping(nat.ctx, e.protocol, e.port)
365+
nat.mappings[e] = extPort
366+
if extPort != 0 {
367+
log.Info("NAT mapping restored after rediscovery", "protocol", e.protocol, "internal_port", e.port, "external_port", extPort)
368+
}
369+
}
370+
nat.mappingmu.Unlock()
371+
372+
log.Info("NAT rediscovery successful")
254373
}
255374

256375
func minTime(a, b time.Time) time.Time {
@@ -259,3 +378,18 @@ func minTime(a, b time.Time) time.Time {
259378
}
260379
return b
261380
}
381+
382+
// getExternalAddress retrieves and parses the external address from a NAT instance
383+
func getExternalAddress(natInstance nat.NAT) netip.Addr {
384+
extIP, err := natInstance.GetExternalAddress()
385+
if err != nil {
386+
log.Debug("Failed to get external address", "err", err)
387+
return netip.Addr{}
388+
}
389+
extAddr, ok := netip.AddrFromSlice(extIP)
390+
if !ok {
391+
log.Debug("Failed to parse external address", "ip", extIP)
392+
return netip.Addr{}
393+
}
394+
return extAddr
395+
}

0 commit comments

Comments
 (0)