5
5
"errors"
6
6
"fmt"
7
7
"net/netip"
8
+ "strings"
8
9
"sync"
9
10
"sync/atomic"
10
11
"time"
@@ -26,6 +27,21 @@ const MappingDuration = time.Minute
26
27
// Timing and retry parameters for external-address caching and NAT rediscovery.
const (
	// CacheTime is the time a mapping will cache an external address for.
	CacheTime = 15 * time.Second

	// DiscoveryTimeout is the maximum time to wait for NAT discovery.
	// The value is derived from the underlying UPnP and NAT-PMP/PCP protocols:
	//   - SSDP (UPnP discovery) waits 5 seconds for responses
	//   - NAT-PMP uses exponential backoff starting at 250ms, up to 9 retries
	//     (total ~32 seconds if exhausted, but typically responds in 1-2 seconds)
	//   - PCP follows similar timing to NAT-PMP
	//   - 10 seconds covers common cases while failing fast when no NAT exists
	DiscoveryTimeout = 10 * time.Second

	// rediscoveryThreshold is the number of consecutive connection failures
	// before triggering NAT rediscovery. We ignore the first few failures to
	// distinguish between transient network issues and persistent router
	// problems, such as restarts or port changes, that require finding the
	// NAT device again.
	rediscoveryThreshold = 3
)
44
+
29
45
type entry struct {
30
46
protocol string
31
47
port int
@@ -40,18 +56,15 @@ func DiscoverNAT(ctx context.Context) (*NAT, error) {
40
56
if err != nil {
41
57
return nil , err
42
58
}
43
- var extAddr netip.Addr
44
- extIP , err := natInstance .GetExternalAddress ()
45
- if err == nil {
46
- extAddr , _ = netip .AddrFromSlice (extIP )
47
- }
59
+
60
+ extAddr := getExternalAddress (natInstance )
48
61
49
62
// Log the device addr.
50
63
addr , err := natInstance .GetDeviceAddress ()
51
64
if err != nil {
52
- log .Debug ("DiscoverGateway address error" , "err" , err )
65
+ log .Warn ("DiscoverGateway address error" , "err" , err )
53
66
} else {
54
- log .Debug ("DiscoverGateway address" , "address" , addr )
67
+ log .Info ("DiscoverGateway address" , "address" , addr )
55
68
}
56
69
57
70
ctx , cancel := context .WithCancel (context .Background ())
@@ -74,19 +87,35 @@ func DiscoverNAT(ctx context.Context) (*NAT, error) {
74
87
// NATs (Network Address Translators). It is a long-running
75
88
// service that will periodically renew port mappings,
76
89
// and keep an up-to-date list of all the external addresses.
90
+ //
91
+ // Locking strategy:
92
+ // - natmu: Protects nat instance and rediscovery state (nat, consecutiveFailures, rediscovering)
93
+ // - mappingmu: Protects port mappings table and closed flag (mappings, closed)
94
+ // - Lock ordering: When both locks are needed, always acquire mappingmu before natmu
95
+ // to prevent deadlocks
96
+ // - We use separate mutexes because the NAT instance may change (e.g., when router
97
+ // restarts and UPnP port changes), but the port mappings must persist and be
98
+ // re-applied across all instances. This separation allows the mappings table to
99
+ // remain stable while the underlying NAT device changes.
77
100
type NAT struct {
78
101
natmu sync.Mutex
79
102
nat nat.NAT
103
+
104
+ // Track connection failures for auto-rediscovery
105
+ consecutiveFailures int // protected by natmu
106
+ rediscovering bool // protected by natmu
107
+
80
108
// External IP of the NAT. Will be renewed periodically (every CacheTime).
81
109
extAddr atomic.Pointer [netip.Addr ]
82
110
83
111
refCount sync.WaitGroup
84
112
ctx context.Context
85
113
ctxCancel context.CancelFunc
86
114
87
- mappingmu sync.RWMutex // guards mappings
88
- closed bool
89
- mappings map [entry ]int
115
+ // Port mappings that should persist across NAT instance changes
116
+ mappingmu sync.RWMutex
117
+ closed bool // protected by mappingmu
118
+ mappings map [entry ]int // protected by mappingmu
90
119
}
91
120
92
121
// Close shuts down all port mappings. NAT can no longer be used.
@@ -149,11 +178,14 @@ func (nat *NAT) AddMapping(ctx context.Context, protocol string, port int) error
149
178
func (nat * NAT ) RemoveMapping (ctx context.Context , protocol string , port int ) error {
150
179
nat .mappingmu .Lock ()
151
180
defer nat .mappingmu .Unlock ()
181
+ nat .natmu .Lock ()
182
+ defer nat .natmu .Unlock ()
152
183
153
184
switch protocol {
154
185
case "tcp" , "udp" :
155
186
e := entry {protocol : protocol , port : port }
156
187
if _ , ok := nat .mappings [e ]; ok {
188
+ log .Info ("Stopping maintenance of port mapping" , "protocol" , protocol , "port" , port )
157
189
delete (nat .mappings , e )
158
190
return nat .nat .DeletePortMapping (ctx , protocol , port )
159
191
}
@@ -164,6 +196,12 @@ func (nat *NAT) RemoveMapping(ctx context.Context, protocol string, port int) er
164
196
}
165
197
166
198
func (nat * NAT ) background () {
199
+ // Renew port mappings every 20 seconds (1/3 of 60s lifetime).
200
+ // - NAT-PMP RFC 6886 recommends renewing at 50% of lifetime
201
+ // - We use 33% for added safety against silent lifetime reductions
202
+ // NOTE: This aggressive 60s/20s pattern may be outdated for modern routers
203
+ // but provides quick cleanup and fast failure detection for our rediscovery.
204
+ // TODO: Research longer durations (e.g. 30min/10min) to reduce router load
167
205
const mappingUpdate = MappingDuration / 3
168
206
169
207
now := time .Now ()
@@ -202,8 +240,11 @@ func (nat *NAT) background() {
202
240
nextMappingUpdate = time .Now ().Add (mappingUpdate )
203
241
}
204
242
if now .After (nextAddrUpdate ) {
205
- var extAddr netip. Addr
243
+ nat . natmu . Lock ()
206
244
extIP , err := nat .nat .GetExternalAddress ()
245
+ nat .natmu .Unlock ()
246
+
247
+ var extAddr netip.Addr
207
248
if err == nil {
208
249
extAddr , _ = netip .AddrFromSlice (extIP )
209
250
}
@@ -213,13 +254,16 @@ func (nat *NAT) background() {
213
254
t .Reset (time .Until (minTime (nextAddrUpdate , nextMappingUpdate )))
214
255
case <- nat .ctx .Done ():
215
256
nat .mappingmu .Lock ()
257
+ defer nat .mappingmu .Unlock ()
258
+ nat .natmu .Lock ()
259
+ defer nat .natmu .Unlock ()
260
+
216
261
ctx , cancel := context .WithTimeout (context .Background (), 10 * time .Second )
217
262
defer cancel ()
218
263
for e := range nat .mappings {
219
- delete (nat .mappings , e )
220
264
nat .nat .DeletePortMapping (ctx , e .protocol , e .port )
221
265
}
222
- nat .mappingmu . Unlock ( )
266
+ clear ( nat .mappings )
223
267
return
224
268
}
225
269
}
@@ -229,28 +273,95 @@ func (nat *NAT) establishMapping(ctx context.Context, protocol string, internalP
229
273
log .Debug ("Attempting port map" , "protocol" , protocol , "internal_port" , internalPort )
230
274
const comment = "libp2p"
231
275
276
+ // Try to establish the mapping with both NAT calls under the same lock
232
277
nat .natmu .Lock ()
278
+ defer nat .natmu .Unlock ()
279
+
233
280
var err error
234
281
externalPort , err = nat .nat .AddPortMapping (ctx , protocol , internalPort , comment , MappingDuration )
235
282
if err != nil {
236
283
// Some hardware does not support mappings with timeout, so try that
237
284
externalPort , err = nat .nat .AddPortMapping (ctx , protocol , internalPort , comment , 0 )
238
285
}
239
- nat .natmu .Unlock ()
240
286
241
- if err != nil || externalPort == 0 {
242
- if err != nil {
243
- log .Warn ("NAT port mapping failed" , "protocol" , protocol , "internal_port" , internalPort , "err" , err )
287
+ // Handle success
288
+ if err == nil && externalPort != 0 {
289
+ nat .consecutiveFailures = 0
290
+ log .Debug ("NAT port mapping established" , "protocol" , protocol , "internal_port" , internalPort , "external_port" , externalPort )
291
+ return externalPort
292
+ }
293
+
294
+ // Handle failures
295
+ if err != nil {
296
+ log .Warn ("NAT port mapping failed" , "protocol" , protocol , "internal_port" , internalPort , "err" , err )
297
+
298
+ // Check if this is a connection error that might indicate router restart
299
+ // See: https://github.com/libp2p/go-libp2p/issues/3224#issuecomment-2866844723
300
+ // Note: We use string matching because goupnp doesn't preserve error chains (uses %v instead of %w)
301
+ if strings .Contains (err .Error (), "connection refused" ) {
302
+ nat .consecutiveFailures ++
303
+ if nat .consecutiveFailures >= rediscoveryThreshold && ! nat .rediscovering {
304
+ nat .rediscovering = true
305
+ // Spawn in goroutine to avoid blocking the caller while we
306
+ // perform network discovery, which can take up to DiscoveryTimeout (10 seconds).
307
+ // The rediscovering flag prevents multiple concurrent attempts.
308
+ go nat .rediscoverNAT ()
309
+ }
244
310
} else {
245
- log .Warn ("NAT port mapping failed" , "protocol" , protocol , "internal_port" , internalPort , "external_port" , 0 )
311
+ // Reset counter for non-connection errors (transient failures)
312
+ nat .consecutiveFailures = 0
246
313
}
247
- // we do not close if the mapping failed,
248
- // because it may work again next time.
249
314
return 0
250
315
}
251
316
252
- log .Debug ("NAT Mapping" , "external_port" , externalPort , "internal_port" , internalPort , "protocol" , protocol )
253
- return externalPort
317
+ // externalPort is 0 but no error was returned
318
+ log .Warn ("NAT port mapping failed" , "protocol" , protocol , "internal_port" , internalPort , "external_port" , 0 )
319
+ return 0
320
+ }
321
+
322
+ // rediscoverNAT attempts to rediscover the NAT device after connection failures
323
+ func (nat * NAT ) rediscoverNAT () {
324
+ log .Info ("NAT rediscovery triggered due to repeated connection failures" )
325
+
326
+ ctx , cancel := context .WithTimeout (nat .ctx , DiscoveryTimeout )
327
+ defer cancel ()
328
+
329
+ newNATInstance , err := discoverGateway (ctx )
330
+ if err != nil {
331
+ log .Warn ("NAT rediscovery failed" , "err" , err )
332
+ nat .natmu .Lock ()
333
+ defer nat .natmu .Unlock ()
334
+ nat .rediscovering = false
335
+ return
336
+ }
337
+
338
+ extAddr := getExternalAddress (newNATInstance )
339
+
340
+ // Replace the NAT instance
341
+ // No cleanup of the old instance needed because:
342
+ // - Router restart has already wiped all mappings
343
+ // - Old UPnP endpoint is dead (connection refused)
344
+ // - If router didn't actually restart (false positive), any stale mappings
345
+ // on the router expire naturally (60 second UPnP timeout)
346
+ nat .natmu .Lock ()
347
+ nat .nat = newNATInstance
348
+ nat .extAddr .Store (& extAddr )
349
+ nat .consecutiveFailures = 0
350
+ nat .rediscovering = false
351
+ nat .natmu .Unlock ()
352
+
353
+ // Re-establish all existing mappings on the new NAT instance
354
+ nat .mappingmu .Lock ()
355
+ for e := range nat .mappings {
356
+ extPort := nat .establishMapping (nat .ctx , e .protocol , e .port )
357
+ nat .mappings [e ] = extPort
358
+ if extPort != 0 {
359
+ log .Info ("NAT mapping restored after rediscovery" , "protocol" , e .protocol , "internal_port" , e .port , "external_port" , extPort )
360
+ }
361
+ }
362
+ nat .mappingmu .Unlock ()
363
+
364
+ log .Info ("NAT rediscovery successful" )
254
365
}
255
366
256
367
func minTime (a , b time.Time ) time.Time {
@@ -259,3 +370,18 @@ func minTime(a, b time.Time) time.Time {
259
370
}
260
371
return b
261
372
}
373
+
374
+ // getExternalAddress retrieves and parses the external address from a NAT instance
375
+ func getExternalAddress (natInstance nat.NAT ) netip.Addr {
376
+ extIP , err := natInstance .GetExternalAddress ()
377
+ if err != nil {
378
+ log .Debug ("Failed to get external address" , "err" , err )
379
+ return netip.Addr {}
380
+ }
381
+ extAddr , ok := netip .AddrFromSlice (extIP )
382
+ if ! ok {
383
+ log .Debug ("Failed to parse external address" , "ip" , extIP )
384
+ return netip.Addr {}
385
+ }
386
+ return extAddr
387
+ }
0 commit comments