5
5
"errors"
6
6
"fmt"
7
7
"net/netip"
8
+ "strings"
8
9
"sync"
9
10
"sync/atomic"
10
11
"time"
@@ -26,6 +27,21 @@ const MappingDuration = time.Minute
26
27
// CacheTime is the time a mapping will cache an external address for
27
28
const CacheTime = 15 * time .Second
28
29
30
+ // DiscoveryTimeout is the maximum time to wait for NAT discovery.
31
+ // This is based on the underlying UPnP and NAT-PMP/PCP protocols:
32
+ // - SSDP (UPnP discovery) waits 5 seconds for responses
33
+ // - NAT-PMP uses exponential backoff starting at 250ms, up to 9 retries
34
+ // (total ~32 seconds if exhausted, but typically responds in 1-2 seconds)
35
+ // - PCP follows similar timing to NAT-PMP
36
+ // - 10 seconds covers common cases while failing fast when no NAT exists
37
+ const DiscoveryTimeout = 10 * time .Second
38
+
39
+ // rediscoveryThreshold is the number of consecutive connection failures
40
+ // before triggering NAT rediscovery. We ignore first few failures to
41
+ // distinguish between transient network issues and persistent router
42
+ // problems like restarts or port changes that require finding the NAT device again.
43
+ const rediscoveryThreshold = 3
44
+
29
45
type entry struct {
30
46
protocol string
31
47
port int
@@ -40,18 +56,15 @@ func DiscoverNAT(ctx context.Context) (*NAT, error) {
40
56
if err != nil {
41
57
return nil , err
42
58
}
43
- var extAddr netip.Addr
44
- extIP , err := natInstance .GetExternalAddress ()
45
- if err == nil {
46
- extAddr , _ = netip .AddrFromSlice (extIP )
47
- }
59
+
60
+ extAddr := getExternalAddress (natInstance )
48
61
49
62
// Log the device addr.
50
63
addr , err := natInstance .GetDeviceAddress ()
51
64
if err != nil {
52
- log .Debug ("DiscoverGateway address error" , "err" , err )
65
+ log .Warn ("DiscoverGateway address error" , "err" , err )
53
66
} else {
54
- log .Debug ("DiscoverGateway address" , "address" , addr )
67
+ log .Info ("DiscoverGateway address" , "address" , addr )
55
68
}
56
69
57
70
ctx , cancel := context .WithCancel (context .Background ())
@@ -74,19 +87,35 @@ func DiscoverNAT(ctx context.Context) (*NAT, error) {
74
87
// NATs (Network Address Translators). It is a long-running
75
88
// service that will periodically renew port mappings,
76
89
// and keep an up-to-date list of all the external addresses.
90
+ //
91
+ // Locking strategy:
92
+ // - natmu: Protects nat instance and rediscovery state (nat, consecutiveFailures, rediscovering)
93
+ // - mappingmu: Protects port mappings table and closed flag (mappings, closed)
94
+ // - Lock ordering: When both locks are needed, always acquire mappingmu before natmu
95
+ // to prevent deadlocks
96
+ // - We use separate mutexes because the NAT instance may change (e.g., when router
97
+ // restarts and UPnP port changes), but the port mappings must persist and be
98
+ // re-applied across all instances. This separation allows the mappings table to
99
+ // remain stable while the underlying NAT device changes.
77
100
type NAT struct {
78
101
natmu sync.Mutex
79
102
nat nat.NAT
103
+
104
+ // Track connection failures for auto-rediscovery
105
+ consecutiveFailures int // protected by natmu
106
+ rediscovering bool // protected by natmu
107
+
80
108
// External IP of the NAT. Will be renewed periodically (every CacheTime).
81
109
extAddr atomic.Pointer [netip.Addr ]
82
110
83
111
refCount sync.WaitGroup
84
112
ctx context.Context
85
113
ctxCancel context.CancelFunc
86
114
87
- mappingmu sync.RWMutex // guards mappings
88
- closed bool
89
- mappings map [entry ]int
115
+ // Port mappings that should persist across NAT instance changes
116
+ mappingmu sync.RWMutex
117
+ closed bool // protected by mappingmu
118
+ mappings map [entry ]int // protected by mappingmu
90
119
}
91
120
92
121
// Close shuts down all port mappings. NAT can no longer be used.
@@ -134,13 +163,21 @@ func (nat *NAT) AddMapping(ctx context.Context, protocol string, port int) error
134
163
return errors .New ("closed" )
135
164
}
136
165
166
+ // Check if the mapping already exists to avoid duplicate work
167
+ e := entry {protocol : protocol , port : port }
168
+ if _ , exists := nat .mappings [e ]; exists {
169
+ return nil
170
+ }
171
+
172
+ log .Info ("Starting maintenance of port mapping" , "protocol" , protocol , "port" , port )
173
+
137
174
// do it once synchronously, so first mapping is done right away, and before exiting,
138
175
// allowing users -- in the optimistic case -- to use results right after.
139
176
extPort := nat .establishMapping (ctx , protocol , port )
140
177
// Don't validate the mapping here, we refresh the mappings based on this map.
141
178
// We can try getting a port again in case it succeeds. In the worst case,
142
179
// this is one extra LAN request every few minutes.
143
- nat .mappings [entry { protocol : protocol , port : port } ] = extPort
180
+ nat .mappings [e ] = extPort
144
181
return nil
145
182
}
146
183
@@ -149,11 +186,14 @@ func (nat *NAT) AddMapping(ctx context.Context, protocol string, port int) error
149
186
func (nat * NAT ) RemoveMapping (ctx context.Context , protocol string , port int ) error {
150
187
nat .mappingmu .Lock ()
151
188
defer nat .mappingmu .Unlock ()
189
+ nat .natmu .Lock ()
190
+ defer nat .natmu .Unlock ()
152
191
153
192
switch protocol {
154
193
case "tcp" , "udp" :
155
194
e := entry {protocol : protocol , port : port }
156
195
if _ , ok := nat .mappings [e ]; ok {
196
+ log .Info ("Stopping maintenance of port mapping" , "protocol" , protocol , "port" , port )
157
197
delete (nat .mappings , e )
158
198
return nat .nat .DeletePortMapping (ctx , protocol , port )
159
199
}
@@ -164,6 +204,12 @@ func (nat *NAT) RemoveMapping(ctx context.Context, protocol string, port int) er
164
204
}
165
205
166
206
func (nat * NAT ) background () {
207
+ // Renew port mappings every 20 seconds (1/3 of 60s lifetime).
208
+ // - NAT-PMP RFC 6886 recommends renewing at 50% of lifetime
209
+ // - We use 33% for added safety against silent lifetime reductions
210
+ // NOTE: This aggressive 60s/20s pattern may be outdated for modern routers
211
+ // but provides quick cleanup and fast failure detection for our rediscovery.
212
+ // TODO: Research longer durations (e.g. 30min/10min) to reduce router load
167
213
const mappingUpdate = MappingDuration / 3
168
214
169
215
now := time .Now ()
@@ -202,8 +248,11 @@ func (nat *NAT) background() {
202
248
nextMappingUpdate = time .Now ().Add (mappingUpdate )
203
249
}
204
250
if now .After (nextAddrUpdate ) {
205
- var extAddr netip. Addr
251
+ nat . natmu . Lock ()
206
252
extIP , err := nat .nat .GetExternalAddress ()
253
+ nat .natmu .Unlock ()
254
+
255
+ var extAddr netip.Addr
207
256
if err == nil {
208
257
extAddr , _ = netip .AddrFromSlice (extIP )
209
258
}
@@ -213,13 +262,16 @@ func (nat *NAT) background() {
213
262
t .Reset (time .Until (minTime (nextAddrUpdate , nextMappingUpdate )))
214
263
case <- nat .ctx .Done ():
215
264
nat .mappingmu .Lock ()
265
+ defer nat .mappingmu .Unlock ()
266
+ nat .natmu .Lock ()
267
+ defer nat .natmu .Unlock ()
268
+
216
269
ctx , cancel := context .WithTimeout (context .Background (), 10 * time .Second )
217
270
defer cancel ()
218
271
for e := range nat .mappings {
219
- delete (nat .mappings , e )
220
272
nat .nat .DeletePortMapping (ctx , e .protocol , e .port )
221
273
}
222
- nat .mappingmu . Unlock ( )
274
+ clear ( nat .mappings )
223
275
return
224
276
}
225
277
}
@@ -229,28 +281,95 @@ func (nat *NAT) establishMapping(ctx context.Context, protocol string, internalP
229
281
log .Debug ("Attempting port map" , "protocol" , protocol , "internal_port" , internalPort )
230
282
const comment = "libp2p"
231
283
284
+ // Try to establish the mapping with both NAT calls under the same lock
232
285
nat .natmu .Lock ()
286
+ defer nat .natmu .Unlock ()
287
+
233
288
var err error
234
289
externalPort , err = nat .nat .AddPortMapping (ctx , protocol , internalPort , comment , MappingDuration )
235
290
if err != nil {
236
291
// Some hardware does not support mappings with timeout, so try that
237
292
externalPort , err = nat .nat .AddPortMapping (ctx , protocol , internalPort , comment , 0 )
238
293
}
239
- nat .natmu .Unlock ()
240
294
241
- if err != nil || externalPort == 0 {
242
- if err != nil {
243
- log .Warn ("NAT port mapping failed" , "protocol" , protocol , "internal_port" , internalPort , "err" , err )
295
+ // Handle success
296
+ if err == nil && externalPort != 0 {
297
+ nat .consecutiveFailures = 0
298
+ log .Debug ("NAT port mapping established" , "protocol" , protocol , "internal_port" , internalPort , "external_port" , externalPort )
299
+ return externalPort
300
+ }
301
+
302
+ // Handle failures
303
+ if err != nil {
304
+ log .Warn ("NAT port mapping failed" , "protocol" , protocol , "internal_port" , internalPort , "err" , err )
305
+
306
+ // Check if this is a connection error that might indicate router restart
307
+ // See: https://github.com/libp2p/go-libp2p/issues/3224#issuecomment-2866844723
308
+ // Note: We use string matching because goupnp doesn't preserve error chains (uses %v instead of %w)
309
+ if strings .Contains (err .Error (), "connection refused" ) {
310
+ nat .consecutiveFailures ++
311
+ if nat .consecutiveFailures >= rediscoveryThreshold && ! nat .rediscovering {
312
+ nat .rediscovering = true
313
+ // Spawn in goroutine to avoid blocking the caller while we
314
+ // perform network discovery, which can take up to 30 seconds.
315
+ // The rediscovering flag prevents multiple concurrent attempts.
316
+ go nat .rediscoverNAT ()
317
+ }
244
318
} else {
245
- log .Warn ("NAT port mapping failed" , "protocol" , protocol , "internal_port" , internalPort , "external_port" , 0 )
319
+ // Reset counter for non-connection errors (transient failures)
320
+ nat .consecutiveFailures = 0
246
321
}
247
- // we do not close if the mapping failed,
248
- // because it may work again next time.
249
322
return 0
250
323
}
251
324
252
- log .Debug ("NAT Mapping" , "external_port" , externalPort , "internal_port" , internalPort , "protocol" , protocol )
253
- return externalPort
325
+ // externalPort is 0 but no error was returned
326
+ log .Warn ("NAT port mapping failed" , "protocol" , protocol , "internal_port" , internalPort , "external_port" , 0 )
327
+ return 0
328
+ }
329
+
330
+ // rediscoverNAT attempts to rediscover the NAT device after connection failures
331
+ func (nat * NAT ) rediscoverNAT () {
332
+ log .Info ("NAT rediscovery triggered due to repeated connection failures" )
333
+
334
+ ctx , cancel := context .WithTimeout (nat .ctx , DiscoveryTimeout )
335
+ defer cancel ()
336
+
337
+ newNATInstance , err := discoverGateway (ctx )
338
+ if err != nil {
339
+ log .Warn ("NAT rediscovery failed" , "err" , err )
340
+ nat .natmu .Lock ()
341
+ defer nat .natmu .Unlock ()
342
+ nat .rediscovering = false
343
+ return
344
+ }
345
+
346
+ extAddr := getExternalAddress (newNATInstance )
347
+
348
+ // Replace the NAT instance
349
+ // No cleanup of the old instance needed because:
350
+ // - Router restart has already wiped all mappings
351
+ // - Old UPnP endpoint is dead (connection refused)
352
+ // - If router didn't actually restart (false positive), any stale mappings
353
+ // on the router expire naturally (60 second UPnP timeout)
354
+ nat .natmu .Lock ()
355
+ nat .nat = newNATInstance
356
+ nat .extAddr .Store (& extAddr )
357
+ nat .consecutiveFailures = 0
358
+ nat .rediscovering = false
359
+ nat .natmu .Unlock ()
360
+
361
+ // Re-establish all existing mappings on the new NAT instance
362
+ nat .mappingmu .Lock ()
363
+ for e := range nat .mappings {
364
+ extPort := nat .establishMapping (nat .ctx , e .protocol , e .port )
365
+ nat .mappings [e ] = extPort
366
+ if extPort != 0 {
367
+ log .Info ("NAT mapping restored after rediscovery" , "protocol" , e .protocol , "internal_port" , e .port , "external_port" , extPort )
368
+ }
369
+ }
370
+ nat .mappingmu .Unlock ()
371
+
372
+ log .Info ("NAT rediscovery successful" )
254
373
}
255
374
256
375
func minTime (a , b time.Time ) time.Time {
@@ -259,3 +378,18 @@ func minTime(a, b time.Time) time.Time {
259
378
}
260
379
return b
261
380
}
381
+
382
+ // getExternalAddress retrieves and parses the external address from a NAT instance
383
+ func getExternalAddress (natInstance nat.NAT ) netip.Addr {
384
+ extIP , err := natInstance .GetExternalAddress ()
385
+ if err != nil {
386
+ log .Debug ("Failed to get external address" , "err" , err )
387
+ return netip.Addr {}
388
+ }
389
+ extAddr , ok := netip .AddrFromSlice (extIP )
390
+ if ! ok {
391
+ log .Debug ("Failed to parse external address" , "ip" , extIP )
392
+ return netip.Addr {}
393
+ }
394
+ return extAddr
395
+ }
0 commit comments