Skip to content

Commit 8b79877

Browse files
lidelMarcoPolo
authored andcommitted
fix: heal NAT mappings after router restart
Router restarts can cause UPnP/NAT-PMP services to change their listening ports, leading to connection refused errors. This fix implements automatic NAT rediscovery when consecutive connection failures are detected, restoring all existing port mappings on the new NAT instance. See #3224 (comment) for details on the router behavior that motivated this fix.
1 parent 95cad50 commit 8b79877

File tree

3 files changed

+475
-27
lines changed

3 files changed

+475
-27
lines changed

p2p/host/basic/natmgr.go

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@ import (
77
"net/netip"
88
"strconv"
99
"sync"
10-
"time"
1110

1211
"github.com/libp2p/go-libp2p/core/network"
1312
inat "github.com/libp2p/go-libp2p/p2p/net/nat"
@@ -105,7 +104,7 @@ func (nmgr *natManager) background(ctx context.Context) {
105104
}
106105
}()
107106

108-
discoverCtx, cancel := context.WithTimeout(ctx, 10*time.Second)
107+
discoverCtx, cancel := context.WithTimeout(ctx, inat.DiscoveryTimeout)
109108
defer cancel()
110109
natInstance, err := discoverNAT(discoverCtx)
111110
if err != nil {

p2p/net/nat/nat.go

Lines changed: 148 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import (
55
"errors"
66
"fmt"
77
"net/netip"
8+
"strings"
89
"sync"
910
"sync/atomic"
1011
"time"
@@ -26,6 +27,21 @@ const MappingDuration = time.Minute
2627
// CacheTime is the time a mapping will cache an external address for
2728
const CacheTime = 15 * time.Second
2829

30+
// DiscoveryTimeout is the maximum time to wait for NAT discovery.
31+
// This is based on the underlying UPnP and NAT-PMP/PCP protocols:
32+
// - SSDP (UPnP discovery) waits 5 seconds for responses
33+
// - NAT-PMP uses exponential backoff starting at 250ms, up to 9 retries
34+
// (total ~32 seconds if exhausted, but typically responds in 1-2 seconds)
35+
// - PCP follows similar timing to NAT-PMP
36+
// - 10 seconds covers common cases while failing fast when no NAT exists
37+
const DiscoveryTimeout = 10 * time.Second
38+
39+
// rediscoveryThreshold is the number of consecutive connection failures
40+
// before triggering NAT rediscovery. We ignore first few failures to
41+
// distinguish between transient network issues and persistent router
42+
// problems like restarts or port changes that require finding the NAT device again.
43+
const rediscoveryThreshold = 3
44+
2945
type entry struct {
3046
protocol string
3147
port int
@@ -40,18 +56,15 @@ func DiscoverNAT(ctx context.Context) (*NAT, error) {
4056
if err != nil {
4157
return nil, err
4258
}
43-
var extAddr netip.Addr
44-
extIP, err := natInstance.GetExternalAddress()
45-
if err == nil {
46-
extAddr, _ = netip.AddrFromSlice(extIP)
47-
}
59+
60+
extAddr := getExternalAddress(natInstance)
4861

4962
// Log the device addr.
5063
addr, err := natInstance.GetDeviceAddress()
5164
if err != nil {
52-
log.Debug("DiscoverGateway address error", "err", err)
65+
log.Warn("DiscoverGateway address error", "err", err)
5366
} else {
54-
log.Debug("DiscoverGateway address", "address", addr)
67+
log.Info("DiscoverGateway address", "address", addr)
5568
}
5669

5770
ctx, cancel := context.WithCancel(context.Background())
@@ -74,19 +87,35 @@ func DiscoverNAT(ctx context.Context) (*NAT, error) {
7487
// NATs (Network Address Translators). It is a long-running
7588
// service that will periodically renew port mappings,
7689
// and keep an up-to-date list of all the external addresses.
90+
//
91+
// Locking strategy:
92+
// - natmu: Protects nat instance and rediscovery state (nat, consecutiveFailures, rediscovering)
93+
// - mappingmu: Protects port mappings table and closed flag (mappings, closed)
94+
// - Lock ordering: When both locks are needed, always acquire mappingmu before natmu
95+
// to prevent deadlocks
96+
// - We use separate mutexes because the NAT instance may change (e.g., when router
97+
// restarts and UPnP port changes), but the port mappings must persist and be
98+
// re-applied across all instances. This separation allows the mappings table to
99+
// remain stable while the underlying NAT device changes.
77100
type NAT struct {
78101
natmu sync.Mutex
79102
nat nat.NAT
103+
104+
// Track connection failures for auto-rediscovery
105+
consecutiveFailures int // protected by natmu
106+
rediscovering bool // protected by natmu
107+
80108
// External IP of the NAT. Will be renewed periodically (every CacheTime).
81109
extAddr atomic.Pointer[netip.Addr]
82110

83111
refCount sync.WaitGroup
84112
ctx context.Context
85113
ctxCancel context.CancelFunc
86114

87-
mappingmu sync.RWMutex // guards mappings
88-
closed bool
89-
mappings map[entry]int
115+
// Port mappings that should persist across NAT instance changes
116+
mappingmu sync.RWMutex
117+
closed bool // protected by mappingmu
118+
mappings map[entry]int // protected by mappingmu
90119
}
91120

92121
// Close shuts down all port mappings. NAT can no longer be used.
@@ -149,11 +178,14 @@ func (nat *NAT) AddMapping(ctx context.Context, protocol string, port int) error
149178
func (nat *NAT) RemoveMapping(ctx context.Context, protocol string, port int) error {
150179
nat.mappingmu.Lock()
151180
defer nat.mappingmu.Unlock()
181+
nat.natmu.Lock()
182+
defer nat.natmu.Unlock()
152183

153184
switch protocol {
154185
case "tcp", "udp":
155186
e := entry{protocol: protocol, port: port}
156187
if _, ok := nat.mappings[e]; ok {
188+
log.Info("Stopping maintenance of port mapping", "protocol", protocol, "port", port)
157189
delete(nat.mappings, e)
158190
return nat.nat.DeletePortMapping(ctx, protocol, port)
159191
}
@@ -164,6 +196,12 @@ func (nat *NAT) RemoveMapping(ctx context.Context, protocol string, port int) er
164196
}
165197

166198
func (nat *NAT) background() {
199+
// Renew port mappings every 20 seconds (1/3 of 60s lifetime).
200+
// - NAT-PMP RFC 6886 recommends renewing at 50% of lifetime
201+
// - We use 33% for added safety against silent lifetime reductions
202+
// NOTE: This aggressive 60s/20s pattern may be outdated for modern routers
203+
// but provides quick cleanup and fast failure detection for our rediscovery.
204+
// TODO: Research longer durations (e.g. 30min/10min) to reduce router load
167205
const mappingUpdate = MappingDuration / 3
168206

169207
now := time.Now()
@@ -202,8 +240,11 @@ func (nat *NAT) background() {
202240
nextMappingUpdate = time.Now().Add(mappingUpdate)
203241
}
204242
if now.After(nextAddrUpdate) {
205-
var extAddr netip.Addr
243+
nat.natmu.Lock()
206244
extIP, err := nat.nat.GetExternalAddress()
245+
nat.natmu.Unlock()
246+
247+
var extAddr netip.Addr
207248
if err == nil {
208249
extAddr, _ = netip.AddrFromSlice(extIP)
209250
}
@@ -213,13 +254,16 @@ func (nat *NAT) background() {
213254
t.Reset(time.Until(minTime(nextAddrUpdate, nextMappingUpdate)))
214255
case <-nat.ctx.Done():
215256
nat.mappingmu.Lock()
257+
defer nat.mappingmu.Unlock()
258+
nat.natmu.Lock()
259+
defer nat.natmu.Unlock()
260+
216261
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
217262
defer cancel()
218263
for e := range nat.mappings {
219-
delete(nat.mappings, e)
220264
nat.nat.DeletePortMapping(ctx, e.protocol, e.port)
221265
}
222-
nat.mappingmu.Unlock()
266+
clear(nat.mappings)
223267
return
224268
}
225269
}
@@ -229,28 +273,95 @@ func (nat *NAT) establishMapping(ctx context.Context, protocol string, internalP
229273
log.Debug("Attempting port map", "protocol", protocol, "internal_port", internalPort)
230274
const comment = "libp2p"
231275

276+
// Try to establish the mapping with both NAT calls under the same lock
232277
nat.natmu.Lock()
278+
defer nat.natmu.Unlock()
279+
233280
var err error
234281
externalPort, err = nat.nat.AddPortMapping(ctx, protocol, internalPort, comment, MappingDuration)
235282
if err != nil {
236283
// Some hardware does not support mappings with timeout, so try that
237284
externalPort, err = nat.nat.AddPortMapping(ctx, protocol, internalPort, comment, 0)
238285
}
239-
nat.natmu.Unlock()
240286

241-
if err != nil || externalPort == 0 {
242-
if err != nil {
243-
log.Warn("NAT port mapping failed", "protocol", protocol, "internal_port", internalPort, "err", err)
287+
// Handle success
288+
if err == nil && externalPort != 0 {
289+
nat.consecutiveFailures = 0
290+
log.Debug("NAT port mapping established", "protocol", protocol, "internal_port", internalPort, "external_port", externalPort)
291+
return externalPort
292+
}
293+
294+
// Handle failures
295+
if err != nil {
296+
log.Warn("NAT port mapping failed", "protocol", protocol, "internal_port", internalPort, "err", err)
297+
298+
// Check if this is a connection error that might indicate router restart
299+
// See: https://github.com/libp2p/go-libp2p/issues/3224#issuecomment-2866844723
300+
// Note: We use string matching because goupnp doesn't preserve error chains (uses %v instead of %w)
301+
if strings.Contains(err.Error(), "connection refused") {
302+
nat.consecutiveFailures++
303+
if nat.consecutiveFailures >= rediscoveryThreshold && !nat.rediscovering {
304+
nat.rediscovering = true
305+
// Spawn in goroutine to avoid blocking the caller while we
306+
// perform network discovery, which can take up to 30 seconds.
307+
// The rediscovering flag prevents multiple concurrent attempts.
308+
go nat.rediscoverNAT()
309+
}
244310
} else {
245-
log.Warn("NAT port mapping failed", "protocol", protocol, "internal_port", internalPort, "external_port", 0)
311+
// Reset counter for non-connection errors (transient failures)
312+
nat.consecutiveFailures = 0
246313
}
247-
// we do not close if the mapping failed,
248-
// because it may work again next time.
249314
return 0
250315
}
251316

252-
log.Debug("NAT Mapping", "external_port", externalPort, "internal_port", internalPort, "protocol", protocol)
253-
return externalPort
317+
// externalPort is 0 but no error was returned
318+
log.Warn("NAT port mapping failed", "protocol", protocol, "internal_port", internalPort, "external_port", 0)
319+
return 0
320+
}
321+
322+
// rediscoverNAT attempts to rediscover the NAT device after connection failures
323+
func (nat *NAT) rediscoverNAT() {
324+
log.Info("NAT rediscovery triggered due to repeated connection failures")
325+
326+
ctx, cancel := context.WithTimeout(nat.ctx, DiscoveryTimeout)
327+
defer cancel()
328+
329+
newNATInstance, err := discoverGateway(ctx)
330+
if err != nil {
331+
log.Warn("NAT rediscovery failed", "err", err)
332+
nat.natmu.Lock()
333+
defer nat.natmu.Unlock()
334+
nat.rediscovering = false
335+
return
336+
}
337+
338+
extAddr := getExternalAddress(newNATInstance)
339+
340+
// Replace the NAT instance
341+
// No cleanup of the old instance needed because:
342+
// - Router restart has already wiped all mappings
343+
// - Old UPnP endpoint is dead (connection refused)
344+
// - If router didn't actually restart (false positive), any stale mappings
345+
// on the router expire naturally (60 second UPnP timeout)
346+
nat.natmu.Lock()
347+
nat.nat = newNATInstance
348+
nat.extAddr.Store(&extAddr)
349+
nat.consecutiveFailures = 0
350+
nat.rediscovering = false
351+
nat.natmu.Unlock()
352+
353+
// Re-establish all existing mappings on the new NAT instance
354+
nat.mappingmu.Lock()
355+
for e := range nat.mappings {
356+
extPort := nat.establishMapping(nat.ctx, e.protocol, e.port)
357+
nat.mappings[e] = extPort
358+
if extPort != 0 {
359+
log.Info("NAT mapping restored after rediscovery", "protocol", e.protocol, "internal_port", e.port, "external_port", extPort)
360+
}
361+
}
362+
nat.mappingmu.Unlock()
363+
364+
log.Info("NAT rediscovery successful")
254365
}
255366

256367
func minTime(a, b time.Time) time.Time {
@@ -259,3 +370,18 @@ func minTime(a, b time.Time) time.Time {
259370
}
260371
return b
261372
}
373+
374+
// getExternalAddress retrieves and parses the external address from a NAT instance
375+
func getExternalAddress(natInstance nat.NAT) netip.Addr {
376+
extIP, err := natInstance.GetExternalAddress()
377+
if err != nil {
378+
log.Debug("Failed to get external address", "err", err)
379+
return netip.Addr{}
380+
}
381+
extAddr, ok := netip.AddrFromSlice(extIP)
382+
if !ok {
383+
log.Debug("Failed to parse external address", "ip", extIP)
384+
return netip.Addr{}
385+
}
386+
return extAddr
387+
}

0 commit comments

Comments
 (0)