Commit ee734d1
Fix stuck data column lookups by improving peer selection and retry logic (#8005)
Fixes the issue described in #7980 where Lighthouse repeatedly sends `DataColumnsByRoot` requests to the same peers that return empty responses, causing sync to get stuck. The root cause is that empty responses are not counted as failures, which leads to excessive retries against unresponsive peers.

- Track per-peer attempts to limit retries per peer (`MAX_CUSTODY_PEER_ATTEMPTS = 3`).
- Replace random peer selection with hashing within each lookup, to avoid splitting a lookup into too many small requests and to improve request batching efficiency.
- Add a `single_block_lookup` root span to track all lookups created, plus more debug logs (screenshot: https://github.com/user-attachments/assets/983629ba-b6d0-41cf-8e93-88a5b96c2f31).

Co-Authored-By: Jimmy Chen <[email protected]>
Co-Authored-By: Jimmy Chen <[email protected]>
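To make the selection change concrete, here is a minimal, self-contained sketch of the idea (the `pick_peer` helper and peer names are hypothetical, not code from this PR): peers over the attempt cap are filtered out, and the remaining peers are ranked with a per-lookup `RandomState` hash as the tie-breaker, so every column in the same lookup ranks peers identically (requests batch together) while different lookups still vary their choice.

```rust
use std::collections::HashMap;
use std::hash::{BuildHasher, RandomState};

/// Hypothetical stand-in for `PeerId`.
type Peer = &'static str;

/// Pick one peer for a column. Reusing the same `RandomState` for every column in a
/// lookup makes ties resolve identically, so all columns tend to land on the same few
/// peers (one batched request per peer) instead of being split across many peers.
fn pick_peer(
    peers: &[Peer],
    attempts: &HashMap<Peer, usize>,
    max_attempts: usize,
    hasher: &RandomState,
) -> Option<Peer> {
    peers
        .iter()
        // Skip peers we have already asked too many times.
        .filter(|p| attempts.get(*p).copied().unwrap_or(0) <= max_attempts)
        // Fewer previous attempts wins; the per-lookup hash breaks ties deterministically.
        .min_by_key(|p| (attempts.get(*p).copied().unwrap_or(0), hasher.hash_one(p)))
        .copied()
}

fn main() {
    let peers = ["peer-a", "peer-b", "peer-c"];
    let attempts = HashMap::from([("peer-a", 4usize)]); // peer-a has exceeded the cap
    let hasher = RandomState::new(); // one hasher per lookup
    for column in 0..4u64 {
        // Every column sees the same ordering, so requests batch naturally.
        println!("column {column} -> {:?}", pick_peer(&peers, &attempts, 3, &hasher));
    }
}
```

In the actual change the hash is only the final tie-breaker, after `lookup_peers` membership, attempt count, and active request count (see `select_column_peer` in `custody.rs` below).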
1 parent 8ec2640 commit ee734d1

File tree: 6 files changed (+117, -67 lines)
beacon_node/lighthouse_tracing/src/lib.rs

Lines changed: 3 additions & 1 deletion

@@ -17,6 +17,8 @@ pub const SPAN_PROCESS_GOSSIP_BLOCK: &str = "process_gossip_block";
 /// Sync methods root spans
 pub const SPAN_SYNCING_CHAIN: &str = "syncing_chain";
 pub const SPAN_OUTGOING_RANGE_REQUEST: &str = "outgoing_range_request";
+pub const SPAN_SINGLE_BLOCK_LOOKUP: &str = "single_block_lookup";
+pub const SPAN_OUTGOING_BLOCK_BY_ROOT_REQUEST: &str = "outgoing_block_by_root_request";
 pub const SPAN_OUTGOING_CUSTODY_REQUEST: &str = "outgoing_custody_request";
 pub const SPAN_PROCESS_RPC_BLOCK: &str = "process_rpc_block";
 pub const SPAN_PROCESS_RPC_BLOBS: &str = "process_rpc_blobs";
@@ -46,7 +48,7 @@ pub const LH_BN_ROOT_SPAN_NAMES: &[&str] = &[
     SPAN_PROCESS_GOSSIP_BLOB,
     SPAN_PROCESS_GOSSIP_BLOCK,
     SPAN_OUTGOING_RANGE_REQUEST,
-    SPAN_OUTGOING_CUSTODY_REQUEST,
+    SPAN_SINGLE_BLOCK_LOOKUP,
     SPAN_PROCESS_RPC_BLOCK,
     SPAN_PROCESS_RPC_BLOBS,
     SPAN_PROCESS_RPC_CUSTODY_COLUMNS,

beacon_node/network/src/sync/block_lookups/mod.rs

Lines changed: 1 addition & 0 deletions

@@ -384,6 +384,7 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
         // If we know that this lookup has unknown parent (is awaiting a parent lookup to resolve),
         // signal here to hold processing downloaded data.
         let mut lookup = SingleBlockLookup::new(block_root, peers, cx.next_id(), awaiting_parent);
+        let _guard = lookup.span.clone().entered();

         // Add block components to the new request
         if let Some(block_component) = block_component {

beacon_node/network/src/sync/block_lookups/single_block_lookup.rs

Lines changed: 12 additions & 0 deletions

@@ -7,13 +7,15 @@ use crate::sync::network_context::{
 use beacon_chain::{BeaconChainTypes, BlockProcessStatus};
 use derivative::Derivative;
 use lighthouse_network::service::api_types::Id;
+use lighthouse_tracing::SPAN_SINGLE_BLOCK_LOOKUP;
 use parking_lot::RwLock;
 use std::collections::HashSet;
 use std::fmt::Debug;
 use std::sync::Arc;
 use std::time::{Duration, Instant};
 use store::Hash256;
 use strum::IntoStaticStr;
+use tracing::{Span, debug_span};
 use types::blob_sidecar::FixedBlobSidecarList;
 use types::{DataColumnSidecarList, EthSpec, SignedBeaconBlock, Slot};

@@ -70,6 +72,7 @@ pub struct SingleBlockLookup<T: BeaconChainTypes> {
     block_root: Hash256,
     awaiting_parent: Option<Hash256>,
     created: Instant,
+    pub(crate) span: Span,
 }

 #[derive(Debug)]
@@ -89,6 +92,12 @@ impl<T: BeaconChainTypes> SingleBlockLookup<T> {
         id: Id,
         awaiting_parent: Option<Hash256>,
     ) -> Self {
+        let lookup_span = debug_span!(
+            SPAN_SINGLE_BLOCK_LOOKUP,
+            block_root = %requested_block_root,
+            id = id,
+        );
+
         Self {
             id,
             block_request_state: BlockRequestState::new(requested_block_root),
@@ -97,6 +106,7 @@ impl<T: BeaconChainTypes> SingleBlockLookup<T> {
             block_root: requested_block_root,
             awaiting_parent,
             created: Instant::now(),
+            span: lookup_span,
         }
     }

@@ -192,6 +202,7 @@ impl<T: BeaconChainTypes> SingleBlockLookup<T> {
         &mut self,
         cx: &mut SyncNetworkContext<T>,
     ) -> Result<LookupResult, LookupRequestError> {
+        let _guard = self.span.clone().entered();
         // TODO: Check what's necessary to download, specially for blobs
         self.continue_request::<BlockRequestState<T::EthSpec>>(cx, 0)?;

@@ -257,6 +268,7 @@ impl<T: BeaconChainTypes> SingleBlockLookup<T> {
         // that can make progress so it must be dropped. Consider the lookup completed.
         // This case can happen if we receive the components from gossip during a retry.
         if self.all_components_processed() {
+            self.span = Span::none();
             Ok(LookupResult::Completed)
         } else {
             Ok(LookupResult::Pending)
beacon_node/network/src/sync/network_context.rs

Lines changed: 7 additions & 3 deletions

@@ -29,7 +29,7 @@ use lighthouse_network::service::api_types::{
     DataColumnsByRootRequester, Id, SingleLookupReqId, SyncRequestId,
 };
 use lighthouse_network::{Client, NetworkGlobals, PeerAction, PeerId, ReportSource};
-use lighthouse_tracing::SPAN_OUTGOING_RANGE_REQUEST;
+use lighthouse_tracing::{SPAN_OUTGOING_BLOCK_BY_ROOT_REQUEST, SPAN_OUTGOING_RANGE_REQUEST};
 use parking_lot::RwLock;
 pub use requests::LookupVerifyError;
 use requests::{
@@ -886,15 +886,19 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
             "Sync RPC request sent"
         );

+        let request_span = debug_span!(
+            parent: Span::current(),
+            SPAN_OUTGOING_BLOCK_BY_ROOT_REQUEST,
+            %block_root,
+        );
         self.blocks_by_root_requests.insert(
             id,
             peer_id,
             // true = enforce max_requests as returned for blocks_by_root. We always request a single
             // block and the peer must have it.
             true,
             BlocksByRootRequestItems::new(request),
-            // Not implemented
-            Span::none(),
+            request_span,
         );

         Ok(LookupRequestResult::RequestSent(id.req_id))

beacon_node/network/src/sync/network_context/custody.rs

Lines changed: 91 additions & 62 deletions

@@ -7,19 +7,17 @@ use fnv::FnvHashMap;
 use lighthouse_network::PeerId;
 use lighthouse_network::service::api_types::{CustodyId, DataColumnsByRootRequester};
 use lighthouse_tracing::SPAN_OUTGOING_CUSTODY_REQUEST;
-use lru_cache::LRUTimeCache;
 use parking_lot::RwLock;
-use rand::Rng;
 use std::collections::HashSet;
+use std::hash::{BuildHasher, RandomState};
 use std::time::{Duration, Instant};
 use std::{collections::HashMap, marker::PhantomData, sync::Arc};
-use tracing::{Span, debug, debug_span, field, warn};
+use tracing::{Span, debug, debug_span, warn};
 use types::{DataColumnSidecar, Hash256, data_column_sidecar::ColumnIndex};
 use types::{DataColumnSidecarList, EthSpec};

 use super::{LookupRequestResult, PeerGroup, RpcResponseResult, SyncNetworkContext};

-const FAILED_PEERS_CACHE_EXPIRY_SECONDS: u64 = 5;
 const MAX_STALE_NO_PEERS_DURATION: Duration = Duration::from_secs(30);

 pub struct ActiveCustodyRequest<T: BeaconChainTypes> {
@@ -30,9 +28,7 @@ pub struct ActiveCustodyRequest<T: BeaconChainTypes> {
     /// Active requests for 1 or more columns each
     active_batch_columns_requests:
         FnvHashMap<DataColumnsByRootRequestId, ActiveBatchColumnsRequest>,
-    /// Peers that have recently failed to successfully respond to a columns by root request.
-    /// Having a LRUTimeCache allows this request to not have to track disconnecting peers.
-    failed_peers: LRUTimeCache<PeerId>,
+    peer_attempts: HashMap<PeerId, usize>,
     /// Set of peers that claim to have imported this block and their custody columns
     lookup_peers: Arc<RwLock<HashSet<PeerId>>>,
     /// Span for tracing the lifetime of this request.
@@ -71,7 +67,11 @@ impl<T: BeaconChainTypes> ActiveCustodyRequest<T> {
         column_indices: &[ColumnIndex],
         lookup_peers: Arc<RwLock<HashSet<PeerId>>>,
     ) -> Self {
-        let span = debug_span!(parent: None, SPAN_OUTGOING_CUSTODY_REQUEST, %block_root);
+        let span = debug_span!(
+            parent: Span::current(),
+            SPAN_OUTGOING_CUSTODY_REQUEST,
+            %block_root,
+        );
         Self {
             block_root,
             custody_id,
@@ -81,7 +81,7 @@ impl<T: BeaconChainTypes> ActiveCustodyRequest<T> {
                     .map(|index| (*index, ColumnRequest::new())),
             ),
             active_batch_columns_requests: <_>::default(),
-            failed_peers: LRUTimeCache::new(Duration::from_secs(FAILED_PEERS_CACHE_EXPIRY_SECONDS)),
+            peer_attempts: HashMap::new(),
             lookup_peers,
             span,
             _phantom: PhantomData,
@@ -170,13 +170,6 @@ impl<T: BeaconChainTypes> ActiveCustodyRequest<T> {
                         ?missing_column_indexes,
                         "Custody column peer claims to not have some data"
                     );
-
-                    batch_request.span.record(
-                        "missing_column_indexes",
-                        field::debug(missing_column_indexes),
-                    );
-
-                    self.failed_peers.insert(peer_id);
                 }
             }
             Err(err) => {
@@ -195,13 +188,6 @@ impl<T: BeaconChainTypes> ActiveCustodyRequest<T> {
                         .ok_or(Error::BadState("unknown column_index".to_owned()))?
                         .on_download_error_and_mark_failure(req_id)?;
                 }
-
-                batch_request.span.record(
-                    "missing_column_indexes",
-                    field::debug(&batch_request.indices),
-                );
-
-                self.failed_peers.insert(peer_id);
             }
         };

@@ -238,52 +224,29 @@ impl<T: BeaconChainTypes> ActiveCustodyRequest<T> {
         let active_request_count_by_peer = cx.active_request_count_by_peer();
         let mut columns_to_request_by_peer = HashMap::<PeerId, Vec<ColumnIndex>>::new();
         let lookup_peers = self.lookup_peers.read();
+        // Create deterministic hasher per request to ensure consistent peer ordering within
+        // this request (avoiding fragmentation) while varying selection across different requests
+        let random_state = RandomState::new();

-        // Need to:
-        // - track how many active requests a peer has for load balancing
-        // - which peers have failures to attempt others
-        // - which peer returned what to have PeerGroup attributability
-
-        for (column_index, request) in self.column_requests.iter_mut() {
+        for (column_index, request) in self.column_requests.iter() {
             if let Some(wait_duration) = request.is_awaiting_download() {
+                // Note: an empty response is considered a successful response, so we may end up
+                // retrying many more times than `MAX_CUSTODY_COLUMN_DOWNLOAD_ATTEMPTS`.
                 if request.download_failures > MAX_CUSTODY_COLUMN_DOWNLOAD_ATTEMPTS {
                     return Err(Error::TooManyFailures);
                 }

-                // TODO(das): When is a fork and only a subset of your peers know about a block, we should
-                // only query the peers on that fork. Should this case be handled? How to handle it?
-                let custodial_peers = cx.get_custodial_peers(*column_index);
+                let peer_to_request = self.select_column_peer(
+                    cx,
+                    &active_request_count_by_peer,
+                    &lookup_peers,
+                    *column_index,
+                    &random_state,
+                );

-                // We draw from the total set of peers, but prioritize those peers who we have
-                // received an attestation / status / block message claiming to have imported the
-                // lookup. The frequency of those messages is low, so drawing only from lookup_peers
-                // could cause many lookups to take much longer or fail as they don't have enough
-                // custody peers on a given column
-                let mut priorized_peers = custodial_peers
-                    .iter()
-                    .map(|peer| {
-                        (
-                            // Prioritize peers that claim to know have imported this block
-                            if lookup_peers.contains(peer) { 0 } else { 1 },
-                            // De-prioritize peers that have failed to successfully respond to
-                            // requests recently
-                            self.failed_peers.contains(peer),
-                            // Prefer peers with fewer requests to load balance across peers.
-                            // We batch requests to the same peer, so count existence in the
-                            // `columns_to_request_by_peer` as a single 1 request.
-                            active_request_count_by_peer.get(peer).copied().unwrap_or(0)
-                                + columns_to_request_by_peer.get(peer).map(|_| 1).unwrap_or(0),
-                            // Random factor to break ties, otherwise the PeerID breaks ties
-                            rand::rng().random::<u32>(),
-                            *peer,
-                        )
-                    })
-                    .collect::<Vec<_>>();
-                priorized_peers.sort_unstable();
-
-                if let Some((_, _, _, _, peer_id)) = priorized_peers.first() {
+                if let Some(peer_id) = peer_to_request {
                     columns_to_request_by_peer
-                        .entry(*peer_id)
+                        .entry(peer_id)
                         .or_default()
                         .push(*column_index);
                 } else if wait_duration > MAX_STALE_NO_PEERS_DURATION {
@@ -298,6 +261,23 @@ impl<T: BeaconChainTypes> ActiveCustodyRequest<T> {
             }
         }

+        let peer_requests = columns_to_request_by_peer.len();
+        if peer_requests > 0 {
+            let columns_requested_count = columns_to_request_by_peer
+                .values()
+                .map(|v| v.len())
+                .sum::<usize>();
+            debug!(
+                lookup_peers = lookup_peers.len(),
+                "Requesting {} columns from {} peers", columns_requested_count, peer_requests,
+            );
+        } else {
+            debug!(
+                lookup_peers = lookup_peers.len(),
+                "No column peers found for look up",
+            );
+        }
+
         for (peer_id, indices) in columns_to_request_by_peer.into_iter() {
             let request_result = cx
                 .data_column_lookup_request(
@@ -317,8 +297,14 @@ impl<T: BeaconChainTypes> ActiveCustodyRequest<T> {

             match request_result {
                 LookupRequestResult::RequestSent(req_id) => {
+                    *self.peer_attempts.entry(peer_id).or_insert(0) += 1;
+
                     let client = cx.network_globals().client(&peer_id).kind;
-                    let batch_columns_req_span = debug_span!("batch_columns_req", %peer_id, %client, missing_column_indexes = tracing::field::Empty);
+                    let batch_columns_req_span = debug_span!(
+                        "batch_columns_req",
+                        %peer_id,
+                        %client,
+                    );
                     let _guard = batch_columns_req_span.clone().entered();
                     for column_index in &indices {
                         let column_request = self
@@ -345,11 +331,54 @@ impl<T: BeaconChainTypes> ActiveCustodyRequest<T> {

         Ok(None)
     }
+
+    fn select_column_peer(
+        &self,
+        cx: &mut SyncNetworkContext<T>,
+        active_request_count_by_peer: &HashMap<PeerId, usize>,
+        lookup_peers: &HashSet<PeerId>,
+        column_index: ColumnIndex,
+        random_state: &RandomState,
+    ) -> Option<PeerId> {
+        // We draw from the total set of peers, but prioritize those peers who we have
+        // received an attestation or a block from (`lookup_peers`), as the `lookup_peers` may take
+        // time to build up and we are likely to not find any column peers initially.
+        let custodial_peers = cx.get_custodial_peers(column_index);
+        let mut prioritized_peers = custodial_peers
+            .iter()
+            .filter(|peer| {
+                // Exclude peers that we have already made too many attempts to.
+                self.peer_attempts.get(peer).copied().unwrap_or(0) <= MAX_CUSTODY_PEER_ATTEMPTS
+            })
+            .map(|peer| {
+                (
+                    // Prioritize peers that claim to know have imported this block
+                    if lookup_peers.contains(peer) { 0 } else { 1 },
+                    // De-prioritize peers that we have already attempted to download from
+                    self.peer_attempts.get(peer).copied().unwrap_or(0),
+                    // Prefer peers with fewer requests to load balance across peers.
+                    active_request_count_by_peer.get(peer).copied().unwrap_or(0),
+                    // The hash ensures consistent peer ordering within this request
+                    // to avoid fragmentation while varying selection across different requests.
+                    random_state.hash_one(peer),
+                    *peer,
+                )
+            })
+            .collect::<Vec<_>>();
+        prioritized_peers.sort_unstable();
+
+        prioritized_peers
+            .first()
+            .map(|(_, _, _, _, peer_id)| *peer_id)
+    }
 }

 /// TODO(das): this attempt count is nested into the existing lookup request count.
 const MAX_CUSTODY_COLUMN_DOWNLOAD_ATTEMPTS: usize = 3;

+/// Max number of attempts to request custody columns from a single peer.
+const MAX_CUSTODY_PEER_ATTEMPTS: usize = 3;
+
 struct ColumnRequest<E: EthSpec> {
     status: Status<E>,
     download_failures: usize,

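The tie-breaker above leans on a property of the standard library's `RandomState`: a single instance hashes the same key to the same value every time, while separately created instances are seeded independently. A standalone sketch (the peer string is a made-up stand-in for a `PeerId`):

```rust
use std::hash::{BuildHasher, RandomState};

fn main() {
    let per_lookup = RandomState::new();
    let peer = "16Uiu2HAm-example-peer"; // stand-in for a PeerId
    // Same RandomState => same hash, so every column in this lookup ranks peers
    // in the same order and the per-peer requests batch together.
    assert_eq!(per_lookup.hash_one(peer), per_lookup.hash_one(peer));
    // A different RandomState (another lookup) is seeded independently, so peer
    // ordering still varies across lookups and load spreads between peers.
    let another_lookup = RandomState::new();
    println!("{} vs {}", per_lookup.hash_one(peer), another_lookup.hash_one(peer));
}
```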
consensus/types/src/data_column_subnet_id.rs

Lines changed: 3 additions & 1 deletion

@@ -1,13 +1,15 @@
 //! Identifies each data column subnet by an integer identifier.
 use crate::ChainSpec;
 use crate::data_column_sidecar::ColumnIndex;
+use derivative::Derivative;
 use safe_arith::{ArithError, SafeArith};
 use serde::{Deserialize, Serialize};
 use std::fmt::{self, Display};
 use std::ops::{Deref, DerefMut};

 #[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
-#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
+#[derive(Clone, Copy, Derivative, PartialEq, Eq, Hash, Serialize, Deserialize)]
+#[derivative(Debug = "transparent")]
 #[serde(transparent)]
 pub struct DataColumnSubnetId(#[serde(with = "serde_utils::quoted_u64")] u64);

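The `#[derivative(Debug = "transparent")]` attribute makes the derived `Debug` delegate to the inner `u64`, so subnet IDs print as bare numbers rather than `DataColumnSubnetId(…)`, which keeps column lists in the new debug output compact. A minimal sketch with a hypothetical `SubnetId` newtype, assuming the `derivative` crate (already imported by this file):

```rust
use derivative::Derivative;

// Transparent Debug: formatting delegates to the single inner field.
#[derive(Clone, Copy, Derivative)]
#[derivative(Debug = "transparent")]
pub struct SubnetId(u64);

fn main() {
    let ids = vec![SubnetId(3), SubnetId(17)];
    // Prints `[3, 17]` instead of `[SubnetId(3), SubnetId(17)]`.
    println!("{:?}", ids);
}
```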