Skip to content

Commit 397ef24

Browse files
committed
Update EEPROM access retry mechanism to retry only on I2C errors reported by the kernel
1 parent 65b2cc1 commit 397ef24

File tree

2 files changed

+110
-43
lines changed

2 files changed

+110
-43
lines changed

platform/mellanox/mlnx-platform-api/sonic_platform/sfp.py

Lines changed: 109 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
import os
3131
import threading
3232
import time
33+
import errno
3334
from sonic_py_common.logger import Logger
3435
from sonic_py_common.general import check_output_pipe
3536
from . import utils
@@ -207,6 +208,7 @@
207208

208209
MAX_ATTEMPTS = 50
209210
RETRY_SLEEP_SEC = 0.1
211+
EEPROM_RETRY_ERR_THRESHOLD = 10
210212

211213
STATE_DOWN = 'Down' # Initial state
212214
STATE_INIT = 'Initializing' # Module starts initializing, check module present, also power on the module if need
@@ -468,24 +470,44 @@ def wait_sfp_eeprom_ready(cls, sfp_list, wait_time):
468470

469471
def read_eeprom(self, offset, num_bytes, log_on_error=True):
470472
"""
471-
Read eeprom specific bytes beginning from a random offset with size as num_bytes. Tries up to 50 times total on every 0.1s.
473+
Read eeprom specific bytes beginning from a random offset with size as num_bytes.
474+
Retries up to 50 times in total (every 0.1s), but only if previous attempts failed due to I2C errors
475+
(errno.EIO, typically reported as -5 from the kernel).
476+
472477
Returns:
473-
bytearray, if raw sequence of bytes are read correctly from the offset of size num_bytes
474-
None, if the read_eeprom fails
478+
bytearray: If the data was successfully read.
479+
None: If all attempts failed (whether due to I2C errors after max retries, or due to other errors without retry).
475480
"""
476481
for attempt in range(MAX_ATTEMPTS):
477-
result = self._read_eeprom(offset, num_bytes, log_on_error)
482+
result, err = self._read_eeprom(offset, num_bytes, log_on_error)
478483
if result is not None:
479-
logger.log_notice(
484+
logger.log_debug(
480485
f"EEPROM read success after attempt {attempt + 1}/{MAX_ATTEMPTS} "
481486
f"(sfp={self.sdk_index}, offset={offset}, size={num_bytes})")
482487
return result
483-
if attempt < MAX_ATTEMPTS - 1: # only sleep if another retry will happen
484-
logger.log_notice(
485-
f"EEPROM read attempt {attempt + 1}/{MAX_ATTEMPTS} failed "
488+
489+
log_func = (logger.log_error if attempt + 1 > EEPROM_RETRY_ERR_THRESHOLD else logger.log_debug)
490+
491+
# Retry only on EIO (-5)
492+
if err == errno.EIO and attempt < MAX_ATTEMPTS - 1:
493+
log_func(
494+
f"EEPROM read attempt {attempt + 1}/{MAX_ATTEMPTS} failed with I2C error "
486495
f"(sfp={self.sdk_index}, offset={offset}, size={num_bytes}). "
487496
f"Retrying in {RETRY_SLEEP_SEC:.1f}s...")
488497
time.sleep(RETRY_SLEEP_SEC)
498+
continue
499+
500+
# Non-I2C-error or last attempt → stop retrying
501+
if err is not None:
502+
log_func(
503+
f"EEPROM read failed (errno={err}, {os.strerror(err)}) "
504+
f"after attempt {attempt + 1}/{MAX_ATTEMPTS} "
505+
f"(sfp={self.sdk_index}, offset={offset}, size={num_bytes})")
506+
else:
507+
log_func(
508+
f"EEPROM read failed after attempt {attempt + 1}/{MAX_ATTEMPTS} "
509+
f"(sfp={self.sdk_index}, offset={offset}, size={num_bytes})")
510+
return None
489511

490512
if log_on_error:
491513
logger.log_error(
@@ -503,26 +525,29 @@ def _read_eeprom(self, offset, num_bytes, log_on_error=True):
503525
log_on_error (bool, optional): whether log error when exception occurs. Defaults to True.
504526
505527
Returns:
506-
bytearray: the content of EEPROM
528+
(bytearray, None): On success, returns the data read and None for errno.
529+
(None, errno): On failure due to a specific OS error, returns None and the errno value.
530+
(None, None): On failure without an associated errno (e.g., empty page or invalid page).
507531
"""
508532
result = bytearray(0)
509533
while num_bytes > 0:
510534
_, page, page_offset = self._get_page_and_page_offset(offset)
511535
if not page:
512-
return None
536+
return None, None
513537

514538
try:
515539
with open(page, mode='rb', buffering=0) as f:
516540
f.seek(page_offset)
517541
content = f.read(num_bytes)
542+
read_length = len(content)
543+
if read_length == 0:
544+
logger.log_error(f'SFP {self.sdk_index}: EEPROM page {page} is empty, no data retrieved')
545+
return None, None
546+
518547
if not result:
519548
result = content
520549
else:
521550
result += content
522-
read_length = len(content)
523-
if read_length == 0:
524-
logger.log_error(f'SFP {self.sdk_index}: EEPROM page {page} is empty, no data retrieved')
525-
return None
526551
num_bytes -= read_length
527552
if num_bytes > 0:
528553
page_size = f.seek(0, os.SEEK_END)
@@ -531,48 +556,76 @@ def _read_eeprom(self, offset, num_bytes, log_on_error=True):
531556
else:
532557
# Indicate read finished
533558
num_bytes = 0
559+
534560
if ctypes.get_errno() != 0:
535-
raise IOError(f'errno = {os.strerror(ctypes.get_errno())}')
561+
return None, ctypes.get_errno()
536562

537563
logger.log_debug(
538564
f"read EEPROM sfp={self.sdk_index}, page={page}, page_offset={page_offset}, "
539565
f"size={read_length}, data={content}"
540566
)
541567

542-
except (OSError, IOError) as e:
568+
except OSError as e:
543569
if log_on_error:
544-
logger.log_warning(
545-
f"Failed to read sfp={self.sdk_index} EEPROM page={page}, page_offset={page_offset}, "
546-
f"size={num_bytes}, offset={offset}, error = {e}"
547-
)
548-
return None
570+
if e.errno is not None:
571+
logger.log_warning(
572+
f"Failed to read sfp={self.sdk_index} EEPROM page={page}, page_offset={page_offset}, "
573+
f"size={num_bytes}, offset={offset}, error={e} "
574+
f"(errno={e.errno}, {os.strerror(e.errno)})"
575+
)
576+
else:
577+
logger.log_warning(
578+
f"Failed to read sfp={self.sdk_index} EEPROM page={page}, page_offset={page_offset}, "
579+
f"size={num_bytes}, offset={offset}, error={e}"
580+
)
581+
return None, e.errno
582+
583+
return bytearray(result), None
549584

550-
return bytearray(result)
551585

552586
def write_eeprom(self, offset, num_bytes, write_buffer):
553587
"""
554-
write eeprom specific bytes beginning from a random offset with size as num_bytes
555-
and write_buffer as the required bytes. Tries up to 50 times total on every 0.1s.
588+
Write EEPROM specific bytes beginning from a random offset with size as num_bytes
589+
and write_buffer as the required bytes. Retries up to 50 times (every 0.1s) only if
590+
previous attempts failed due to I2C errors (errno.EIO).
591+
556592
Returns:
557-
Boolean, true if the write succeeded and false if it did not succeed.
593+
Boolean, True if the write succeeded, False if it did not succeed.
558594
"""
559595
if num_bytes != len(write_buffer):
560596
logger.log_error("Error mismatch between buffer length and number of bytes to be written")
561597
return False
562598

563599
for attempt in range(MAX_ATTEMPTS):
564-
ret = self._write_eeprom(offset, num_bytes, write_buffer)
600+
ret, err = self._write_eeprom(offset, num_bytes, write_buffer)
565601
if ret:
566-
logger.log_notice(
602+
logger.log_debug(
567603
f"EEPROM write success after attempt {attempt + 1}/{MAX_ATTEMPTS} "
568604
f"for sfp={self.sdk_index}, offset={offset}, size={num_bytes}")
569605
return True
570-
logger.log_notice(
571-
f"EEPROM write attempt {attempt + 1}/{MAX_ATTEMPTS} failed "
572-
f"for sfp={self.sdk_index}, offset={offset}, size={num_bytes}. "
573-
f"Retrying in {RETRY_SLEEP_SEC:.1f}s..."
574-
)
575-
time.sleep(RETRY_SLEEP_SEC)
606+
607+
log_func = (logger.log_error if attempt + 1 > EEPROM_RETRY_ERR_THRESHOLD else logger.log_debug)
608+
609+
# Retry only on EIO (-5)
610+
if err == errno.EIO and attempt < MAX_ATTEMPTS - 1:
611+
log_func(
612+
f"EEPROM write attempt {attempt + 1}/{MAX_ATTEMPTS} failed with I2C error "
613+
f"for sfp={self.sdk_index}, offset={offset}, size={num_bytes}. "
614+
f"Retrying in {RETRY_SLEEP_SEC:.1f}s...")
615+
time.sleep(RETRY_SLEEP_SEC)
616+
continue
617+
618+
# Non-I2C-error or last attempt → stop retrying
619+
if err is not None:
620+
log_func(
621+
f"EEPROM write failed (errno={err}, {os.strerror(err)}) "
622+
f"for sfp={self.sdk_index}, offset={offset}, size={num_bytes} "
623+
f"after attempt {attempt + 1}/{MAX_ATTEMPTS}")
624+
else:
625+
log_func(
626+
f"EEPROM write failed for sfp={self.sdk_index}, offset={offset}, size={num_bytes} "
627+
f"after attempt {attempt + 1}/{MAX_ATTEMPTS}")
628+
return False
576629

577630
logger.log_error(
578631
f"EEPROM write failed after {MAX_ATTEMPTS} attempts "
@@ -585,19 +638,23 @@ def _write_eeprom(self, offset, num_bytes, write_buffer):
585638
Single-attempt write: write eeprom specific bytes beginning from a random offset with size as num_bytes
586639
and write_buffer as the required bytes
587640
Returns:
588-
Boolean, true if the write succeeded and false if it did not succeed.
641+
(True, None): On success.
642+
(False, errno): On failure with a specific OS error.
643+
(False, None): On failure without an associated errno (e.g., unexpected short write).
644+
589645
Example:
590646
mlxreg -d /dev/mst/mt52100_pciconf0 --reg_name MCIA --indexes slot_index=0,module=1,device_address=154,page_number=5,i2c_device_address=0x50,size=1,bank_number=0 --set dword[0]=0x01000000 -y
591647
"""
592648
while num_bytes > 0:
593649
page_num, page, page_offset = self._get_page_and_page_offset(offset)
594650
if not page:
595-
return False
651+
return False, None
596652

597653
try:
598654
if self._is_write_protected(page_num, page_offset, num_bytes):
599655
# write limited eeprom is not supported
600-
raise IOError('write limited bytes')
656+
raise OSError(errno.EPERM, 'write limited bytes')
657+
601658
with open(page, mode='r+b', buffering=0) as f:
602659
f.seek(page_offset)
603660
ret = f.write(write_buffer[0:num_bytes])
@@ -609,19 +666,29 @@ def _write_eeprom(self, offset, num_bytes, write_buffer):
609666
write_buffer = write_buffer[ret:num_bytes]
610667
offset += ret
611668
else:
612-
raise IOError(f'write return code = {ret}')
669+
logger.log_error(
670+
f"Unexpected short write (ret={ret} < {num_bytes}) "
671+
f"for sfp={self.sdk_index}, page={page}, page_offset={page_offset}, offset={offset}"
672+
)
673+
return False, None
613674
num_bytes -= ret
614675
if ctypes.get_errno() != 0:
615-
raise IOError(f'errno = {os.strerror(ctypes.get_errno())}')
616-
logger.log_debug('write EEPROM sfp={}, page={}, page_offset={}, size={}, left={}, data={}'.format(self.sdk_index, page, page_offset, ret, num_bytes, written_buffer))
617-
except (OSError, IOError) as e:
676+
return False, ctypes.get_errno()
677+
678+
logger.log_debug(
679+
'write EEPROM sfp={}, page={}, page_offset={}, size={}, left={}, data={}'
680+
.format(self.sdk_index, page, page_offset, ret, num_bytes, written_buffer)
681+
)
682+
683+
except OSError as e:
618684
data = ''.join('{:02x}'.format(x) for x in write_buffer)
619685
logger.log_error(
620686
f"Failed to write EEPROM: sfp={self.sdk_index}, page={page}, page_offset={page_offset}, "
621687
f"size={num_bytes}, offset={offset}, data={data}, error={e}"
622688
)
623-
return False
624-
return True
689+
return False, e.errno
690+
691+
return True, None
625692

626693
def get_lpmode(self):
627694
"""

platform/mellanox/mlnx-platform-api/tests/test_sfp.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -245,7 +245,7 @@ def test_get_page_and_page_offset(self, mock_get_type_str, mock_eeprom_path, moc
245245
assert page_offset is 0
246246

247247
@mock.patch('sonic_platform.utils.read_int_from_file')
248-
@mock.patch('sonic_platform.sfp.SFP._read_eeprom')
248+
@mock.patch('sonic_platform.sfp.SFP.read_eeprom')
249249
def test_sfp_get_presence(self, mock_read, mock_read_int):
250250
sfp = SFP(0)
251251

0 commit comments

Comments
 (0)