Skip to content
207 changes: 163 additions & 44 deletions platform/mellanox/mlnx-platform-api/sonic_platform/sfp.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
import os
import threading
import time
import errno
from sonic_py_common.logger import Logger
from sonic_py_common.general import check_output_pipe
from . import utils
Expand Down Expand Up @@ -205,6 +206,10 @@
CMIS_MCI_EEPROM_OFFSET = 2
CMIS_MCI_MASK = 0b00001100

MAX_ATTEMPTS = 5
RETRY_SLEEP_SEC = 0.1
EEPROM_RETRY_ERR_THRESHOLD = 2

STATE_DOWN = 'Down' # Initial state
STATE_INIT = 'Initializing' # Module starts initializing, check module present, also power on the module if need
STATE_RESETTING = 'Resetting' # Module is resetting the firmware
Expand Down Expand Up @@ -441,17 +446,14 @@ def get_presence(self):
bool: True if device is present, False if not
"""
presence_sysfs = f'/sys/module/sx_core/asic0/module{self.sdk_index}/hw_present' if self.is_sw_control() else f'/sys/module/sx_core/asic0/module{self.sdk_index}/present'
if utils.read_int_from_file(presence_sysfs) != 1:
return False
eeprom_raw = self._read_eeprom(0, 1, log_on_error=False)
return eeprom_raw is not None
return utils.read_int_from_file(presence_sysfs) == 1

@classmethod
def wait_sfp_eeprom_ready(cls, sfp_list, wait_time):
not_ready_list = sfp_list

while wait_time > 0:
not_ready_list = [s for s in not_ready_list if s.state == STATE_FW_CONTROL and s._read_eeprom(0, 2,False) is None]
not_ready_list = [s for s in not_ready_list if s.state == STATE_FW_CONTROL and s._read_eeprom(0, 1,False) is None]
if not_ready_list:
time.sleep(0.1)
wait_time -= 0.1
Expand All @@ -461,45 +463,86 @@ def wait_sfp_eeprom_ready(cls, sfp_list, wait_time):
for s in not_ready_list:
logger.log_error(f'SFP {s.sdk_index} eeprom is not ready')

# read eeprom specfic bytes beginning from offset with size as num_bytes
def read_eeprom(self, offset, num_bytes):
def read_eeprom(self, offset, num_bytes, log_on_error=True):
"""
Read eeprom specfic bytes beginning from a random offset with size as num_bytes
Read eeprom specific bytes beginning from a random offset with size as num_bytes.
Retries up to 5 times in total (every 0.1s), but only if previous attempts failed due to I2C errors
(errno.EIO, typically reported as -5 from the kernel).

Returns:
bytearray, if raw sequence of bytes are read correctly from the offset of size num_bytes
None, if the read_eeprom fails
"""
return self._read_eeprom(offset, num_bytes)
bytearray: If the data was successfully read.
None: If all attempts failed (whether due to I2C errors after max retries, or due to other errors without retry).
"""
for attempt in range(MAX_ATTEMPTS):
result, err = self._read_eeprom(offset, num_bytes, log_on_error)
if result is not None:
if attempt > 0:
logger.log_notice(
f"EEPROM read success after attempt {attempt + 1}/{MAX_ATTEMPTS} "
f"(sfp={self.sdk_index}, offset={offset}, size={num_bytes})")
return result

log_func = (logger.log_error if attempt + 1 > EEPROM_RETRY_ERR_THRESHOLD else logger.log_debug)

# Retry only on EIO (-5)
if err == errno.EIO and attempt < MAX_ATTEMPTS - 1:
log_func(
f"EEPROM read attempt {attempt + 1}/{MAX_ATTEMPTS} failed with I2C error "
f"(sfp={self.sdk_index}, offset={offset}, size={num_bytes}). "
f"Retrying in {RETRY_SLEEP_SEC:.1f}s...")
time.sleep(RETRY_SLEEP_SEC)
continue

# Non-I2C-error or last attempt → stop retrying
if err is not None:
log_func(
f"EEPROM read failed (errno={err}, {os.strerror(err)}) "
f"after attempt {attempt + 1}/{MAX_ATTEMPTS} "
f"(sfp={self.sdk_index}, offset={offset}, size={num_bytes})")
else:
log_func(
f"EEPROM read failed after attempt {attempt + 1}/{MAX_ATTEMPTS} "
f"(sfp={self.sdk_index}, offset={offset}, size={num_bytes})")
return None

logger.log_error(
f"EEPROM read failed after {MAX_ATTEMPTS} attempts "
f"(sfp={self.sdk_index}, offset={offset}, size={num_bytes})")
return None

def _read_eeprom(self, offset, num_bytes, log_on_error=True):
"""Read eeprom specfic bytes beginning from a random offset with size as num_bytes
"""
Single-attempt read: Read eeprom specific bytes beginning from a random offset with size as num_bytes

Args:
offset (int): read offset
num_bytes (int): read size
log_on_error (bool, optional): whether log error when exception occurs. Defaults to True.

Returns:
bytearray: the content of EEPROM
(bytearray, None): On success, returns the data read and None for errno.
(None, errno): On failure due to a specific OS error, returns None and the errno value.
(None, None): On failure without an associated errno (e.g., empty page or invalid page).
"""
result = bytearray(0)
while num_bytes > 0:
_, page, page_offset = self._get_page_and_page_offset(offset)
if not page:
return None
return None, None

try:
with open(page, mode='rb', buffering=0) as f:
f.seek(page_offset)
content = f.read(num_bytes)
read_length = len(content)
if read_length == 0:
logger.log_error(f'SFP {self.sdk_index}: EEPROM page {page} is empty, no data retrieved')
return None, None

if not result:
result = content
else:
result += content
read_length = len(content)
if read_length == 0:
logger.log_error(f'SFP {self.sdk_index}: EEPROM page {page} is empty, no data retrieved')
return None
num_bytes -= read_length
if num_bytes > 0:
page_size = f.seek(0, os.SEEK_END)
Expand All @@ -508,41 +551,106 @@ def _read_eeprom(self, offset, num_bytes, log_on_error=True):
else:
# Indicate read finished
num_bytes = 0

if ctypes.get_errno() != 0:
raise IOError(f'errno = {os.strerror(ctypes.get_errno())}')
logger.log_debug(f'read EEPROM sfp={self.sdk_index}, page={page}, page_offset={page_offset}, '\
f'size={read_length}, data={content}')
except (OSError, IOError) as e:
return None, ctypes.get_errno()

logger.log_debug(
f"read EEPROM sfp={self.sdk_index}, page={page}, page_offset={page_offset}, "
f"size={read_length}, data={content}"
)

except OSError as e:
if log_on_error:
logger.log_warning(f'Failed to read sfp={self.sdk_index} EEPROM page={page}, page_offset={page_offset}, '\
f'size={num_bytes}, offset={offset}, error = {e}')
return None
if e.errno is not None:
logger.log_warning(
f"Failed to read sfp={self.sdk_index} EEPROM page={page}, page_offset={page_offset}, "
f"size={num_bytes}, offset={offset}, error={e} "
f"(errno={e.errno}, {os.strerror(e.errno)})"
)
else:
logger.log_warning(
f"Failed to read sfp={self.sdk_index} EEPROM page={page}, page_offset={page_offset}, "
f"size={num_bytes}, offset={offset}, error={e}"
)
return None, e.errno

return bytearray(result), None

return bytearray(result)

# write eeprom specfic bytes beginning from offset with size as num_bytes
def write_eeprom(self, offset, num_bytes, write_buffer):
"""
write eeprom specfic bytes beginning from a random offset with size as num_bytes
and write_buffer as the required bytes
Write EEPROM specific bytes beginning from a random offset with size as num_bytes
and write_buffer as the required bytes. Retries up to 5 times (every 0.1s) only if
previous attempts failed due to I2C errors (errno.EIO).

Returns:
Boolean, true if the write succeeded and false if it did not succeed.
Example:
mlxreg -d /dev/mst/mt52100_pciconf0 --reg_name MCIA --indexes slot_index=0,module=1,device_address=154,page_number=5,i2c_device_address=0x50,size=1,bank_number=0 --set dword[0]=0x01000000 -y
Boolean, True if the write succeeded, False if it did not succeed.
"""
if num_bytes != len(write_buffer):
logger.log_error("Error mismatch between buffer length and number of bytes to be written")
return False

for attempt in range(MAX_ATTEMPTS):
ret, err = self._write_eeprom(offset, num_bytes, write_buffer)
if ret:
if attempt > 0:
logger.log_notice(
f"EEPROM write success after attempt {attempt + 1}/{MAX_ATTEMPTS} "
f"for sfp={self.sdk_index}, offset={offset}, size={num_bytes}")
return True

log_func = (logger.log_error if attempt + 1 > EEPROM_RETRY_ERR_THRESHOLD else logger.log_debug)

# Retry only on EIO (-5)
if err == errno.EIO and attempt < MAX_ATTEMPTS - 1:
log_func(
f"EEPROM write attempt {attempt + 1}/{MAX_ATTEMPTS} failed with I2C error "
f"for sfp={self.sdk_index}, offset={offset}, size={num_bytes}. "
f"Retrying in {RETRY_SLEEP_SEC:.1f}s...")
time.sleep(RETRY_SLEEP_SEC)
continue

# Non-I2C-error or last attempt → stop retrying
if err is not None:
log_func(
f"EEPROM write failed (errno={err}, {os.strerror(err)}) "
f"for sfp={self.sdk_index}, offset={offset}, size={num_bytes} "
f"after attempt {attempt + 1}/{MAX_ATTEMPTS}")
else:
log_func(
f"EEPROM write failed for sfp={self.sdk_index}, offset={offset}, size={num_bytes} "
f"after attempt {attempt + 1}/{MAX_ATTEMPTS}")
return False

logger.log_error(
f"EEPROM write failed after {MAX_ATTEMPTS} attempts "
f"for sfp={self.sdk_index}, offset={offset}, size={num_bytes}"
)
return False

def _write_eeprom(self, offset, num_bytes, write_buffer):
"""
Single-attempt write: write eeprom specific bytes beginning from a random offset with size as num_bytes
and write_buffer as the required bytes
Returns:
(True, None): On success.
(False, errno): On failure with a specific OS error.
(False, None): On failure without an associated errno (e.g., unexpected short write).

Example:
mlxreg -d /dev/mst/mt52100_pciconf0 --reg_name MCIA --indexes slot_index=0,module=1,device_address=154,page_number=5,i2c_device_address=0x50,size=1,bank_number=0 --set dword[0]=0x01000000 -y
"""
while num_bytes > 0:
page_num, page, page_offset = self._get_page_and_page_offset(offset)
if not page:
return False
return False, None

try:
if self._is_write_protected(page_num, page_offset, num_bytes):
# write limited eeprom is not supported
raise IOError('write limited bytes')
raise OSError(errno.EPERM, 'write limited bytes')

with open(page, mode='r+b', buffering=0) as f:
f.seek(page_offset)
ret = f.write(write_buffer[0:num_bytes])
Expand All @@ -554,18 +662,29 @@ def write_eeprom(self, offset, num_bytes, write_buffer):
write_buffer = write_buffer[ret:num_bytes]
offset += ret
else:
raise IOError(f'write return code = {ret}')
logger.log_error(
f"Unexpected short write (ret={ret} < {num_bytes}) "
f"for sfp={self.sdk_index}, page={page}, page_offset={page_offset}, offset={offset}"
)
return False, None
num_bytes -= ret
if ctypes.get_errno() != 0:
raise IOError(f'errno = {os.strerror(ctypes.get_errno())}')
logger.log_debug(f'write EEPROM sfp={self.sdk_index}, page={page}, page_offset={page_offset}, '\
f'size={ret}, left={num_bytes}, data={written_buffer}')
except (OSError, IOError) as e:
return False, ctypes.get_errno()

logger.log_debug(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@tshalvi can we move this debug log to the beginning so that we do'nt miss the log if there is error a line 674?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I’d prefer to keep the debug log after the write, since the goal is to record the actual completed write.
Moving it before the write would log an attempted write, even if it later fails, which could be misleading.

'write EEPROM sfp={}, page={}, page_offset={}, size={}, left={}, data={}'
.format(self.sdk_index, page, page_offset, ret, num_bytes, written_buffer)
)

except OSError as e:
data = ''.join('{:02x}'.format(x) for x in write_buffer)
logger.log_error(f'Failed to write EEPROM data sfp={self.sdk_index} EEPROM page={page}, page_offset={page_offset}, size={num_bytes}, '\
f'offset={offset}, data = {data}, error = {e}')
return False
return True
logger.log_error(
f"Failed to write EEPROM: sfp={self.sdk_index}, page={page}, page_offset={page_offset}, "
f"size={num_bytes}, offset={offset}, data={data}, error={e}"
)
return False, e.errno

return True, None

def get_lpmode(self):
"""
Expand Down
9 changes: 1 addition & 8 deletions platform/mellanox/mlnx-platform-api/tests/test_sfp.py
Original file line number Diff line number Diff line change
Expand Up @@ -245,20 +245,13 @@ def test_get_page_and_page_offset(self, mock_get_type_str, mock_eeprom_path, moc
assert page_offset is 0

@mock.patch('sonic_platform.utils.read_int_from_file')
@mock.patch('sonic_platform.sfp.SFP._read_eeprom')
def test_sfp_get_presence(self, mock_read, mock_read_int):
def test_sfp_get_presence(self, mock_read_int):
sfp = SFP(0)

mock_read_int.return_value = 1
mock_read.return_value = None
assert not sfp.get_presence()
mock_read.return_value = 0
assert sfp.get_presence()

mock_read_int.return_value = 0
mock_read.return_value = None
assert not sfp.get_presence()
mock_read.return_value = 0
assert not sfp.get_presence()

@mock.patch('sonic_platform.utils.read_int_from_file')
Expand Down