Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SmartSwitch] Enhance PCIe device check to skip the warning log, if device is in detaching mode #546

Merged
merged 23 commits into from
Feb 6, 2025
Merged
Show file tree
Hide file tree
Changes from 22 commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
7a9265c
Skip logging the warning, if device is in detaching mode
vvolam Oct 4, 2024
05ca910
Add detach_info table and unittests
vvolam Nov 5, 2024
f1a6ce2
Merge remote-tracking branch 'origin/master' into ss-reboot
vvolam Nov 12, 2024
4257a65
Fix unit tests
vvolam Nov 12, 2024
13d883f
Increase code coverage
vvolam Nov 13, 2024
a4340d2
Remove unused header import
vvolam Nov 13, 2024
1ab0206
Fix dict get values
vvolam Nov 13, 2024
16a28a3
Merge remote-tracking branch 'origin/master' into ss-reboot
vvolam Nov 18, 2024
039e8fd
Increase code coverage
vvolam Nov 18, 2024
3ca17b0
Increase test coverage
vvolam Nov 18, 2024
f5687d7
[SmartSwitch] Extend implementation of the DPU chassis daemon. (#563)
oleksandrivantsiv Nov 20, 2024
8af10ab
Addition of DPU Chassis for thermalctld (#564)
gpunathilell Nov 27, 2024
6f62340
[stormond] Added new dynamic field 'last_sync_time' to STATE_DB (#535)
assrinivasan Nov 27, 2024
fc28cae
[lag_id] Add lagid to free_list when LC absent for 30 minutes (#542)
mlok-nokia Nov 27, 2024
db19064
Fixed bug in chassisd causing incorrect number of ASICs in CHASSIS_ST…
vivekverma-arista Dec 4, 2024
b290945
thermalctld: Add support for fans on non-CPU modules (#555)
patrickmacarthur Dec 6, 2024
fa2ca45
Advanced Azure pipeline to Bookworm (#572)
assrinivasan Dec 12, 2024
24c659b
Take non-CMIS xcvrs out of lpmode in SFF Manager (#565)
peterbailey-arista Dec 14, 2024
e2cb760
Added SmartSwitch support in chassisd and enabling chassisd (#467)
rameshraghupathy Dec 16, 2024
6d4a859
[chassis][psud] Move the PSU parent information generation to the loo…
yejianquan Dec 17, 2024
564ecd0
[chassisd] Address the chassisd crash issue and add UT for it (#573)
mlok-nokia Dec 18, 2024
2f92262
Merge remote-tracking branch 'public/master' into ss-reboot
vvolam Jan 9, 2025
57e021c
Fix a comment
vvolam Jan 17, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 39 additions & 0 deletions sonic-pcied/scripts/pcied
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,10 @@ SYSLOG_IDENTIFIER = "pcied"
PCIE_RESULT_REGEX = "PCIe Device Checking All Test"
PCIE_DEVICE_TABLE_NAME = "PCIE_DEVICE"
PCIE_STATUS_TABLE_NAME = "PCIE_DEVICES"
# STATE_DB table that DaemonPcied opens as `detach_info` and scans in
# is_dpu_in_detaching_mode() to find DPUs that are being detached.
PCIE_DETACH_INFO_TABLE = "PCIE_DETACH_INFO"

# Fields read from each PCIE_DETACH_INFO entry: the PCIe address compared
# against the failing device, and the DPU state (the check looks for "detaching").
PCIE_DETACH_BUS_INFO_FIELD = "bus_info"
PCIE_DETACH_DPU_STATE_FIELD = "dpu_state"

PCIED_MAIN_THREAD_SLEEP_SECS = 60

Expand Down Expand Up @@ -92,6 +96,7 @@ class DaemonPcied(daemon_base.DaemonBase):
self.state_db = daemon_base.db_connect("STATE_DB")
self.device_table = swsscommon.Table(self.state_db, PCIE_DEVICE_TABLE_NAME)
self.status_table = swsscommon.Table(self.state_db, PCIE_STATUS_TABLE_NAME)
self.detach_info = swsscommon.Table(self.state_db, PCIE_DETACH_INFO_TABLE)
vvolam marked this conversation as resolved.
Show resolved Hide resolved

def __del__(self):
if self.device_table:
Expand All @@ -102,6 +107,10 @@ class DaemonPcied(daemon_base.DaemonBase):
stable_keys = self.status_table.getKeys()
for stk in stable_keys:
self.status_table._del(stk)
if self.detach_info:
detach_info_keys = self.detach_info.getKeys()
for dk in detach_info_keys:
self.detach_info._del(dk)

# load aer-fields into statedb
def update_aer_to_statedb(self):
Expand Down Expand Up @@ -151,6 +160,28 @@ class DaemonPcied(daemon_base.DaemonBase):

self.status_table.set("status", fvs)

# Check whether the given PCIe device is listed as "detaching" in the PCIE_DETACH_INFO table of state_db
vvolam marked this conversation as resolved.
Show resolved Hide resolved
def is_dpu_in_detaching_mode(self, pcie_dev):
    """Return True if `pcie_dev` is recorded as detaching in the detach-info table.

    Every entry of `self.detach_info` is inspected; the device is considered
    to be detaching when an entry's bus-info field equals `pcie_dev` and its
    DPU-state field equals "detaching".

    Args:
        pcie_dev: PCIe address string to look for, e.g. "0000:03:00.0".

    Returns:
        True when a matching "detaching" entry exists, False otherwise
        (including when the table handle is None or the table is empty).
    """
    # Ensure detach_info is not None
    if self.detach_info is None:
        self.log_debug("detach_info is None")
        return False

    # Query the state_db for the device detaching status
    for key in self.detach_info.getKeys() or ():
        dpu_info = self.detach_info.get(key)

        # swsscommon.Table.get() hands back a (found, field_value_pairs)
        # tuple rather than a mapping; normalise it to a dict so the field
        # lookups below work. Mapping-style results are used unchanged.
        if isinstance(dpu_info, (tuple, list)):
            if len(dpu_info) != 2 or not dpu_info[0]:
                # Key vanished between getKeys() and get(), or unexpected shape
                continue
            dpu_info = dict(dpu_info[1])

        if not dpu_info:
            continue

        bus_info = dpu_info.get(PCIE_DETACH_BUS_INFO_FIELD)
        dpu_state = dpu_info.get(PCIE_DETACH_DPU_STATE_FIELD)
        if bus_info == pcie_dev and dpu_state == "detaching":
            return True

    return False

# Check the PCIe devices
def check_pcie_devices(self):
self.resultInfo = platform_pcieutil.get_pcie_check()
Expand All @@ -160,6 +191,14 @@ class DaemonPcied(daemon_base.DaemonBase):

for result in self.resultInfo:
if result["result"] == "Failed":
# Convert bus, device, and function to a bus_info format like "0000:03:00.0".
# The three values are parsed as hex strings; str.format() is required here so
# the placeholders are substituted (a plain string literal is not interpolated).
pcie_dev = "0000:{:02x}:{:02x}.{}".format(
    int(result['bus'], 16), int(result['dev'], 16), int(result['fn'], 16))

# Check if the device is in detaching mode
if device_info.is_smartswitch() and self.is_dpu_in_detaching_mode(pcie_dev):
self.log_debug("PCIe Device: {} is in detaching mode, skipping warning.".format(pcie_dev))
continue

self.log_warning("PCIe Device: " + result["name"] + " Not Found")
err += 1
else:
Expand Down
73 changes: 71 additions & 2 deletions sonic-pcied/tests/test_DaemonPcied.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,17 +143,86 @@ def test_run(self):
daemon_pcied.run()
assert daemon_pcied.check_pcie_devices.call_count == 1

@mock.patch('pcied.load_platform_pcieutil', mock.MagicMock())
def test_is_dpu_in_detaching_mode(self):
    """Exercise is_dpu_in_detaching_mode() against a mocked detach-info table."""
    daemon_pcied = pcied.DaemonPcied(SYSLOG_IDENTIFIER)

    # Built once and shared by the mocked get(); unknown keys yield None.
    detach_entries = {
        'DPU_0': {'bus_info': '0000:03:00.1', 'dpu_state': 'detaching'},
        'DPU_1': {'bus_info': '0000:03:00.2', 'dpu_state': 'attached'},
    }
    daemon_pcied.detach_info = mock.MagicMock()
    daemon_pcied.detach_info.getKeys = mock.MagicMock(return_value=['DPU_0', 'DPU_1'])
    daemon_pcied.detach_info.get = mock.MagicMock(side_effect=detach_entries.get)

    # Test when the device is in detaching mode
    assert daemon_pcied.is_dpu_in_detaching_mode('0000:03:00.1') is True

    # Test when the device is not in detaching mode
    assert daemon_pcied.is_dpu_in_detaching_mode('0000:03:00.2') is False

    # Test when the device does not exist in detach_info
    assert daemon_pcied.is_dpu_in_detaching_mode('0000:03:00.3') is False

    # Test when detach_info is None
    daemon_pcied.detach_info = None
    assert daemon_pcied.is_dpu_in_detaching_mode('0000:03:00.1') is False

    # Test when detach_info has no keys
    daemon_pcied.detach_info = mock.MagicMock()
    daemon_pcied.detach_info.getKeys.return_value = []
    assert daemon_pcied.is_dpu_in_detaching_mode('0000:03:00.1') is False

@mock.patch('pcied.device_info.is_smartswitch', mock.MagicMock(return_value=False))
@mock.patch('pcied.DaemonPcied.is_dpu_in_detaching_mode', mock.MagicMock(return_value=False))
@mock.patch('pcied.load_platform_pcieutil', mock.MagicMock())
def test_check_pcie_devices(self):
    """A failed device on a non-SmartSwitch platform is reported with a warning."""
    daemon_pcied = pcied.DaemonPcied(SYSLOG_IDENTIFIER)
    daemon_pcied.update_pcie_devices_status_db = mock.MagicMock()
    daemon_pcied.check_n_update_pcie_aer_stats = mock.MagicMock()
    # Capture warnings so the "Not Found" report can be asserted on.
    daemon_pcied.log_warning = mock.MagicMock()
    pcied.platform_pcieutil.get_pcie_check = mock.MagicMock(
        return_value=[
            {"result": "Failed", "bus": "03", "dev": "00", "fn": "1", "name": "PCIe Device 1"},
        ]
    )

    daemon_pcied.check_pcie_devices()
    assert daemon_pcied.update_pcie_devices_status_db.call_count == 1
    assert daemon_pcied.check_n_update_pcie_aer_stats.call_count == 0
    daemon_pcied.log_warning.assert_any_call("PCIe Device: PCIe Device 1 Not Found")

@mock.patch('pcied.device_info.is_smartswitch', mock.MagicMock(return_value=False))
@mock.patch('pcied.DaemonPcied.is_dpu_in_detaching_mode', mock.MagicMock(return_value=False))
@mock.patch('pcied.load_platform_pcieutil', mock.MagicMock())
def test_check_pcie_devices_update_aer(self):
    """A device whose check passes leads to one status-db update and one AER update."""
    daemon = pcied.DaemonPcied(SYSLOG_IDENTIFIER)
    status_db_update = mock.MagicMock()
    aer_stats_update = mock.MagicMock()
    daemon.update_pcie_devices_status_db = status_db_update
    daemon.check_n_update_pcie_aer_stats = aer_stats_update

    # Single device reported as healthy by the platform PCIe utility
    passing_device = {"result": "Passed", "bus": "03", "dev": "00", "fn": "1", "name": "PCIe Device 1"}
    pcied.platform_pcieutil.get_pcie_check = mock.MagicMock(return_value=[passing_device])

    daemon.check_pcie_devices()

    assert status_db_update.call_count == 1
    assert aer_stats_update.call_count == 1

@mock.patch('pcied.device_info.is_smartswitch', mock.MagicMock(return_value=True))
@mock.patch('pcied.DaemonPcied.is_dpu_in_detaching_mode', mock.MagicMock(return_value=True))
@mock.patch('pcied.load_platform_pcieutil', mock.MagicMock())
def test_check_pcie_devices_detaching(self):
    """A failed device that is detaching on a SmartSwitch must not be warned about."""
    daemon_pcied = pcied.DaemonPcied(SYSLOG_IDENTIFIER)
    daemon_pcied.update_pcie_devices_status_db = mock.MagicMock()
    daemon_pcied.check_n_update_pcie_aer_stats = mock.MagicMock()
    # Capture log output so the suppression of the warning can be verified.
    daemon_pcied.log_warning = mock.MagicMock()
    daemon_pcied.log_debug = mock.MagicMock()
    pcied.platform_pcieutil.get_pcie_check = mock.MagicMock(
        return_value=[
            {"result": "Failed", "bus": "03", "dev": "00", "fn": "1", "name": "PCIe Device 1"},
        ]
    )

    daemon_pcied.check_pcie_devices()
    assert daemon_pcied.update_pcie_devices_status_db.call_count == 1
    assert daemon_pcied.check_n_update_pcie_aer_stats.call_count == 0

    # The detach lookup was consulted for the failed device ...
    assert daemon_pcied.is_dpu_in_detaching_mode.call_count == 1
    # ... and, because it reported "detaching", no "Not Found" warning was logged.
    not_found_warnings = [
        logged for logged in daemon_pcied.log_warning.call_args_list
        if "Not Found" in str(logged)
    ]
    assert not_found_warnings == []

@mock.patch('pcied.load_platform_pcieutil', mock.MagicMock())
def test_update_pcie_devices_status_db(self):
Expand Down Expand Up @@ -210,5 +279,5 @@ def test_update_aer_to_statedb(self):
])
"""

daemon_pcied.update_aer_to_statedb()
daemon_pcied.update_aer_to_statedb()
assert daemon_pcied.log_debug.call_count == 0
Loading