Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix snmp agent not-responding issue when high CPU utilization #345

Merged
merged 5 commits into from
Jan 14, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions src/ax_interface/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@


class Agent:
def __init__(self, mib_cls, update_frequency, loop):
def __init__(self, mib_cls, enable_dynamic_frequency, update_frequency, loop):
if not type(mib_cls) is MIBMeta:
raise ValueError("Expected a class with type: {}".format(MIBMeta))

Expand All @@ -20,7 +20,7 @@ def __init__(self, mib_cls, update_frequency, loop):
self.stopped = asyncio.Event()

# Initialize our MIB
self.mib_table = MIBTable(mib_cls, update_frequency)
self.mib_table = MIBTable(mib_cls, enable_dynamic_frequency, update_frequency)

# containers
self.socket_mgr = SocketManager(self.mib_table, self.run_enabled, self.loop)
Expand Down
5 changes: 5 additions & 0 deletions src/ax_interface/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,3 +90,8 @@ class PduTypes(int, Enum):


DEFAULT_PDU_TIMEOUT = 5

# MIBUpdater rate: Interval/Execution time
UPDATE_FREQUENCY_RATE = 10
# MIBUpdater max update interval
MAX_UPDATE_INTERVAL = 60
44 changes: 42 additions & 2 deletions src/ax_interface/mib.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
import asyncio
import bisect
import random
from datetime import datetime

from . import logger, util
from .constants import ValueType
from .encodings import ValueRepresentation
from .util import get_next_update_interval

"""
Update interval between update runs (in seconds).
Expand All @@ -16,6 +18,11 @@
"""
DEFAULT_REINIT_RATE = 60

"""
Disable dynamic frequency by default
"""
DEFAULT_ENABLE_DYNAMIC_FREQUENCY = False


class MIBUpdater:
"""
Expand All @@ -25,13 +32,15 @@ class MIBUpdater:
def __init__(self):
self.run_event = asyncio.Event()
self.frequency = DEFAULT_UPDATE_FREQUENCY
self.enable_dynamic_frequency = DEFAULT_ENABLE_DYNAMIC_FREQUENCY
self.reinit_rate = DEFAULT_REINIT_RATE // DEFAULT_UPDATE_FREQUENCY
self.update_counter = self.reinit_rate + 1 # reinit_data when init

async def start(self):
# Run the update while we are allowed
redis_exception_happen = False
while self.run_event.is_set():
start = datetime.now()
try:
# reinit internal structures
if self.update_counter > self.reinit_rate:
Expand All @@ -57,9 +66,36 @@ async def start(self):
# Any unexpected exception or error, log it and keep running
logger.exception("MIBUpdater.start() caught an unexpected exception during update_data()")

if self.enable_dynamic_frequency:
"""
On SONiC device with huge interfaces
for example RP card on ethernet chassis, including backend ports, 600+ interfaces
The update_data function could be very slow, especially when 100% CPU utilization.
for example ciscoSwitchQosMIB.QueueStatUpdater, uses 1-3 seconds on normal state.
uses 3-8 seconds on 100% CPU utilization state.
We use Asyncio/Coroutine as the basic framework,
the mib updaters share the same asyncio event loop with the SNMP agent client.
Hence during the updaters executing, the agent client can't receive/respond to new requests,

The high frequency and the long execution time
causes the SNMP request to be timed out on High CPU utilization.
The stable frequency(generally with default value 5s)
doesn't works well on this huge interfaces situation.
when the execution time is long,
wait for longer time to give back the control of asyncio event loop to SNMP agent
"""
execution_time = (datetime.now() - start).total_seconds()
next_frequency = get_next_update_interval(execution_time, self.frequency)

if next_frequency > self.frequency:
logger.debug(f"MIBUpdater type[{type(self)}] slow update detected, "
f"update execution time[{execution_time}], next_frequency[{next_frequency}]")
else:
next_frequency = self.frequency

# wait based on our update frequency before executing again.
# randomize to avoid concurrent update storms.
await asyncio.sleep(self.frequency + random.randint(-2, 2))
await asyncio.sleep(next_frequency + random.randint(-2, 2))

def reinit_data(self):
"""
Expand Down Expand Up @@ -275,10 +311,13 @@ class MIBTable(dict):
Simplistic LUT for Get/GetNext OID. Interprets iterables as keys and implements the same interfaces as dict's.
"""

def __init__(self, mib_cls, update_frequency=DEFAULT_UPDATE_FREQUENCY):
def __init__(self, mib_cls,
enable_dynamic_frequency=DEFAULT_ENABLE_DYNAMIC_FREQUENCY,
update_frequency=DEFAULT_UPDATE_FREQUENCY):
if type(mib_cls) is not MIBMeta:
raise ValueError("Supplied object is not a MIB class instance.")
super().__init__(getattr(mib_cls, MIBMeta.KEYSTORE))
self.enable_dynamic_frequency = enable_dynamic_frequency
self.update_frequency = update_frequency
self.updater_instances = getattr(mib_cls, MIBMeta.UPDATERS)
self.prefixes = getattr(mib_cls, MIBMeta.PREFIXES)
Expand All @@ -296,6 +335,7 @@ def start_background_tasks(self, event):
tasks = []
for updater in self.updater_instances:
updater.frequency = self.update_frequency
updater.enable_dynamic_frequency = self.enable_dynamic_frequency
updater.run_event = event
fut = asyncio.ensure_future(updater.start())
fut.add_done_callback(MIBTable._done_background_task_callback)
Expand Down
29 changes: 29 additions & 0 deletions src/ax_interface/util.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import ipaddress
import math
import re

from ax_interface import constants
Expand Down Expand Up @@ -108,3 +109,31 @@ def ip2byte_tuple(ip):
"""
return tuple(i for i in ipaddress.ip_address(ip).packed)


def get_next_update_interval(execution_time, static_frequency):
"""
>>> get_next_update_interval(0.4, 5)
5
>>> get_next_update_interval(0.87, 5)
9
>>> get_next_update_interval(18.88, 5)
60


:param static_frequency: Static frequency, generally use default value 5
:param execution_time: The execution time of the updater
:return: the interval before next update

We expect the rate of 'update interval'/'update execution time' >= UPDATE_FREQUENCY_RATE(10)
Because we're using asyncio/Coroutines, the update execution blocks SNMP proxy service and other updaters.
Generally we expect the update to be quick and the execution time/interval time < 0.25
Given the static_frequency == 5,
if the execution_time < 0.5,
the update interval is(for example) 1.1s
It sleeps 1.1s * 10 = 11s before run next update

"""
frequency_based_on_execution_time = math.ceil(execution_time * constants.UPDATE_FREQUENCY_RATE)
frequency_based_on_execution_time = min(frequency_based_on_execution_time, constants.MAX_UPDATE_INTERVAL)

return max(static_frequency, frequency_based_on_execution_time)
7 changes: 4 additions & 3 deletions src/sonic_ax_impl/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,12 @@
import shutil
import sys

import sonic_py_common.util
from swsscommon import swsscommon

import ax_interface
import sonic_ax_impl

from .utils.arg_parser import process_options
from . import mibs

LOG_FORMAT = "snmp-subagent [%(name)s] %(levelname)s: %(message)s"
Expand Down Expand Up @@ -70,7 +71,7 @@ def logging_level_to_swss_level(log_level):
sys.exit(0)

# import command line arguments
args = sonic_py_common.util.process_options("sonic_ax_impl")
args = process_options("sonic_ax_impl")

# configure logging. If debug '-d' is specified, logs to stdout at designated level. syslog/INFO otherwise.
log_level = log_level_sdk = args.get('log_level')
Expand Down Expand Up @@ -110,4 +111,4 @@ def logging_level_to_swss_level(log_level):

from .main import main

main(update_frequency=args.get('update_frequency'))
main(enable_dynamic_frequency=args.get('enable_dynamic_frequency'), update_frequency=args.get('update_frequency'))
4 changes: 2 additions & 2 deletions src/sonic_ax_impl/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,14 +54,14 @@ def shutdown(signame, agent):
shutdown_task = event_loop.create_task(agent.shutdown())


def main(update_frequency=None):
def main(enable_dynamic_frequency=False, update_frequency=None):
global event_loop

try:
Namespace.init_sonic_db_config()

# initialize handler and set update frequency (or use the default)
agent = ax_interface.Agent(SonicMIB, update_frequency or DEFAULT_UPDATE_FREQUENCY, event_loop)
agent = ax_interface.Agent(SonicMIB, enable_dynamic_frequency, update_frequency or DEFAULT_UPDATE_FREQUENCY, event_loop)

# add "shutdown" signal handlers
# https://docs.python.org/3.5/library/asyncio-eventloop.html#set-signal-handlers-for-sigint-and-sigterm
Expand Down
Empty file.
39 changes: 39 additions & 0 deletions src/sonic_ax_impl/utils/arg_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
from __future__ import print_function
import sys
from getopt import getopt


def usage(script_name):
print('Usage: python ', script_name,
'-t [host] -p [port] -s [unix_socket_path] -d [logging_level] -f [update_frequency] -r [enable_dynamic_frequency] -h [help]')


def process_options(script_name):
"""
Process command line options
"""
options, remainders = getopt(sys.argv[1:], "t:p:s:d:f:rh", ["host=", "port=", "unix_socket_path=", "debug=", "frequency=", "enable_dynamic_frequency", "help"])

args = {}
for (opt, arg) in options:
try:
if opt in ('-d', '--debug'):
args['log_level'] = int(arg)
elif opt in ('-t', '--host'):
args['host'] = arg
elif opt in ('-p', '--port'):
args['port'] = int(arg)
elif opt in ('-s', '--unix_socket_path'):
args['unix_socket_path'] = arg
elif opt in ('-f', '--frequency'):
args['update_frequency'] = int(arg)
elif opt in ('-r', '--enable_dynamic_frequency'):
args['enable_dynamic_frequency'] = True
elif opt in ('-h', '--help'):
usage(script_name)
sys.exit(0)
except ValueError as e:
print('Invalid option for {}: {}'.format(opt, e))
sys.exit(1)

return args
2 changes: 1 addition & 1 deletion tests/test_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,6 @@ async def delayed_shutdown(self, agent):

def test_agent_loop(self):
event_loop = asyncio.get_event_loop()
agent = ax_interface.Agent(SonicMIB, 5, event_loop)
agent = ax_interface.Agent(SonicMIB, False, 5, event_loop)
event_loop.create_task(self.delayed_shutdown(agent))
event_loop.run_until_complete(agent.run_in_event_loop())
Loading
Loading