From a1833d0fa54f585db386ec572ff5ceeb8bab2f03 Mon Sep 17 00:00:00 2001 From: Sebastian Wagner Date: Thu, 4 Apr 2024 12:34:04 +0200 Subject: [PATCH 1/2] Remove ordering code for external interface IPs `external_fixed_ips` was ordered to become a stable primary IP address for the interface. This code is not necessary anymore, as if there are more than 1 fixed IPs on the external interface, `ASR1KPluginBase._update_router_gw_info()` will record these IPs in the NAT pool RouterAtts. `GatewayInterface._ip_address()` will then again use this information and determine the stable primary IP of the GatwayInterface. Consequently an ordering is not needed at this place anymore. This code was also making sure that each supplied fixed IP got a prefixlen. I am unsure how it would even get at this place, without a prefixlen, as everything needs to have a subnet which determines the prefixlen. --- .../service_plugins/l3_extension_adapter.py | 28 ------------------- 1 file changed, 28 deletions(-) diff --git a/asr1k_neutron_l3/plugins/l3/service_plugins/l3_extension_adapter.py b/asr1k_neutron_l3/plugins/l3/service_plugins/l3_extension_adapter.py index 7eaffbcc..d18dbf57 100644 --- a/asr1k_neutron_l3/plugins/l3/service_plugins/l3_extension_adapter.py +++ b/asr1k_neutron_l3/plugins/l3/service_plugins/l3_extension_adapter.py @@ -269,34 +269,6 @@ def get_sync_data(self, context, router_ids=None, active=None, host=None): router_att = router_atts.get(router['id'], {}) router[constants.ASR1K_ROUTER_ATTS_KEY] = router_att - # Make sure the gateway IPs all have prefixes and are sorted consistently - # this is to prevent foo when we have to assign to nat pool, because we - # can guarantee a consistent order from neutron and we can't change the - # pool on the active device and it has (currently) to be different from - # the interface device. - - gw_info = router.get('external_gateway_info', None) - gw_port = router.get('gw_port', None) - if gw_port is not None: - ips = gw_port.get('fixed_ips', []) - prefixes = {} - if bool(ips): - for ip in ips: - prefix = ip.get('prefixlen', None) - subnet_id = ip.get('subnet_id', None) - if prefix is not None and subnet_id is not None: - prefixes[subnet_id] = prefix - - for ip in ips: - if ip.get('prefixlen', None) is None: - prefix = prefixes.get(ip.get('subnet_id', None)) - if prefix is not None: - ip['prefixlen'] = prefix - - gw_port['fixed_ips'] = sorted(ips, key=lambda k: k.get('ip_address')) - if gw_info is not None: - gw_info['external_fixed_ips'] = gw_port['fixed_ips'] - rt_import = [] rt_export = [] bgpvpns = self.db.get_bgpvpns_by_router_id(context, router['id']) From e592e81e93e537e9f5184e3733fce24cdaeac908 Mon Sep 17 00:00:00 2001 From: Sebastian Wagner Date: Mon, 15 Apr 2024 17:16:44 +0200 Subject: [PATCH 2/2] [WIP] Prevent uncleaned external interface IP to cause firmware race condition The current firmware mal-allocates an external interfaces NAT pool (the pointer to the hash-table, I presume) if there is an external interface with the same IP in another VRF. We believe this is erroneous behavior as VRFs are supposed to carry overlapping IP spaces, yet _the vendor_ argues otherwise. Generally we would like to maintain the support of overlapping NAT overload IPs at least from a driver perspective as theoretically this would be valid in OpenStack with multiple adress scopes, never the less it is unlikely to be seen in our infra. This commit implements a preflight check, checking if any other outside interface is present that carries the same IP as our to be programmed interface. If that is, we must check if that is an uncleaned residue from a VRF that is still to be cleaned or if it is a valid interface. If it is not, we delete that interface and program the new interface with the matching IP, thereby preventing the mal-allocation of the internal NAT pool. --- asr1k_neutron_l3/common/utils.py | 14 +++++++++- .../models/netconf_yang/l3_interface.py | 10 +++++++ .../models/neutron/l3/interface.py | 28 +++++++++---------- asr1k_neutron_l3/models/neutron/l3/router.py | 2 +- asr1k_neutron_l3/plugins/db/asr1k_db.py | 19 +++++++++++++ .../plugins/l3/agents/asr1k_l3_agent.py | 1 - .../service_plugins/l3_extension_adapter.py | 10 +++++++ 7 files changed, 66 insertions(+), 18 deletions(-) diff --git a/asr1k_neutron_l3/common/utils.py b/asr1k_neutron_l3/common/utils.py index bfac4f6d..99ab19cf 100644 --- a/asr1k_neutron_l3/common/utils.py +++ b/asr1k_neutron_l3/common/utils.py @@ -18,7 +18,7 @@ import socket import struct -from netaddr import IPNetwork, IPAddress +from netaddr import IPAddress, IPNetwork, IPRange, IPSet from oslo_log import log as logging @@ -51,6 +51,18 @@ def get_router_ports(router): return router_ports +def determine_external_interface_ip_with_nat_pool(external_fixed_ips, dynamic_nat_pool): + if dynamic_nat_pool is None: + return ValueError("dynamic_nat_pool cannot be None") + + ips, _ = dynamic_nat_pool.split("/") + start_ip, end_ip = ips.split("-") + ip_pool = IPSet(IPRange(start_ip, end_ip)) + for n_fixed_ip in external_fixed_ips: + if n_fixed_ip['ip_address'] not in ip_pool: + return n_fixed_ip + + def uuid_to_vrf_id(uuid): return uuid.replace('-', '') diff --git a/asr1k_neutron_l3/models/netconf_yang/l3_interface.py b/asr1k_neutron_l3/models/netconf_yang/l3_interface.py index 812debda..41e68e1a 100644 --- a/asr1k_neutron_l3/models/netconf_yang/l3_interface.py +++ b/asr1k_neutron_l3/models/netconf_yang/l3_interface.py @@ -121,6 +121,13 @@ def get_all_stub_filter(cls, context): def get_for_vrf(cls, context, vrf=None): return cls._get_all(context=context, xpath_filter=cls.VRF_XPATH_FILTER.format(vrf=vrf, iftype=context.bd_iftype)) + + def _preflight_cleanup_needed(self, context): + BD_VIF_IP_FILTER = f'/native/interface/BD-VIF/ip[address/primary/address="{self.ip_address}"]' + bd_vifs = cls._get_all(context=context, xpath_filter=BD_VIF_IP_FILTER) + + return len(bd_vifs) > 1 + @classmethod def __parameters__(cls): @@ -151,6 +158,7 @@ def __parameters__(cls): ] def __init__(self, **kwargs): + self.preempt_ip_cleanup = kwargs.pop('preempt_ip_cleanup', True) super(VBInterface, self).__init__(**kwargs) if int(self.mtu) < self.MIN_MTU: self.mtu = str(self.MIN_MTU) @@ -176,6 +184,8 @@ def preflight(self, context): if bdi: LOG.debug("Removing BDI%s from %s before adding new BD-VIF%s", bdi.name, context.host, self.name) bdi._delete(context=nobdvif_context, postflight=False) # disable postflight checks + + if self.preempt_ip_cleanup and self._preflight_cleanup_needed() def to_dict(self, context): vbi = OrderedDict() diff --git a/asr1k_neutron_l3/models/neutron/l3/interface.py b/asr1k_neutron_l3/models/neutron/l3/interface.py index cc9bd33f..7fb4f290 100644 --- a/asr1k_neutron_l3/models/neutron/l3/interface.py +++ b/asr1k_neutron_l3/models/neutron/l3/interface.py @@ -124,8 +124,9 @@ def enable_nat(self): class GatewayInterface(Interface): - def __init__(self, router_id, router_port, extra_atts, dynamic_nat_pool): + def __init__(self, router_id, router_port, extra_atts, dynamic_nat_pool, preempt_ip_cleanup): self.dynamic_nat_pool = dynamic_nat_pool + self.preempt_ip_cleanup = preempt_ip_cleanup super(GatewayInterface, self).__init__(router_id, router_port, extra_atts) self.nat_address = self._nat_address() @@ -139,28 +140,25 @@ def __init__(self, router_id, router_port, extra_atts, dynamic_nat_pool): ip_address=self.ip_address, secondary_ip_addresses=self.secondary_ip_addresses, nat_outside=True, redundancy_group=None, route_map='EXT-TOS', access_group_out='EXT-TOS', - ntp_disable=True, arp_timeout=cfg.CONF.asr1k_l3.external_iface_arp_timeout) + ntp_disable=True, arp_timeout=cfg.CONF.asr1k_l3.external_iface_arp_timeout + preempt_ip_cleanup=self.preempt_ip_cleanup) def _ip_address(self): if self.dynamic_nat_pool is None or not self.router_port.get('fixed_ips'): return super()._ip_address() - - ips, _ = self.dynamic_nat_pool.split("/") - start_ip, end_ip = ips.split("-") - ip_pool = netaddr.IPSet(netaddr.IPRange(start_ip, end_ip)) - for n_fixed_ip in self.router_port['fixed_ips']: - if n_fixed_ip['ip_address'] not in ip_pool: - break - else: + + fixed_ip = utils.determine_external_interface_ip_with_nat_pool(self.router_port.get('fixed_ips'), + self.dynamic_nat_pool) + + if not fixed_ip: LOG.error("VRF %s gateway interface has no IP that is not part of dynamic NAT pool %s, " "not configuring primary IP", self.vrf, self.dynamic_nat_pool) return None - - self._primary_subnet_id = n_fixed_ip.get('subnet_id') - - return VBIPrimaryIpAddress(address=n_fixed_ip['ip_address'], - mask=utils.to_netmask(n_fixed_ip.get('prefixlen'))) + + self._primary_subnet_id = fixed_ip.get('subnet_id') + return VBIPrimaryIpAddress(address=fixed_ip['ip_address'], + mask=utils.to_netmask(fixed_ip.get('prefixlen'))) def _nat_address(self): ips = self.router_port.get('fixed_ips') diff --git a/asr1k_neutron_l3/models/neutron/l3/router.py b/asr1k_neutron_l3/models/neutron/l3/router.py index 603f0296..d731a59b 100644 --- a/asr1k_neutron_l3/models/neutron/l3/router.py +++ b/asr1k_neutron_l3/models/neutron/l3/router.py @@ -168,7 +168,7 @@ def _build_routes(self): primary_route = self._primary_route() if not primary_overridden and primary_route is not None and self._route_has_connected_interface(primary_route): routes.append(primary_route) - + return routes def _build_nat_acl(self): diff --git a/asr1k_neutron_l3/plugins/db/asr1k_db.py b/asr1k_neutron_l3/plugins/db/asr1k_db.py index 02b5779a..06eb5eef 100644 --- a/asr1k_neutron_l3/plugins/db/asr1k_db.py +++ b/asr1k_neutron_l3/plugins/db/asr1k_db.py @@ -560,6 +560,25 @@ def get_floating_ips_with_router_macs(self, context, fips=None, router_id=None): query = query.filter(l3_models.Router.id == router_id) return {e.floating_ip_address: e.mac_address for e in query} + + def preempt_external_ip_cleanup(self, context, router_id, ip, host): + # The current firmware mal-allocates an external interfaces NAT pool (the pointer to the hash-table, I presume) + # if there is an external interface with the same IP of another router on the same agent/host. + # Never the less, this could still happen in reality as multiple address scopes could lead to this being + # a completely legal condition, even though for now it is unlikely to be seen in out environment. + # This function will cover for this condition, it will indicate if such a case is prevalent for the IP given. + query = context.session.query(models_v2.Port) + query = query.join(l3agent_models.RouterL3AgentBinding, + l3agent_models.RouterL3AgentBinding.router_id == models_v2.Port.device_id) + query = query.join(agent_model.Agent, + l3agent_models.RouterL3AgentBinding.l3_agent_id == agent_model.Agent.id) + query = query.filter(agent_model.Agent.host == host) + query = query.join(models_v2.IPAllocation, + models_v2.IPAllocation.port_id == models_v2.Port.id) + query = query.filter(models_v2.Port.device_owner == n_constants.DEVICE_OWNER_ROUTER_GW) + query = query.filter(models_v2.IPAllocation.ip_address == ip) + query = query.filter(l3agent_models.RouterL3AgentBinding.router_id != router_id) + return bool(query.count()) def ensure_router_atts(self, context, router_id): with db_api.CONTEXT_WRITER.using(context): diff --git a/asr1k_neutron_l3/plugins/l3/agents/asr1k_l3_agent.py b/asr1k_neutron_l3/plugins/l3/agents/asr1k_l3_agent.py index e4107de5..f0408c31 100644 --- a/asr1k_neutron_l3/plugins/l3/agents/asr1k_l3_agent.py +++ b/asr1k_neutron_l3/plugins/l3/agents/asr1k_l3_agent.py @@ -264,7 +264,6 @@ def get_floating_ips_with_router_macs(self, context, fips=None, router_id=None): cctxt = self.client.prepare() return cctxt.call(context, 'get_floating_ips_with_router_macs', fips=fips, router_id=router_id) - class L3ASRAgent(manager.Manager, operations.OperationsMixin, DeviceCleanerMixin): """Manager for L3 ASR Agent diff --git a/asr1k_neutron_l3/plugins/l3/service_plugins/l3_extension_adapter.py b/asr1k_neutron_l3/plugins/l3/service_plugins/l3_extension_adapter.py index d18dbf57..dab706f6 100644 --- a/asr1k_neutron_l3/plugins/l3/service_plugins/l3_extension_adapter.py +++ b/asr1k_neutron_l3/plugins/l3/service_plugins/l3_extension_adapter.py @@ -289,6 +289,16 @@ def get_sync_data(self, context, router_ids=None, active=None, host=None): router["rt_export"] = list(set(rt_export)) router["rt_import"] = list(set(rt_import)) + dynamic_nat_pool = router_att.get('dynamic_nat_pool') + gw_port = router.get('gw_port') + if dynamic_nat_pool: + fixed_ip = utils.determine_external_interface_ip_with_nat_pool(gw_port, dynamic_nat_pool) + elif len(gw_port.get('fixed_ips', [])) > 1: + LOG.error("Router %s has more than one external fixed IP but no NAT pool" % router['id']) + else: + fixed_ip = gw_port.get('fixed_ips')[0].get('ip_address') + router['preempt_external_ip_cleanup'] = self.db.preempt_external_ip_cleanup(context, router['id'], fixed_ip,host) + return routers def get_deleted_router_atts(self, context):