From 56dcd1a8778a4b63fb18be27685a2bc2ceab37f6 Mon Sep 17 00:00:00 2001 From: Henri Rosten Date: Tue, 1 Oct 2024 08:30:23 +0300 Subject: [PATCH] Fix DataFrameDiskCache locking Do not throw if `sbomnix` is executed concurrently. Signed-off-by: Henri Rosten --- nix/packages.nix | 2 ++ setup.py | 1 + src/common/utils.py | 5 ---- src/sbomnix/cpe.py | 20 ++----------- src/sbomnix/dfcache.py | 50 ++++++++++++++++++++++++++++++++ src/sbomnix/meta.py | 65 ++++++++++++++++++++++++------------------ 6 files changed, 93 insertions(+), 50 deletions(-) create mode 100644 src/sbomnix/dfcache.py diff --git a/nix/packages.nix b/nix/packages.nix index c53538b..5cddc15 100644 --- a/nix/packages.nix +++ b/nix/packages.nix @@ -198,6 +198,7 @@ ++ (with pp; [ beautifulsoup4 colorlog + filelock graphviz numpy packageurl-python @@ -226,6 +227,7 @@ (with ps; [ beautifulsoup4 colorlog + filelock graphviz numpy packageurl-python diff --git a/setup.py b/setup.py index 5fb4634..8670519 100644 --- a/setup.py +++ b/setup.py @@ -25,6 +25,7 @@ def project_path(*names): requires = [ "beautifulsoup4", "colorlog", + "filelock", "graphviz", "numpy", "pandas", diff --git a/src/common/utils.py b/src/common/utils.py index cfa9189..48b1b42 100644 --- a/src/common/utils.py +++ b/src/common/utils.py @@ -14,8 +14,6 @@ import logging import subprocess import importlib.metadata -import pathlib -import tempfile import urllib.error from shutil import which @@ -33,9 +31,6 @@ LOG_SPAM = logging.DEBUG - 1 LOG = logging.getLogger(os.path.abspath(__file__)) -# DataFrameDiskCache cache local path -DFCACHE_PATH = pathlib.Path(tempfile.gettempdir()) / "sbomnix_df_cache" - ############################################################################### diff --git a/src/sbomnix/cpe.py b/src/sbomnix/cpe.py index 082640b..579d2bc 100644 --- a/src/sbomnix/cpe.py +++ b/src/sbomnix/cpe.py @@ -8,13 +8,10 @@ import sys import string -import time -from sqlite3 import OperationalError -from dfdiskcache import DataFrameDiskCache +from sbomnix.dfcache import LockedDfCache from common.utils import ( LOG, LOG_SPAM, - DFCACHE_PATH, df_from_csv_file, df_log, ) @@ -33,7 +30,7 @@ class CPE: """Generate Common Platform Enumeration identifiers""" def __init__(self): - self.cache = DataFrameDiskCache(cache_dir_path=DFCACHE_PATH) + self.cache = LockedDfCache() self.df_cpedict = self.cache.get(_CPE_CSV_URL) if self.df_cpedict is not None and not self.df_cpedict.empty: LOG.debug("read CPE dictionary from cache") @@ -45,18 +42,7 @@ def __init__(self): "Failed downloading cpedict: CPE information might not be accurate" ) else: - waiting = True - while waiting: - try: - self.cache.set( - _CPE_CSV_URL, self.df_cpedict, ttl=_CPE_CSV_CACHE_TTL - ) - waiting = False - except OperationalError: - LOG.warning( - "CPE Sqlite database is locked! Retrying in 1 second" - ) - time.sleep(1) + self.cache.set(_CPE_CSV_URL, self.df_cpedict, ttl=_CPE_CSV_CACHE_TTL) if self.df_cpedict is not None: # Verify the loaded cpedict contains at least the following columns diff --git a/src/sbomnix/dfcache.py b/src/sbomnix/dfcache.py new file mode 100644 index 0000000..a653c5a --- /dev/null +++ b/src/sbomnix/dfcache.py @@ -0,0 +1,50 @@ +# SPDX-FileCopyrightText: 2022-2024 Technology Innovation Institute (TII) +# +# SPDX-License-Identifier: Apache-2.0 + +# pylint: disable=too-few-public-methods + +"""Thread-safe DataFrameDiskCache""" + +import pathlib +import tempfile +from getpass import getuser + +from filelock import FileLock +from dfdiskcache import DataFrameDiskCache + +############################################################################### + +# DataFrameDiskCache cache local path and lock file +DFCACHE_PATH = pathlib.Path(tempfile.gettempdir()) / f"{getuser()}_sbomnix_df_cache" +DFCACHE_LOCK = DFCACHE_PATH / "dfcache.lock" + +################################################################################ + + +class LockedDfCache: + """Thread-safe (and process-safe) wrapper for DataFrameDiskCache""" + + def __init__(self): + self.dflock = FileLock(DFCACHE_LOCK) + + def __getattr__(self, name): + + def wrap(*a, **k): + with self.dflock: + # We intentionally do not store the dfcache as object variable + # but re-instantiate it every time any LockedDfCache method + # is called. DataFrameDiskCache internally makes use of sqlite + # which does not allow concurrent connections to the database. + # Having the dfcache initiated once in __init__() and then + # re-used here would mean the connection would remain reserved + # for the first thread making other threads throw with + # 'database locked' etc. even if we otherwise protect + # concurrent writes. + dfcache = DataFrameDiskCache(cache_dir_path=DFCACHE_PATH) + return getattr(dfcache, name)(*a, **k) + + return wrap + + +############################################################################### diff --git a/src/sbomnix/meta.py b/src/sbomnix/meta.py index bd47fc4..850c1c0 100644 --- a/src/sbomnix/meta.py +++ b/src/sbomnix/meta.py @@ -6,16 +6,18 @@ """Cache nixpkgs meta information""" -import time import os import re import logging +import pathlib +import tempfile +from getpass import getuser -from sqlite3 import OperationalError import pandas as pd -from dfdiskcache import DataFrameDiskCache +from filelock import FileLock +from sbomnix.dfcache import LockedDfCache from nixmeta.scanner import NixMetaScanner, nixref_to_nixpkgs_path -from common.utils import LOG, df_from_csv_file, df_to_csv_file, DFCACHE_PATH +from common.utils import LOG, df_from_csv_file, df_to_csv_file ############################################################################### @@ -28,6 +30,9 @@ # is cleaned. _NIXMETA_NIXPKGS_TTL = 60 * 60 * 24 * 30 +# FileLock lock path +_FLOCK = pathlib.Path(tempfile.gettempdir()) / f"{getuser()}_sbomnix_meta.lock" + ############################################################################### @@ -35,15 +40,8 @@ class Meta: """Cache nixpkgs meta information""" def __init__(self): - waiting = True - while waiting: - try: - self.cache = DataFrameDiskCache(cache_dir_path=DFCACHE_PATH) - waiting = False - except OperationalError: - LOG.warning("DFCACHE Sqlite database is locked! Retrying in 1 second") - time.sleep(1) - + self.lock = FileLock(_FLOCK) + self.cache = LockedDfCache() # df_nixmeta includes the meta-info from _NIXMETA_CSV_URL self.df_nixmeta = self.cache.get(_NIXMETA_CSV_URL) if self.df_nixmeta is not None and not self.df_nixmeta.empty: @@ -103,22 +101,33 @@ def get_nixpkgs_meta(self, nixref=None): return df_concat def _scan(self, nixpkgs_path): - df = self.cache.get(nixpkgs_path) - if df is not None and not df.empty: - LOG.debug("found from cache: %s", nixpkgs_path) + # In case sbomnix is run concurrently, we want to make sure there's + # only one instance of NixMetaScanner.scan() running at a time. + # The reason is, NixMetaScanner.scan() potentially invokes + # `nix-env -qa --meta --json -f /path/to/nixpkgs` which is very + # memory intensive. The locking needs to happen here (and not in + # NixMetaScanner) because this object caches the nixmeta info. + # First scan generates the cache, after which the consecutive scans + # will read the scan results from the cache, not having to run + # the nix-env command again, making the consecutive scans relatively + # fast and light-weight. + with self.lock: + df = self.cache.get(nixpkgs_path) + if df is not None and not df.empty: + LOG.debug("found from cache: %s", nixpkgs_path) + return df + LOG.debug("cache miss, scanning: %s", nixpkgs_path) + scanner = NixMetaScanner() + scanner.scan(nixpkgs_path) + df = scanner.to_df() + if df is None or df.empty: + LOG.warning("Failed scanning nixmeta: %s", nixpkgs_path) + return None + # Cache requires some TTL, so we set it to some value here. + # Although, we could as well store it indefinitely as it should + # not change given the same key (nixpkgs store path). + self.cache.set(key=nixpkgs_path, value=df, ttl=_NIXMETA_NIXPKGS_TTL) return df - LOG.debug("cache miss, scanning: %s", nixpkgs_path) - scanner = NixMetaScanner() - scanner.scan(nixpkgs_path) - df = scanner.to_df() - if df is None or df.empty: - LOG.warning("Failed scanning nixmeta: %s", nixpkgs_path) - return None - # Cache requires some TTL, so we set it to some value here. - # Although, we could as well store it indefinitely as it should - # not change given the same key (nixpkgs store path). - self.cache.set(key=nixpkgs_path, value=df, ttl=_NIXMETA_NIXPKGS_TTL) - return df ###############################################################################