Skip to content

Commit

Permalink
Fix DataFrameDiskCache locking
Browse files Browse the repository at this point in the history
Do not throw if `sbomnix` is executed concurrently.

Signed-off-by: Henri Rosten <[email protected]>
  • Loading branch information
henrirosten committed Oct 7, 2024
1 parent 102f197 commit 56dcd1a
Show file tree
Hide file tree
Showing 6 changed files with 93 additions and 50 deletions.
2 changes: 2 additions & 0 deletions nix/packages.nix
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,7 @@
++ (with pp; [
beautifulsoup4
colorlog
filelock
graphviz
numpy
packageurl-python
Expand Down Expand Up @@ -226,6 +227,7 @@
(with ps; [
beautifulsoup4
colorlog
filelock
graphviz
numpy
packageurl-python
Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ def project_path(*names):
requires = [
"beautifulsoup4",
"colorlog",
"filelock",
"graphviz",
"numpy",
"pandas",
Expand Down
5 changes: 0 additions & 5 deletions src/common/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,6 @@
import logging
import subprocess
import importlib.metadata
import pathlib
import tempfile
import urllib.error
from shutil import which

Expand All @@ -33,9 +31,6 @@
LOG_SPAM = logging.DEBUG - 1
LOG = logging.getLogger(os.path.abspath(__file__))

# DataFrameDiskCache cache local path
DFCACHE_PATH = pathlib.Path(tempfile.gettempdir()) / "sbomnix_df_cache"

###############################################################################


Expand Down
20 changes: 3 additions & 17 deletions src/sbomnix/cpe.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,10 @@

import sys
import string
import time
from sqlite3 import OperationalError
from dfdiskcache import DataFrameDiskCache
from sbomnix.dfcache import LockedDfCache
from common.utils import (
LOG,
LOG_SPAM,
DFCACHE_PATH,
df_from_csv_file,
df_log,
)
Expand All @@ -33,7 +30,7 @@ class CPE:
"""Generate Common Platform Enumeration identifiers"""

def __init__(self):
self.cache = DataFrameDiskCache(cache_dir_path=DFCACHE_PATH)
self.cache = LockedDfCache()
self.df_cpedict = self.cache.get(_CPE_CSV_URL)
if self.df_cpedict is not None and not self.df_cpedict.empty:
LOG.debug("read CPE dictionary from cache")
Expand All @@ -45,18 +42,7 @@ def __init__(self):
"Failed downloading cpedict: CPE information might not be accurate"
)
else:
waiting = True
while waiting:
try:
self.cache.set(
_CPE_CSV_URL, self.df_cpedict, ttl=_CPE_CSV_CACHE_TTL
)
waiting = False
except OperationalError:
LOG.warning(
"CPE Sqlite database is locked! Retrying in 1 second"
)
time.sleep(1)
self.cache.set(_CPE_CSV_URL, self.df_cpedict, ttl=_CPE_CSV_CACHE_TTL)

if self.df_cpedict is not None:
# Verify the loaded cpedict contains at least the following columns
Expand Down
50 changes: 50 additions & 0 deletions src/sbomnix/dfcache.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# SPDX-FileCopyrightText: 2022-2024 Technology Innovation Institute (TII)
#
# SPDX-License-Identifier: Apache-2.0

# pylint: disable=too-few-public-methods

"""Thread-safe DataFrameDiskCache"""

import pathlib
import tempfile
from getpass import getuser

from filelock import FileLock
from dfdiskcache import DataFrameDiskCache

###############################################################################

# DataFrameDiskCache cache local path and lock file
DFCACHE_PATH = pathlib.Path(tempfile.gettempdir()) / f"{getuser()}_sbomnix_df_cache"
DFCACHE_LOCK = DFCACHE_PATH / "dfcache.lock"

################################################################################


class LockedDfCache:
    """Thread-safe (and process-safe) wrapper for DataFrameDiskCache.

    Looking up any non-dunder attribute on this object returns a callable.
    When that callable is invoked it acquires the ``DFCACHE_LOCK`` file lock,
    creates a fresh ``DataFrameDiskCache`` on ``DFCACHE_PATH`` and forwards
    the call (positional and keyword arguments unchanged) to the same-named
    method of that cache, returning its result.
    """

    def __init__(self):
        # The lock file lives inside the cache directory, so make sure the
        # directory exists before the lock is acquired for the first time.
        # NOTE(review): recent filelock releases create missing parent
        # directories themselves, older ones presumably do not; creating
        # it here is harmless in both cases.
        DFCACHE_PATH.mkdir(parents=True, exist_ok=True)
        self.dflock = FileLock(DFCACHE_LOCK)

    def __getattr__(self, name):
        """Return a lock-protected proxy for the cache method ``name``.

        ``__getattr__`` is only consulted for attributes that normal lookup
        did not find. Dunder names are never proxied: helpers such as
        ``copy.deepcopy`` or ``pickle`` probe for optional hooks (for example
        ``__deepcopy__`` or ``__setstate__``) with ``getattr`` and must see
        an ``AttributeError`` instead of a callable that would forward the
        probe to the cache.

        Raises:
            AttributeError: if ``name`` is a dunder name, or (when the
                returned proxy is called) if the cache has no such method.
        """
        if name.startswith("__") and name.endswith("__"):
            raise AttributeError(
                f"{type(self).__name__!r} object has no attribute {name!r}"
            )

        def wrap(*a, **k):
            with self.dflock:
                # We intentionally do not store the dfcache as object variable
                # but re-instantiate it every time any LockedDfCache method
                # is called. DataFrameDiskCache internally makes use of sqlite
                # which does not allow concurrent connections to the database.
                # Having the dfcache initiated once in __init__() and then
                # re-used here would mean the connection would remain reserved
                # for the first thread making other threads throw with
                # 'database locked' etc. even if we otherwise protect
                # concurrent writes.
                dfcache = DataFrameDiskCache(cache_dir_path=DFCACHE_PATH)
                return getattr(dfcache, name)(*a, **k)

        return wrap


###############################################################################
65 changes: 37 additions & 28 deletions src/sbomnix/meta.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,18 @@

"""Cache nixpkgs meta information"""

import time
import os
import re
import logging
import pathlib
import tempfile
from getpass import getuser

from sqlite3 import OperationalError
import pandas as pd
from dfdiskcache import DataFrameDiskCache
from filelock import FileLock
from sbomnix.dfcache import LockedDfCache
from nixmeta.scanner import NixMetaScanner, nixref_to_nixpkgs_path
from common.utils import LOG, df_from_csv_file, df_to_csv_file, DFCACHE_PATH
from common.utils import LOG, df_from_csv_file, df_to_csv_file

###############################################################################

Expand All @@ -28,22 +30,18 @@
# is cleaned.
_NIXMETA_NIXPKGS_TTL = 60 * 60 * 24 * 30

# FileLock lock path
_FLOCK = pathlib.Path(tempfile.gettempdir()) / f"{getuser()}_sbomnix_meta.lock"

###############################################################################


class Meta:
"""Cache nixpkgs meta information"""

def __init__(self):
waiting = True
while waiting:
try:
self.cache = DataFrameDiskCache(cache_dir_path=DFCACHE_PATH)
waiting = False
except OperationalError:
LOG.warning("DFCACHE Sqlite database is locked! Retrying in 1 second")
time.sleep(1)

self.lock = FileLock(_FLOCK)
self.cache = LockedDfCache()
# df_nixmeta includes the meta-info from _NIXMETA_CSV_URL
self.df_nixmeta = self.cache.get(_NIXMETA_CSV_URL)
if self.df_nixmeta is not None and not self.df_nixmeta.empty:
Expand Down Expand Up @@ -103,22 +101,33 @@ def get_nixpkgs_meta(self, nixref=None):
return df_concat

def _scan(self, nixpkgs_path):
df = self.cache.get(nixpkgs_path)
if df is not None and not df.empty:
LOG.debug("found from cache: %s", nixpkgs_path)
# In case sbomnix is run concurrently, we want to make sure there's
# only one instance of NixMetaScanner.scan() running at a time.
# The reason is, NixMetaScanner.scan() potentially invokes
# `nix-env -qa --meta --json -f /path/to/nixpkgs` which is very
# memory intensive. The locking needs to happen here (and not in
# NixMetaScanner) because this object caches the nixmeta info.
# The first scan generates the cache. Subsequent scans read the scan
# results from the cache instead of running the nix-env command again,
# which makes them relatively fast and light-weight.
with self.lock:
df = self.cache.get(nixpkgs_path)
if df is not None and not df.empty:
LOG.debug("found from cache: %s", nixpkgs_path)
return df
LOG.debug("cache miss, scanning: %s", nixpkgs_path)
scanner = NixMetaScanner()
scanner.scan(nixpkgs_path)
df = scanner.to_df()
if df is None or df.empty:
LOG.warning("Failed scanning nixmeta: %s", nixpkgs_path)
return None
# Cache requires some TTL, so we set it to some value here.
# Although, we could as well store it indefinitely as it should
# not change given the same key (nixpkgs store path).
self.cache.set(key=nixpkgs_path, value=df, ttl=_NIXMETA_NIXPKGS_TTL)
return df
LOG.debug("cache miss, scanning: %s", nixpkgs_path)
scanner = NixMetaScanner()
scanner.scan(nixpkgs_path)
df = scanner.to_df()
if df is None or df.empty:
LOG.warning("Failed scanning nixmeta: %s", nixpkgs_path)
return None
# Cache requires some TTL, so we set it to some value here.
# Although, we could as well store it indefinitely as it should
# not change given the same key (nixpkgs store path).
self.cache.set(key=nixpkgs_path, value=df, ttl=_NIXMETA_NIXPKGS_TTL)
return df


###############################################################################

0 comments on commit 56dcd1a

Please sign in to comment.