feat(workflows): Add Required Workflows for Build, Code Quality and Compatibility

Signed-off-by: Kaushlendra Pratap <[email protected]>
fossology · Jan 23, 2025 · a069b2e · a069b2e
1 parent 70d0624
commit a069b2e
Show file tree

Hide file tree

Showing 7 changed files with 169 additions and 28 deletions.
diff --git a/.github/workflows/build-test.yml b/.github/workflows/build-test.yml
@@ -0,0 +1,39 @@
+# SPDX-FileCopyrightText: © 2021 Siemens AG
+# SPDX-FileCopyrightText: © Kaushlendra Pratap Singh <[email protected]>
+#
+#SPDX-License-Identifier: LGPL-2.1-only
+
+# Checks on every pull request against main that the package under ./Safaa
+# can still be built as a source distribution and a wheel.
+name: Build Tests
+
+on:
+  pull_request:
+    # The space after the colon is required: `branches:[main]` is read as a
+    # single plain scalar instead of a `branches` key holding a list.
+    branches: [main]
+
+jobs:
+  build:
+    name: Build Test
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.10'
+
+      - name: Install build tools
+        # setuptools and wheel back the setup.py commands run in the next step.
+        run: |
+          python -m pip install --upgrade pip
+          pip install setuptools wheel
+
+      - name: Build package
+        run: |
+          python setup.py sdist bdist_wheel
+        working-directory: ./Safaa
diff --git a/.github/workflows/code-quality.yml b/.github/workflows/code-quality.yml
@@ -0,0 +1,40 @@
+# SPDX-FileCopyrightText: © 2021 Siemens AG
+# SPDX-FileCopyrightText: © Kaushlendra Pratap Singh <[email protected]>
+#
+#SPDX-License-Identifier: LGPL-2.1-only
+
+# Lints the Python sources under ./Safaa with flake8 on pushes to main and
+# on pull requests that target main.
+name: Code Quality
+
+on:
+  push:
+    branches: [main]
+  pull_request:
+    branches: [main]
+
+jobs:
+  lint:
+    name: Run flake8
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.8'
+          architecture: 'x64'
+
+      - name: Install flake8
+        run: |
+          python -m pip install --upgrade pip
+          pip install flake8
+
+      - name: Run flake8
+        # Lines of up to 120 characters are accepted instead of flake8's default.
+        run: |
+          flake8 --max-line-length=120 .
+        working-directory: ./Safaa
diff --git a/.github/workflows/reuse-lint.yml b/.github/workflows/reuse-lint.yml
@@ -11,9 +11,9 @@ concurrency:
 
 on:
   push:
-    branches: [master]
+    branches: [main]
   pull_request:
-    branches: [master]
+    branches: [main]
   workflow_dispatch:
 
 jobs:

diff --git a/.github/workflows/version-compatibility.yml b/.github/workflows/version-compatibility.yml
@@ -0,0 +1,46 @@
+# SPDX-FileCopyrightText: © 2021 Siemens AG
+# SPDX-FileCopyrightText: © Kaushlendra Pratap Singh <[email protected]>
+#
+#SPDX-License-Identifier: LGPL-2.1-only
+
+# Installs the package under ./Safaa on each Python version in the matrix to
+# catch interpreter-specific installation failures.
+name: Compatibility Tests
+
+on:
+  push:
+    branches: [main]
+  pull_request:
+    branches: [main]
+
+jobs:
+  test:
+    name: Test on Python ${{ matrix.python-version }}
+    runs-on: ubuntu-latest
+
+    strategy:
+      fail-fast: false
+      matrix:
+        ## Allowed Python versions https://github.com/actions/runner-images/blob/main/images/ubuntu/Ubuntu2404-Readme.md#python
+        python-version: ["3.9", "3.10", "3.11", "3.12"]
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+          # NOTE(review): setup-python keys the pip cache on a requirements.txt
+          # or pyproject.toml file; confirm the repository contains one.
+          cache: 'pip'  # caching pip dependencies
+
+      - name: Install dependencies
+        # pip builds the package in an isolated environment that supplies
+        # setuptools itself; `python setup.py install` is deprecated and needs
+        # setuptools to be preinstalled, which Python 3.12 does not guarantee.
+        run: |
+          python -m pip install --upgrade pip
+          python -m pip install .
+        working-directory: ./Safaa
diff --git a/Safaa/setup.py b/Safaa/setup.py
@@ -7,37 +7,44 @@
 
 here = path.dirname(path.abspath(path.dirname(__file__)))
 # fetch the long description from the README.md
-with open(path.join(here, 'README.md'), encoding='utf-8') as f:
+with open(path.join(here, "README.md"), encoding="utf-8") as f:
     long_description = f.read()
 
 setup(
-    name='safaa',
-    version='0.0.1',
-    url='https://github.com/fossology/safaa',
-    author='Abdelrahman Jamal',
-    author_email='[email protected]',
+    name="safaa",
+    version="0.0.1",
+    url="https://github.com/fossology/safaa",
+    author="Abdelrahman Jamal",
+    author_email="[email protected]",
     description="""Created as a part of the 2023 Google Summer of Code project:
      Reducing Fossology\'s False Positive Copyrights, the purpose is to be able
      to predict whether a given copyright output from the Fossology software
      is a false positive or not. It is also able to remove extra
      text from a copyright notice.""",
     long_description=long_description,
-    long_description_content_type='text/markdown',
-    packages=find_packages(where='src', ),
+    long_description_content_type="text/markdown",
+    packages=find_packages(
+        where="src",
+    ),
     package_dir={"": "src"},
     install_requires=[
-        'spacy>=3.0.0',
-        'joblib>=1.0.0',
-        'pandas>=1.1.0',
-        'scikit-learn>=1.3.0',
+        "spacy>=3.0.0",
+        "joblib>=1.0.0",
+        "pandas>=1.1.0",
+        "scikit-learn>=1.3.0",
     ],
     classifiers=[
-        'Programming Language :: Python :: 3',
-        'License :: OSI Approved :: GNU Lesser General Public License v2 (LGPLv2)',
+        "Programming Language :: Python :: 3",
+        "License :: OSI Approved :: GNU Lesser General Public License v2 (LGPLv2)",
     ],
     include_package_data=True,
     include_dirs=[],
-    package_data={'': ['src/safaa/models/*.pkl', 'src/safaa/models/*.',
-                       'src/safaa/configs/*']},
-    python_requires='>=3.6',
+    package_data={
+        "": [
+            "src/safaa/models/*.pkl",
+            "src/safaa/models/*.",
+            "src/safaa/configs/*",
+        ]
+    },
+    python_requires=">=3.8",  # scikit-learn>=1.3.0 requires Python 3.8+
 )
diff --git a/Safaa/src/safaa/Safaa.py b/Safaa/src/safaa/Safaa.py
@@ -47,7 +47,9 @@ def __init__(self, use_local_model=True, model_dir=None):
         self.vectorizer_path = os.path.join(
             model_dir, "false_positive_detection_vectorizer.pkl"
         )
-        self.entity_recognizer_path = os.path.join(model_dir, "entity_recognizer")
+        self.entity_recognizer_path = os.path.join(
+            model_dir, "entity_recognizer"
+        )
         self.declutter_model_path = os.path.join(model_dir, "declutter_model")
 
         # Load the models from the constructed file paths
@@ -172,14 +174,24 @@ def _perform_text_substitutions(self, data):
             (r"\(c\)", " COPYRIGHTSYMBOL "),
             (r"\(C\)", " COPYRIGHTSYMBOL "),
             (
-                r"""(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])""",
+                # Adjacent raw literals are concatenated at compile time, so
+                # wrapping adds no characters to the pattern; line breaks in
+                # one triple-quoted literal would have to match literally.
+                r"(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*"
+                r'|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@'
+                r"(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?"
+                r"|\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\.){3}"
+                r"(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:"
+                r"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])",
                 " EMAIL ",
             ),
             (r"[^a-zA-Z0-9]", " "),
         ]
         # Perform the substitutions for each pattern in the list
         for pattern, replacement in subs:
-            data = [re.sub(pattern, replacement, sentence) for sentence in data]
+            data = [
+                re.sub(pattern, replacement, sentence) for sentence in data
+            ]
         # Convert text to lowercase and strip extra whitespace
         return [sentence.lower().strip() for sentence in data]
 
@@ -209,7 +221,8 @@ def predict(self, data, threshold=0.5):
             # Classify based on the given threshold. If the threshhold is not
             # met, automatically sets the prediction to true
             return [
-                "f" if prediction[1] >= threshold else "t" for prediction in predictions
+                "f" if prediction[1] >= threshold else "t"
+                for prediction in predictions
             ]
 
         # Get binary predictions from the model if probability prediction is not
@@ -239,7 +252,9 @@ def declutter(self, data, predictions):
             (
                 ""
                 if prediction == "f"
-                else " ".join([ent.text for ent in self.declutter_model(sentence).ents])
+                else " ".join(
+                    [ent.text for ent in self.declutter_model(sentence).ents]
+                )
             )
             for sentence, prediction in zip(data, predictions)
         ]
@@ -297,20 +312,25 @@ def train_ner_model(
 
         # Determine the model directory paths
         tmp_model_path = os.path.join(LOCAL_MODEL_DIR, "tmp")
-        new_model_dir = "declutter_model" if declutter_model else "entity_recognizer"
+        new_model_dir = (
+            "declutter_model" if declutter_model else "entity_recognizer"
+        )
         new_model_path = os.path.join(LOCAL_MODEL_DIR, new_model_dir)
 
         # Create the new model directory if it doesn't exist
         os.makedirs(new_model_path, exist_ok=True)
 
         # Construct the training command and execute it
         train_command = (
-            f"python -m spacy train '{tmp_cfg_path}' " f"--output '{tmp_model_path}'"
+            f"python -m spacy train '{tmp_cfg_path}' "
+            f"--output '{tmp_model_path}'"
         )
         os.system(train_command)
 
         # Move the trained model files to the new model directory
-        self._move_files(os.path.join(tmp_model_path, "model-best"), new_model_path)
+        self._move_files(
+            os.path.join(tmp_model_path, "model-best"), new_model_path
+        )
 
         # Clean up the temporary files and directories
         os.remove(tmp_cfg_path)
@@ -355,7 +375,8 @@ def save(self, path=None):
         # Check directory permissions
         if not os.access(path, os.W_OK):
             print(
-                "Write permissions are not granted for the directory: " f"{save_path}"
+                "Write permissions are not granted for the directory: "
+                f"{save_path}"
             )
             return
 

diff --git a/Safaa/src/safaa/models/false_positive_detection_model_sgd.pkl.license b/Safaa/src/safaa/models/false_positive_detection_model_sgd.pkl.license
@@ -0,0 +1,4 @@
+# SPDX-FileCopyrightText: © 2021 Siemens AG
+# SPDX-FileCopyrightText: © Kaushlendra Pratap Singh <[email protected]>
+#
+#SPDX-License-Identifier: LGPL-2.1-only