Skip to content

Commit

Permalink
feat(workflows): Add Required Workflows for Build, Code Quality and C…
Browse files Browse the repository at this point in the history
…ompatibility

Signed-off-by: Kaushlendra Pratap <[email protected]>
  • Loading branch information
Kaushl2208 committed Jan 23, 2025
1 parent 70d0624 commit a069b2e
Show file tree
Hide file tree
Showing 7 changed files with 169 additions and 28 deletions.
34 changes: 34 additions & 0 deletions .github/workflows/build-test.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# SPDX-FileCopyrightText: © 2021 Siemens AG
# SPDX-FileCopyrightText: © Kaushlendra Pratap Singh <[email protected]>
#
# SPDX-License-Identifier: LGPL-2.1-only

# Checks on every pull request targeting main that the package under
# ./Safaa can be built into a source distribution and a wheel.
name: Build Tests

on:
  pull_request:
    # The space after the colon is required: without it YAML reads
    # "branches:[main]" as a single plain string instead of a key
    # holding a list, and the branch filter is not applied.
    branches: [main]

jobs:
  build:
    name: Build Test
    runs-on: ubuntu-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          # Quoted so YAML keeps the string "3.10" rather than the float 3.1.
          python-version: '3.10'

      # setup.py needs setuptools, and bdist_wheel needs wheel.
      - name: Install build tools
        run: |
          python -m pip install --upgrade pip
          pip install setuptools wheel

      - name: Build package
        run: |
          python setup.py sdist bdist_wheel
        working-directory: ./Safaa
36 changes: 36 additions & 0 deletions .github/workflows/code-quality.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# SPDX-FileCopyrightText: © 2021 Siemens AG
# SPDX-FileCopyrightText: © Kaushlendra Pratap Singh <[email protected]>
#
# SPDX-License-Identifier: LGPL-2.1-only
# Runs flake8 over the ./Safaa directory on pushes and pull requests
# that target main.
name: Code Quality

on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main

jobs:
  lint:
    name: Run flake8
    runs-on: ubuntu-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: "3.8"
          architecture: "x64"

      - name: Install flake8
        run: |
          python -m pip install --upgrade pip
          pip install flake8

      # Lines of up to 120 characters are accepted.
      - name: Run flake8
        working-directory: ./Safaa
        run: |
          flake8 --max-line-length=120 .
4 changes: 2 additions & 2 deletions .github/workflows/reuse-lint.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,9 @@ concurrency:

on:
push:
branches: [master]
branches: [main]
pull_request:
branches: [master]
branches: [main]
workflow_dispatch:

jobs:
Expand Down
39 changes: 39 additions & 0 deletions .github/workflows/version-compatibility.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# SPDX-FileCopyrightText: © 2021 Siemens AG
# SPDX-FileCopyrightText: © Kaushlendra Pratap Singh <[email protected]>
#
# SPDX-License-Identifier: LGPL-2.1-only

# Installs the package under ./Safaa on every Python version in the
# matrix, on pushes and pull requests that target main.
name: Compatibility Tests

on:
  push:
    branches: [main]
  pull_request:
    branches: [main]

jobs:
  test:
    name: Test on Python ${{ matrix.python-version }}
    runs-on: ubuntu-latest

    strategy:
      # Let every matrix leg run to completion so that one failing
      # version does not cancel the others.
      fail-fast: false
      matrix:
        ## Allowed Python versions https://github.com/actions/runner-images/blob/main/images/ubuntu/Ubuntu2404-Readme.md#python
        python-version: ["3.9", "3.10", "3.11", "3.12"]

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      # NOTE(review): the pip cache is keyed on a dependency file such as
      # requirements.txt or pyproject.toml; confirm the repository has one.
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
          cache: 'pip' # caching pip dependencies

      # Install through pip instead of invoking setup.py directly:
      # `setup.py install` is deprecated by setuptools and only works when
      # setuptools is already present in the interpreter, which this job
      # never installs.
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          python -m pip install .
        working-directory: ./Safaa
41 changes: 24 additions & 17 deletions Safaa/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,37 +7,44 @@

here = path.dirname(path.abspath(path.dirname(__file__)))
# fetch the long description from the README.md
with open(path.join(here, 'README.md'), encoding='utf-8') as f:
with open(path.join(here, "README.md"), encoding="utf-8") as f:
long_description = f.read()

setup(
name='safaa',
version='0.0.1',
url='https://github.com/fossology/safaa',
author='Abdelrahman Jamal',
author_email='[email protected]',
name="safaa",
version="0.0.1",
url="https://github.com/fossology/safaa",
author="Abdelrahman Jamal",
author_email="[email protected]",
description="""Created as a part of the 2023 Google Summer of Code project:
Reducing Fossology\'s False Positive Copyrights, the purpose is to be able
to predict whether a given copyright output from the Fossology software
is a false positive or not. It is also able to remove extra
text from a copyright notice.""",
long_description=long_description,
long_description_content_type='text/markdown',
packages=find_packages(where='src', ),
long_description_content_type="text/markdown",
packages=find_packages(
where="src",
),
package_dir={"": "src"},
install_requires=[
'spacy>=3.0.0',
'joblib>=1.0.0',
'pandas>=1.1.0',
'scikit-learn>=1.3.0',
"spacy>=3.0.0",
"joblib>=1.0.0",
"pandas>=1.1.0",
"scikit-learn>=1.3.0",
],
classifiers=[
'Programming Language :: Python :: 3',
'License :: OSI Approved :: GNU Lesser General Public License v2 (LGPLv2)',
"Programming Language :: Python :: 3",
"License :: OSI Approved :: GNU Lesser General Public License v2 (LGPLv2)",
],
include_package_data=True,
include_dirs=[],
package_data={'': ['src/safaa/models/*.pkl', 'src/safaa/models/*.',
'src/safaa/configs/*']},
python_requires='>=3.6',
package_data={
"": [
"src/safaa/models/*.pkl",
"src/safaa/models/*.",
"src/safaa/configs/*",
]
},
python_requires=">=3.6",
)
39 changes: 30 additions & 9 deletions Safaa/src/safaa/Safaa.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,9 @@ def __init__(self, use_local_model=True, model_dir=None):
self.vectorizer_path = os.path.join(
model_dir, "false_positive_detection_vectorizer.pkl"
)
self.entity_recognizer_path = os.path.join(model_dir, "entity_recognizer")
self.entity_recognizer_path = os.path.join(
model_dir, "entity_recognizer"
)
self.declutter_model_path = os.path.join(model_dir, "declutter_model")

# Load the models from the constructed file paths
Expand Down Expand Up @@ -172,14 +174,24 @@ def _perform_text_substitutions(self, data):
(r"\(c\)", " COPYRIGHTSYMBOL "),
(r"\(C\)", " COPYRIGHTSYMBOL "),
(
r"""(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])""",
r"""(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|"
(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|
\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@
(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9]
(?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:(2(5[0-5]|[0-4][0-9])|
1[0-9][0-9]|[1-9]?[0-9]))\.){3}(?:(2(5[0-5]|[0-4][0-9])|
1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(
[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|
\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)])""",
" EMAIL ",
),
(r"[^a-zA-Z0-9]", " "),
]
# Perform the substitutions for each pattern in the list
for pattern, replacement in subs:
data = [re.sub(pattern, replacement, sentence) for sentence in data]
data = [
re.sub(pattern, replacement, sentence) for sentence in data
]
# Convert text to lowercase and strip extra whitespace
return [sentence.lower().strip() for sentence in data]

Expand Down Expand Up @@ -209,7 +221,8 @@ def predict(self, data, threshold=0.5):
# Classify based on the given threshold. If the threshold is not
# met, automatically set the prediction to true
return [
"f" if prediction[1] >= threshold else "t" for prediction in predictions
"f" if prediction[1] >= threshold else "t"
for prediction in predictions
]

# Get binary predictions from the model if probability prediction is not
Expand Down Expand Up @@ -239,7 +252,9 @@ def declutter(self, data, predictions):
(
""
if prediction == "f"
else " ".join([ent.text for ent in self.declutter_model(sentence).ents])
else " ".join(
[ent.text for ent in self.declutter_model(sentence).ents]
)
)
for sentence, prediction in zip(data, predictions)
]
Expand Down Expand Up @@ -297,20 +312,25 @@ def train_ner_model(

# Determine the model directory paths
tmp_model_path = os.path.join(LOCAL_MODEL_DIR, "tmp")
new_model_dir = "declutter_model" if declutter_model else "entity_recognizer"
new_model_dir = (
"declutter_model" if declutter_model else "entity_recognizer"
)
new_model_path = os.path.join(LOCAL_MODEL_DIR, new_model_dir)

# Create the new model directory if it doesn't exist
os.makedirs(new_model_path, exist_ok=True)

# Construct the training command and execute it
train_command = (
f"python -m spacy train '{tmp_cfg_path}' " f"--output '{tmp_model_path}'"
f"python -m spacy train '{tmp_cfg_path}' "
f"--output '{tmp_model_path}'"
)
os.system(train_command)

# Move the trained model files to the new model directory
self._move_files(os.path.join(tmp_model_path, "model-best"), new_model_path)
self._move_files(
os.path.join(tmp_model_path, "model-best"), new_model_path
)

# Clean up the temporary files and directories
os.remove(tmp_cfg_path)
Expand Down Expand Up @@ -355,7 +375,8 @@ def save(self, path=None):
# Check directory permissions
if not os.access(path, os.W_OK):
print(
"Write permissions are not granted for the directory: " f"{save_path}"
"Write permissions are not granted for the directory: "
f"{save_path}"
)
return

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# SPDX-FileCopyrightText: © 2021 Siemens AG
# SPDX-FileCopyrightText: © Kaushlendra Pratap Singh <[email protected]>
#
# SPDX-License-Identifier: LGPL-2.1-only

0 comments on commit a069b2e

Please sign in to comment.