From 771b73bff11540c3bb66fdb35cc73fa6c1c7bd9b Mon Sep 17 00:00:00 2001 From: Daemon <109057945+Daethyra@users.noreply.github.com> Date: Mon, 14 Aug 2023 15:37:00 -0700 Subject: [PATCH 01/17] Updated set up files --- project/setup/orchestration/requirements.txt | 26 +----- project/setup/pyproject.toml | 85 ++------------------ 2 files changed, 10 insertions(+), 101 deletions(-) diff --git a/project/setup/orchestration/requirements.txt b/project/setup/orchestration/requirements.txt index 798db88..1dc3f15 100644 --- a/project/setup/orchestration/requirements.txt +++ b/project/setup/orchestration/requirements.txt @@ -1,27 +1,5 @@ -python-dotenv -threadpoolctl -numpy -joblib -scipy -scikit-learn==1.3.0 -pytz -python-stdnum -phonenumbers -tzdata -tqdm -python-dateutil -click -nltk -faker -dateparser -textblob -scrubadub -chardet +scikit-learn tensorflow -keras -pinecone-client pandas -openai -langchain PyPDF2 -xlrd \ No newline at end of file +nltk \ No newline at end of file diff --git a/project/setup/pyproject.toml b/project/setup/pyproject.toml index 200bc17..cfc9aae 100644 --- a/project/setup/pyproject.toml +++ b/project/setup/pyproject.toml @@ -9,83 +9,14 @@ description = "Description" authors = ["Daethyra dev-daethyra@protonmail.com"] [tool.poetry.dependencies] -python = "^3.8" -python-dotenv = "*" -threadpoolctl = "*" -numpy = "*" -joblib = "*" -scipy = "*" -scikit-learn = "1.3.0" -pytz = "*" -python-stdnum = "*" -phonenumbers = "*" -tzdata = "*" -tqdm = "*" -python-dateutil = "*" -click = "*" -nltk = "*" -faker = "*" -dateparser = "*" -textblob = "*" -scrubadub = "*" -chardet = "*" -tensorflow = "*" -keras = "*" -pinecone-client = "*" -pandas = "*" -openai = "*" -langchain = "*" -PyPDF2 = "*" -xlrd = "*" -beautifulsoup4 = ">=4.12.2" -colorama = "0.4.6" -distro = "1.8.0" -playsound = "1.2.2" -pyyaml = "6.0.1" -python-docx = "*" -markdown = "*" -pylatexenc = "*" -readability-lxml = "0.8.1" -requests = "*" -tiktoken = "0.4.0" -gTTS = "2.3.2" -docker = "*" -duckduckgo-search = "^3.8.4" -google-api-python-client = "*" -redis = "*" -orjson = "3.8.10" -Pillow = "*" -selenium = "4.11.2" -webdriver-manager = "*" -jsonschema = "*" -charset-normalizer = ">=3.1.0" -spacy = ">=3.0.0,<4.0.0" -prompt-toolkit = ">=3.0.38" -pydantic = "*" -inflection = "*" -fastapi = "*" -uvicorn = "*" -coverage = "*" -flake8 = "*" -pre-commit = "*" -black = "*" -isort = "*" -gitpython = "3.1.32" -mkdocs = "*" -pymdown-extensions = "*" -mypy = "*" -types-Markdown = "*" -types-beautifulsoup4 = "*" -types-colorama = "*" -types-Pillow = "*" -openapi-python-client = "0.15.0" -pytest = "*" -asynctest = "*" -pytest-asyncio = "*" -pytest-benchmark = "*" -pytest-cov = "*" -pytest-integration = "*" -pytest-mock = "*" +[tool.poetry.dependencies] +python = "^3.11.3" +scikit-learn = "^0.24.2" +tensorflow = "^2.6.0" +pandas = "^1.3.3" +python-dotenv = "^0.19.1" +PyPDF2 = "^2.3.1" +nltk = "^3.8.1" [tool.poetry.dev-dependencies] From eba9bf4d2b6c5a3d1aaa263767475ecc0933e23c Mon Sep 17 00:00:00 2001 From: Daemon <109057945+Daethyra@users.noreply.github.com> Date: Mon, 14 Aug 2023 18:24:11 -0700 Subject: [PATCH 02/17] modified: project/modules/CyberSentinel/.env.template new file: project/modules/CyberSentinel/preprocess/__init__.py modified: project/modules/CyberSentinel/preprocess/data_labeler.py modified: project/modules/CyberSentinel/preprocess/preprocessing.py --- project/modules/CyberSentinel/.env.template | 26 ++++++++++++++++++- .../CyberSentinel/preprocess/__init__.py | 0 .../CyberSentinel/preprocess/data_labeler.py | 26 ++++++++++++------- .../CyberSentinel/preprocess/preprocessing.py | 15 +++++++++-- 4 files changed, 55 insertions(+), 12 deletions(-) create mode 100644 project/modules/CyberSentinel/preprocess/__init__.py diff --git a/project/modules/CyberSentinel/.env.template b/project/modules/CyberSentinel/.env.template index 44f9c90..fe186ea 100644 --- a/project/modules/CyberSentinel/.env.template +++ b/project/modules/CyberSentinel/.env.template @@ -1,5 +1,29 @@ -TRAINING_DATA_PATH= +# Model Training Configuration +TRAINING_DATA_PATH=project/training-data/ LEARNING_RATE=0.001 BATCH_SIZE=32 EPOCHS=10 L2_REG=0.01 + +# Temporary file paths for DataLabeler +TEMP_PDF_FILE_PATH=temp_pdf_data.csv +TEMP_TXT_FILE_PATH=temp_txt_data.csv + +# Path to save labeled data +LABELED_DATA_FILE_PATH= + +# Preprocessor Configuration (Use '.' for current working directory) +INPUT_FILE_PATH=.. +PREPROCESSED_DATA_FILE_PATH= + + +TRAINING_DATA_PATH=/absolute/path/to/training_data.csv +LEARNING_RATE=0.001 +BATCH_SIZE=32 +EPOCHS=10 +L2_REG=0.01 +TEMP_PDF_FILE_PATH=/absolute/path/to/temp_pdf_data.csv +TEMP_TXT_FILE_PATH=/absolute/path/to/temp_txt_data.csv +LABELED_DATA_FILE_PATH=/absolute/path/to/labeled_data.csv +INPUT_FILE_PATH=/absolute/path/to/input_files +PREPROCESSED_DATA_FILE_PATH=/absolute/path/to/preprocessed_data diff --git a/project/modules/CyberSentinel/preprocess/__init__.py b/project/modules/CyberSentinel/preprocess/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/project/modules/CyberSentinel/preprocess/data_labeler.py b/project/modules/CyberSentinel/preprocess/data_labeler.py index c230deb..c4bae7e 100644 --- a/project/modules/CyberSentinel/preprocess/data_labeler.py +++ b/project/modules/CyberSentinel/preprocess/data_labeler.py @@ -1,11 +1,20 @@ +from dotenv import load_dotenv import os import csv from typing import List, Tuple +# Point to the location of the .env file relative to the script's location +env_path = os.path.join(os.path.dirname(__file__), '../../../.env') + +# Load the .env file +load_dotenv(dotenv_path=env_path) + class DataLabeler: - def __init__(self, temp_pdf_file_path: str = "temp_pdf_data.csv", temp_txt_file_path: str = "temp_txt_data.csv"): - self.temp_pdf_file_path = temp_pdf_file_path - self.temp_txt_file_path = temp_txt_file_path + def __init__(self): + default_temp_path = os.path.dirname(__file__) + self.temp_pdf_file_path = os.getenv('TEMP_PDF_FILE_PATH', os.path.join(default_temp_path, 'temp_pdf_data.csv')) + self.temp_txt_file_path = os.getenv('TEMP_TXT_FILE_PATH', os.path.join(default_temp_path, 'temp_txt_data.csv')) + self.output_file_path = os.getenv('LABELED_DATA_FILE_PATH') self.labeled_pdf_data = self.load_temp_data(self.temp_pdf_file_path) self.labeled_txt_data = self.load_temp_data(self.temp_txt_file_path) @@ -36,13 +45,13 @@ def load_temp_data(self, file_path: str) -> List[Tuple[str, bool]]: def label_data(self, data: List[str]) -> List[Tuple[str, bool]]: labeled_data = [] for text in data: - print("\nSample:") - print(text) + print(f"\\nSample:{text}") label = self.get_user_input("Does this text indicate the intention to commit acts of hate-based violence? (True/False): ") labeled_data.append((text, label)) return labeled_data - def save_labeled_data_to_csv(self, labeled_data: List[Tuple[str, bool]], file_path: str): + def save_labeled_data_to_csv(self, labeled_data: List[Tuple[str, bool]]): + file_path = self.output_file_path or input("Enter the path to save the labeled data: ") with open(file_path, 'w', newline='', encoding='utf-8') as file: writer = csv.writer(file) writer.writerow(['text', 'label']) @@ -52,12 +61,11 @@ def save_labeled_data_to_csv(self, labeled_data: List[Tuple[str, bool]], file_pa if __name__ == "__main__": data_labeler = DataLabeler() # Load the preprocessed data from the file saved by the Preprocessor - file_path = input("Enter the path to the preprocessed data file: ") + file_path = os.getenv('PREPROCESSED_DATA_FILE_PATH') or input("Enter the path to the preprocessed data file: ") with open(file_path, 'r', encoding='utf-8') as file: reader = csv.reader(file) next(reader) # Skip the header data = [row[0] for row in reader] labeled_data = data_labeler.label_data(data) - output_file_path = input("Enter the path to save the labeled data: ") - data_labeler.save_labeled_data_to_csv(labeled_data, output_file_path) + data_labeler.save_labeled_data_to_csv(labeled_data) diff --git a/project/modules/CyberSentinel/preprocess/preprocessing.py b/project/modules/CyberSentinel/preprocess/preprocessing.py index 8ddfe5b..3eba7bd 100644 --- a/project/modules/CyberSentinel/preprocess/preprocessing.py +++ b/project/modules/CyberSentinel/preprocess/preprocessing.py @@ -1,7 +1,9 @@ """ Defines functions for ingesting files, lemmatizes and removeing stop words, and tokenization. """ +from dotenv import load_dotenv import os import re +import csv from PyPDF2 import PdfFileReader from nltk.stem import WordNetLemmatizer from nltk.corpus import stopwords @@ -13,8 +15,17 @@ nltk.download('wordnet') nltk.download('stopwords') +# Point to the location of the .env file relative to the script's location +env_path = os.path.join(os.path.dirname(__file__), '../../../.env') + +# Load the .env file +load_dotenv(dotenv_path=env_path) + class Preprocessor: def __init__(self): + default_temp_path = os.path.dirname(__file__) + self.input_file_path = os.getenv('INPUT_FILE_PATH') + self.output_file_path = os.getenv('PREPROCESSED_DATA_FILE_PATH', f"processed_data_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv") self.lemmatizer = WordNetLemmatizer() self.stop_words = set(stopwords.words('english')) @@ -75,7 +86,7 @@ def load_data(self, file_path: str) -> List[str]: def save_processed_data(self, processed_data: List[str], file_type: str = "csv"): timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - file_path = f"processed_data_{timestamp}.{file_type}" + file_path = os.getenv('PREPROCESSED_DATA_FILE_PATH', f"processed_data_{timestamp}.{file_type}") with open(file_path, 'w', newline='', encoding='utf-8') as file: writer = csv.writer(file) writer.writerow(['text']) @@ -85,6 +96,6 @@ def save_processed_data(self, processed_data: List[str], file_type: str = "csv") if __name__ == "__main__": preprocessor = Preprocessor() - file_path = input("Enter the path to the file or '.' to process all PDF and TXT files in the current directory: ") + file_path = os.getenv('INPUT_FILE_PATH') or input("Enter the path to the file or '.' to process all PDF and TXT files in the current directory: ") processed_data = preprocessor.load_data(file_path) preprocessor.save_processed_data(processed_data) \ No newline at end of file From e9497582a0b9371525d25b934a769fb63d7d5962 Mon Sep 17 00:00:00 2001 From: Daemon <109057945+Daethyra@users.noreply.github.com> Date: Mon, 14 Aug 2023 18:30:38 -0700 Subject: [PATCH 03/17] modified: project/modules/CyberSentinel/.env.template --- project/modules/CyberSentinel/.env.template | 24 ++++++--------------- 1 file changed, 6 insertions(+), 18 deletions(-) diff --git a/project/modules/CyberSentinel/.env.template b/project/modules/CyberSentinel/.env.template index fe186ea..fd923b4 100644 --- a/project/modules/CyberSentinel/.env.template +++ b/project/modules/CyberSentinel/.env.template @@ -1,29 +1,17 @@ # Model Training Configuration -TRAINING_DATA_PATH=project/training-data/ +TRAINING_DATA_PATH=project/modules/CyberSentinel/training-data/ LEARNING_RATE=0.001 BATCH_SIZE=32 EPOCHS=10 L2_REG=0.01 +# Preprocessor Configuration (Use '.' for current working directory) +INPUT_FILE_PATH= +PREPROCESSED_DATA_FILE_PATH=project/modules/CyberSentinel/training-data/Processed-Data/ + # Temporary file paths for DataLabeler TEMP_PDF_FILE_PATH=temp_pdf_data.csv TEMP_TXT_FILE_PATH=temp_txt_data.csv # Path to save labeled data -LABELED_DATA_FILE_PATH= - -# Preprocessor Configuration (Use '.' for current working directory) -INPUT_FILE_PATH=.. -PREPROCESSED_DATA_FILE_PATH= - - -TRAINING_DATA_PATH=/absolute/path/to/training_data.csv -LEARNING_RATE=0.001 -BATCH_SIZE=32 -EPOCHS=10 -L2_REG=0.01 -TEMP_PDF_FILE_PATH=/absolute/path/to/temp_pdf_data.csv -TEMP_TXT_FILE_PATH=/absolute/path/to/temp_txt_data.csv -LABELED_DATA_FILE_PATH=/absolute/path/to/labeled_data.csv -INPUT_FILE_PATH=/absolute/path/to/input_files -PREPROCESSED_DATA_FILE_PATH=/absolute/path/to/preprocessed_data +LABELED_DATA_FILE_PATH=project/modules/CyberSentinel/preprocess/ \ No newline at end of file From b78c9e4a234a2a38daf5721e5e1cb8a6792b04ad Mon Sep 17 00:00:00 2001 From: Daemon <109057945+Daethyra@users.noreply.github.com> Date: Mon, 14 Aug 2023 21:52:38 -0700 Subject: [PATCH 04/17] modified: project/setup/orchestration/Dockerfile --- project/setup/orchestration/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/project/setup/orchestration/Dockerfile b/project/setup/orchestration/Dockerfile index 5928fbb..6a07a02 100644 --- a/project/setup/orchestration/Dockerfile +++ b/project/setup/orchestration/Dockerfile @@ -18,7 +18,7 @@ COPY requirements.txt /app/requirements.txt # Update pip and install any needed packages specified in requirements.txt RUN pip install --upgrade pip -RUN pip install --trusted-host pypi.python.org -r requirements.txt +RUN pip install --no-cache-dir -r requirements.txt # Second stage: Create the final image FROM python:3.11-slim From 7d0157c1640f142f51f298b9d3fd5cd0b1a709a6 Mon Sep 17 00:00:00 2001 From: Daemon <109057945+Daethyra@users.noreply.github.com> Date: Wed, 16 Aug 2023 21:29:42 -0700 Subject: [PATCH 05/17] modified: .github/dependabot.yml renamed: project/setup/orchestration/Dockerfile -> Dockerfile new file: docker-compose.yml renamed: documents/supplementary-docs/Bad-Words/ListOfDirtyWords.txt -> documents/supplementary-information/Bad-Words/ListOfDirtyWords.txt renamed: documents/supplementary-docs/Bad-Words/VIOLENT_TERRORIST_WORDS.txt -> documents/supplementary-information/Bad-Words/VIOLENT_TERRORIST_WORDS.txt renamed: documents/supplementary-docs/Bad-Words/bad_Words_list.txt -> documents/supplementary-information/Bad-Words/bad_Words_list.txt renamed: documents/supplementary-docs/Bad-Words/badwords.txt -> documents/supplementary-information/Bad-Words/badwords.txt renamed: documents/supplementary-docs/Bad-Words/citations.md -> documents/supplementary-information/Bad-Words/citations.md renamed: documents/supplementary-docs/Bad-Words/cmu-bad-words.txt -> documents/supplementary-information/Bad-Words/cmu-bad-words.txt renamed: documents/supplementary-docs/Conservative News Domains/CND_s1.txt -> documents/supplementary-information/Conservative News Domains/CND_s1.txt new file: documents/supplementary-information/explanation.txt modified: project/modules/CyberSentinel/preprocess/preprocessing.py deleted: project/modules/CyberSentinel/web-ui deleted: project/setup/orchestration/docker-compose.yml deleted: project/setup/orchestration/requirements.txt deleted: project/setup/setup.py deleted: project/tests/not_inprogress.txt renamed: project/setup/pyproject.toml -> pyproject.toml --- .github/dependabot.yml | 4 +- .../orchestration/Dockerfile => Dockerfile | 0 docker-compose.yml | 10 +++ .../Bad-Words/ListOfDirtyWords.txt | 0 .../Bad-Words/VIOLENT_TERRORIST_WORDS.txt | 0 .../Bad-Words/bad_Words_list.txt | 0 .../Bad-Words/badwords.txt | 0 .../Bad-Words/citations.md | 0 .../Bad-Words/cmu-bad-words.txt | 0 .../Conservative News Domains/CND_s1.txt | 0 .../supplementary-information/explanation.txt | 2 + .../CyberSentinel/preprocess/preprocessing.py | 80 ++++++++++++------- project/modules/CyberSentinel/web-ui | 1 - .../setup/orchestration/docker-compose.yml | 44 ---------- project/setup/orchestration/requirements.txt | 5 -- project/setup/setup.py | 5 -- project/tests/not_inprogress.txt | 1 - .../setup/pyproject.toml => pyproject.toml | 18 +++-- 18 files changed, 73 insertions(+), 97 deletions(-) rename project/setup/orchestration/Dockerfile => Dockerfile (100%) create mode 100644 docker-compose.yml rename documents/{supplementary-docs => supplementary-information}/Bad-Words/ListOfDirtyWords.txt (100%) rename documents/{supplementary-docs => supplementary-information}/Bad-Words/VIOLENT_TERRORIST_WORDS.txt (100%) rename documents/{supplementary-docs => supplementary-information}/Bad-Words/bad_Words_list.txt (100%) rename documents/{supplementary-docs => supplementary-information}/Bad-Words/badwords.txt (100%) rename documents/{supplementary-docs => supplementary-information}/Bad-Words/citations.md (100%) rename documents/{supplementary-docs => supplementary-information}/Bad-Words/cmu-bad-words.txt (100%) rename documents/{supplementary-docs => supplementary-information}/Conservative News Domains/CND_s1.txt (100%) create mode 100644 documents/supplementary-information/explanation.txt delete mode 160000 project/modules/CyberSentinel/web-ui delete mode 100644 project/setup/orchestration/docker-compose.yml delete mode 100644 project/setup/orchestration/requirements.txt delete mode 100644 project/setup/setup.py delete mode 100644 project/tests/not_inprogress.txt rename project/setup/pyproject.toml => pyproject.toml (52%) diff --git a/.github/dependabot.yml b/.github/dependabot.yml index ceba556..d831a2f 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -20,7 +20,7 @@ updates: # Check for npm updates at 9am UTC time: "09:00" timezone: "America/Los_Angeles" - target-branch: "v1.3.2" + target-branch: "v1.5" versioning-strategy: auto - package-ecosystem: "pip" @@ -30,7 +30,7 @@ updates: # Check for npm updates at 9am UTC time: "09:00" timezone: "America/Los_Angeles" - target-branch: "v1.3.3" + target-branch: "v1.5.1" versioning-strategy: auto diff --git a/project/setup/orchestration/Dockerfile b/Dockerfile similarity index 100% rename from project/setup/orchestration/Dockerfile rename to Dockerfile diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..60f5913 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,10 @@ +version: '3.9' + +services: + opendts: + build: + context: ./ + dockerfile: Dockerfile + target: builder + volumes: + - ./:/app \ No newline at end of file diff --git a/documents/supplementary-docs/Bad-Words/ListOfDirtyWords.txt b/documents/supplementary-information/Bad-Words/ListOfDirtyWords.txt similarity index 100% rename from documents/supplementary-docs/Bad-Words/ListOfDirtyWords.txt rename to documents/supplementary-information/Bad-Words/ListOfDirtyWords.txt diff --git a/documents/supplementary-docs/Bad-Words/VIOLENT_TERRORIST_WORDS.txt b/documents/supplementary-information/Bad-Words/VIOLENT_TERRORIST_WORDS.txt similarity index 100% rename from documents/supplementary-docs/Bad-Words/VIOLENT_TERRORIST_WORDS.txt rename to documents/supplementary-information/Bad-Words/VIOLENT_TERRORIST_WORDS.txt diff --git a/documents/supplementary-docs/Bad-Words/bad_Words_list.txt b/documents/supplementary-information/Bad-Words/bad_Words_list.txt similarity index 100% rename from documents/supplementary-docs/Bad-Words/bad_Words_list.txt rename to documents/supplementary-information/Bad-Words/bad_Words_list.txt diff --git a/documents/supplementary-docs/Bad-Words/badwords.txt b/documents/supplementary-information/Bad-Words/badwords.txt similarity index 100% rename from documents/supplementary-docs/Bad-Words/badwords.txt rename to documents/supplementary-information/Bad-Words/badwords.txt diff --git a/documents/supplementary-docs/Bad-Words/citations.md b/documents/supplementary-information/Bad-Words/citations.md similarity index 100% rename from documents/supplementary-docs/Bad-Words/citations.md rename to documents/supplementary-information/Bad-Words/citations.md diff --git a/documents/supplementary-docs/Bad-Words/cmu-bad-words.txt b/documents/supplementary-information/Bad-Words/cmu-bad-words.txt similarity index 100% rename from documents/supplementary-docs/Bad-Words/cmu-bad-words.txt rename to documents/supplementary-information/Bad-Words/cmu-bad-words.txt diff --git a/documents/supplementary-docs/Conservative News Domains/CND_s1.txt b/documents/supplementary-information/Conservative News Domains/CND_s1.txt similarity index 100% rename from documents/supplementary-docs/Conservative News Domains/CND_s1.txt rename to documents/supplementary-information/Conservative News Domains/CND_s1.txt diff --git a/documents/supplementary-information/explanation.txt b/documents/supplementary-information/explanation.txt new file mode 100644 index 0000000..1665214 --- /dev/null +++ b/documents/supplementary-information/explanation.txt @@ -0,0 +1,2 @@ +These subdirectories contain contextual information for the project. +- The AI may make use of everything inside 'supplementary-information/' \ No newline at end of file diff --git a/project/modules/CyberSentinel/preprocess/preprocessing.py b/project/modules/CyberSentinel/preprocess/preprocessing.py index 3eba7bd..9d4ac11 100644 --- a/project/modules/CyberSentinel/preprocess/preprocessing.py +++ b/project/modules/CyberSentinel/preprocess/preprocessing.py @@ -1,9 +1,11 @@ -""" Defines functions for ingesting files, lemmatizes and removeing stop words, and tokenization. """ +""" Defines functions for ingesting files, lemmatizes and removing stop words, and tokenization. """ from dotenv import load_dotenv import os import re import csv +import chardet +import logging from PyPDF2 import PdfFileReader from nltk.stem import WordNetLemmatizer from nltk.corpus import stopwords @@ -21,41 +23,48 @@ # Load the .env file load_dotenv(dotenv_path=env_path) +logging.basicConfig(filename='preprocessing_%Y%m%d_%H%M%S.log', level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') + class Preprocessor: def __init__(self): - default_temp_path = os.path.dirname(__file__) self.input_file_path = os.getenv('INPUT_FILE_PATH') self.output_file_path = os.getenv('PREPROCESSED_DATA_FILE_PATH', f"processed_data_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv") self.lemmatizer = WordNetLemmatizer() self.stop_words = set(stopwords.words('english')) - def read_pdf(self, file_path: str) -> str: - text = '' + def read_file(self, file_path: str, reader_func) -> str: + result = '' if file_path == '.': - pdf_files = [f for f in os.listdir() if os.path.isfile(f) and f.lower().endswith('.pdf')] - for pdf_file in pdf_files: - with open(pdf_file, 'rb') as file: - pdf_reader = PdfFileReader(file) - for page in range(pdf_reader.getNumPages()): - text += pdf_reader.getPage(page).extractText() + for root, _, files in os.walk(file_path): + for file_name in files: + full_path = os.path.join(root, file_name) + result += self.read_with_detected_encoding(full_path, reader_func) else: - with open(file_path, 'rb') as file: - pdf_reader = PdfFileReader(file) - for page in range(pdf_reader.getNumPages()): - text += pdf_reader.getPage(page).extractText() + result = self.read_with_detected_encoding(file_path, reader_func) + return result + + def read_with_detected_encoding(self, file_path: str, reader_func) -> str: + with open(file_path, 'rb') as file: + rawdata = file.read() + result = chardet.detect(rawdata) + encoding = result['encoding'] + file.seek(0) # Reset the file pointer to the beginning + try: + text = reader_func(file, encoding) + except Exception as e: + logging.warning(f"Failed to process {file_path} with encoding {encoding}: {e}") + text = '' return text - def read_txt(self, file_path: str) -> List[str]: - lines = [] - if file_path == '.': - txt_files = [f for f in os.listdir() if os.path.isfile(f) and f.lower().endswith('.txt')] - for txt_file in txt_files: - with open(txt_file, 'r', encoding='utf-8') as file: - lines += file.readlines() - else: - with open(file_path, 'r', encoding='utf-8') as file: - lines = file.readlines() - return [line.strip() for line in lines if line.strip()] + def read_pdf(self, file, encoding: str) -> str: + text = '' + pdf_reader = PdfFileReader(file) + for page in range(pdf_reader.getNumPages()): + text += pdf_reader.getPage(page).extractText() + return text + + def read_txt(self, file, encoding: str) -> List[str]: + return [line.strip() for line in file.read().decode(encoding).splitlines() if line.strip()] def preprocess_text_data(self, text: str) -> str: # Tokenization @@ -70,14 +79,23 @@ def preprocess_text_data(self, text: str) -> str: tokenized_text = " ".join(tokens) return tokenized_text - def preprocess_txt_implicit_hate_comments(self, comments: List[str]) -> List[str]: - return [re.split(r':', comment, maxsplit=2)[-1] for comment in comments] - def load_data(self, file_path: str) -> List[str]: + if not os.path.exists(file_path) and file_path != '.': + raise ValueError("File path does not exist.") + if file_path.lower().endswith('.pdf'): - raw_data = self.read_pdf(file_path) + raw_data = self.read_file(file_path, self.read_pdf) elif file_path.lower().endswith('.txt'): - raw_data = self.read_txt(file_path) + raw_data = self.read_file(file_path, self.read_txt) + elif os.path.isdir(file_path): + raw_data = '' + for root, _, files in os.walk(file_path): + for file_name in files: + full_path = os.path.join(root, file_name) + if full_path.lower().endswith('.pdf'): + raw_data += self.read_pdf(full_path) + elif full_path.lower().endswith('.txt'): + raw_data += self.read_txt(full_path) else: raise ValueError("Unsupported file format.") @@ -98,4 +116,4 @@ def save_processed_data(self, processed_data: List[str], file_type: str = "csv") preprocessor = Preprocessor() file_path = os.getenv('INPUT_FILE_PATH') or input("Enter the path to the file or '.' to process all PDF and TXT files in the current directory: ") processed_data = preprocessor.load_data(file_path) - preprocessor.save_processed_data(processed_data) \ No newline at end of file + preprocessor.save_processed_data(processed_data) diff --git a/project/modules/CyberSentinel/web-ui b/project/modules/CyberSentinel/web-ui deleted file mode 160000 index 793a768..0000000 --- a/project/modules/CyberSentinel/web-ui +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 793a768d5390f4af3146cb8fd5504c4d7b4e1511 diff --git a/project/setup/orchestration/docker-compose.yml b/project/setup/orchestration/docker-compose.yml deleted file mode 100644 index 9d5bedf..0000000 --- a/project/setup/orchestration/docker-compose.yml +++ /dev/null @@ -1,44 +0,0 @@ -#outdated as of 8-2-23 -version: '3.9' - -services: - opendts: - build: - context: ./ - dockerfile: Dockerfile - target: builder - volumes: - - ./:/app - - threader: - build: - context: ./project/modules/AutoThreader - dockerfile: Dockerfile - target: threader-${BUILD_TYPE} # Configurate your build type (-dev | -release) - args: - - BUILD_TYPE=release - env_file: - - .env # Ensure your variables have been set before running 'Docker-Compose' - environment: - - THREADS_USERNAME=${THREADS_USERNAME} - - THREADS_PASSWORD=${THREADS_PASSWORD} - - WEBHOOK_SECRET=${THREADS_WEBHOOK_SECRET} - volumes: - - ./project/modules/AutoThreader:/app - - ./project/modules/AutoThreader/docker-compose.yml:/app/docker-compose.yml:ro - - ./project/modules/AutoThreader/Dockerfile:/app/Dockerfile:ro - - auto-gpt: - build: - context: ./project/modules/AutoThreader - dockerfile: Dockerfile - target: auto-gpt-${BUILD_TYPE} # Configurate your build type (-dev | -release) - args: - - BUILD_TYPE=dev - env_file: - - .env - volumes: - - ./project/modules/AutoThreader:/app - - ./project/modules/AutoThreader/docker-compose.yml:/app/docker-compose.yml:ro - - ./project/modules/AutoThreader/Dockerfile:/app/Dockerfile:ro - profiles: ["exclude-from-up"] diff --git a/project/setup/orchestration/requirements.txt b/project/setup/orchestration/requirements.txt deleted file mode 100644 index 1dc3f15..0000000 --- a/project/setup/orchestration/requirements.txt +++ /dev/null @@ -1,5 +0,0 @@ -scikit-learn -tensorflow -pandas -PyPDF2 -nltk \ No newline at end of file diff --git a/project/setup/setup.py b/project/setup/setup.py deleted file mode 100644 index 120de9c..0000000 --- a/project/setup/setup.py +++ /dev/null @@ -1,5 +0,0 @@ -import os -from project.modules.autogpt import main -from project.modules.AutoThreader.threads_py import ThreadsAPI - -data = "../../documents/Database/CyberSentinel_Training-Data" \ No newline at end of file diff --git a/project/tests/not_inprogress.txt b/project/tests/not_inprogress.txt deleted file mode 100644 index eabb290..0000000 --- a/project/tests/not_inprogress.txt +++ /dev/null @@ -1 +0,0 @@ -nope. \ No newline at end of file diff --git a/project/setup/pyproject.toml b/pyproject.toml similarity index 52% rename from project/setup/pyproject.toml rename to pyproject.toml index cfc9aae..a0be77e 100644 --- a/project/setup/pyproject.toml +++ b/pyproject.toml @@ -4,20 +4,22 @@ build-backend = "poetry.core.masonry.api" [tool.poetry] name = "OpenDTS" -version = "1.3.2" -description = "Description" +version = "1.5.1" +description = "Open Domestic Threat Scanner - LLM Threat-Intelligence" authors = ["Daethyra dev-daethyra@protonmail.com"] [tool.poetry.dependencies] -[tool.poetry.dependencies] -python = "^3.11.3" +python = ">=3.11.3,<3.12" +numpy = ">=1.22,<=1.24.3" scikit-learn = "^0.24.2" -tensorflow = "^2.6.0" -pandas = "^1.3.3" -python-dotenv = "^0.19.1" -PyPDF2 = "^2.3.1" +tensorflow-io-gcs-filesystem = "^0.31.0" +tensorflow = "^2.13.0" +pandas = "^1.5.3" +python-dotenv = "^0.19.2" +PyPDF2 = "^2.12.1" nltk = "^3.8.1" [tool.poetry.dev-dependencies] + [tool.poetry.scripts] From 1579c96b409ab1fd75aec5de4b086a5cf5fbe22d Mon Sep 17 00:00:00 2001 From: Daemon <109057945+Daethyra@users.noreply.github.com> Date: Wed, 16 Aug 2023 21:43:50 -0700 Subject: [PATCH 06/17] modified: project/modules/CyberSentinel/preprocess/preprocessing.py --- .../CyberSentinel/preprocess/preprocessing.py | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/project/modules/CyberSentinel/preprocess/preprocessing.py b/project/modules/CyberSentinel/preprocess/preprocessing.py index 9d4ac11..8448c18 100644 --- a/project/modules/CyberSentinel/preprocess/preprocessing.py +++ b/project/modules/CyberSentinel/preprocess/preprocessing.py @@ -12,6 +12,9 @@ import nltk from typing import List from datetime import datetime +from urllib.parse import urlparse +import urllib.request +import shutil # Downloading NLTK resources if not already present nltk.download('wordnet') @@ -32,6 +35,22 @@ def __init__(self): self.lemmatizer = WordNetLemmatizer() self.stop_words = set(stopwords.words('english')) + def validate_input_path(self, file_path: str) -> str: + # Check if the file path is an HTTPS link + parsed_url = urlparse(file_path) + if parsed_url.scheme == "https": + # Download the file to a temporary location + temp_file_path = "temp_file" + with urllib.request.urlopen(file_path) as response, open(temp_file_path, 'wb') as out_file: + shutil.copyfileobj(response, out_file) + return temp_file_path + + # Check if the file path exists, if not create the directories + if not os.path.exists(file_path) and file_path != '.': + os.makedirs(os.path.dirname(file_path), exist_ok=True) + + return file_path + def read_file(self, file_path: str, reader_func) -> str: result = '' if file_path == '.': @@ -80,6 +99,7 @@ def preprocess_text_data(self, text: str) -> str: return tokenized_text def load_data(self, file_path: str) -> List[str]: + file_path = self.validate_input_path(file_path) if not os.path.exists(file_path) and file_path != '.': raise ValueError("File path does not exist.") From f804ace5c7f4923ba2416e5e9b39aec6a3dba34b Mon Sep 17 00:00:00 2001 From: Daemon <109057945+Daethyra@users.noreply.github.com> Date: Wed, 16 Aug 2023 21:47:38 -0700 Subject: [PATCH 07/17] Updated dependabot --- .github/dependabot-misc.yml | 24 ++++++++++++++ .github/dependabot-python.yml | 24 ++++++++++++++ .github/dependabot.yml | 60 ----------------------------------- 3 files changed, 48 insertions(+), 60 deletions(-) create mode 100644 .github/dependabot-misc.yml create mode 100644 .github/dependabot-python.yml delete mode 100644 .github/dependabot.yml diff --git a/.github/dependabot-misc.yml b/.github/dependabot-misc.yml new file mode 100644 index 0000000..e4e964e --- /dev/null +++ b/.github/dependabot-misc.yml @@ -0,0 +1,24 @@ + # Enable version updates for npm + - package-ecosystem: "npm" + # Look for `package.json` and `lock` files in the `root` directory + directory: "/project/modules/" + # Check the npm registry for updates every day (weekdays) + schedule: + interval: "daily" + # Check for npm updates at 9am UTC + time: "09:00" + timezone: "America/Los_Angeles" + target-branch: "master" + versioning-strategy: auto + + # Enable version updates for Docker + - package-ecosystem: "docker" + # Look for a `Dockerfile` in the `root` directory + directory: "/" + # Check for updates once a week + schedule: + interval: "daily" + # Check for npm updates at 9am UTC + time: "09:00" + timezone: "America/Los_Angeles" + target-branch: "master" \ No newline at end of file diff --git a/.github/dependabot-python.yml b/.github/dependabot-python.yml new file mode 100644 index 0000000..1e6da62 --- /dev/null +++ b/.github/dependabot-python.yml @@ -0,0 +1,24 @@ +# Please see the documentation for all configuration options: +# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates +version: 2 +updates: + # Enable version updates for multiple branches + - package-ecosystem: "pip" + directory: "/" + schedule: + interval: "daily" + # Check for pip updates at 9am UTC + time: "09:00" + timezone: "America/Los_Angeles" + target-branch: "master" + versioning-strategy: auto + + - package-ecosystem: "pip" + directory: "/" + schedule: + interval: "daily" + # Check for pip updates at 9am UTC + time: "09:00" + timezone: "America/Los_Angeles" + target-branch: "v1.5*" + versioning-strategy: auto \ No newline at end of file diff --git a/.github/dependabot.yml b/.github/dependabot.yml deleted file mode 100644 index d831a2f..0000000 --- a/.github/dependabot.yml +++ /dev/null @@ -1,60 +0,0 @@ -# Please see the documentation for all configuration options: -# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates -version: 2 -updates: - # Enable version updates for multiple branches - - package-ecosystem: "pip" - directory: "/" - schedule: - interval: "daily" - # Check for npm updates at 9am UTC - time: "09:00" - timezone: "America/Los_Angeles" - target-branch: "master" - versioning-strategy: auto - - - package-ecosystem: "pip" - directory: "/" - schedule: - interval: "daily" - # Check for npm updates at 9am UTC - time: "09:00" - timezone: "America/Los_Angeles" - target-branch: "v1.5" - versioning-strategy: auto - - - package-ecosystem: "pip" - directory: "/" - schedule: - interval: "daily" - # Check for npm updates at 9am UTC - time: "09:00" - timezone: "America/Los_Angeles" - target-branch: "v1.5.1" - versioning-strategy: auto - - - # Enable version updates for npm - - package-ecosystem: "npm" - # Look for `package.json` and `lock` files in the `root` directory - directory: "/" - # Check the npm registry for updates every day (weekdays) - schedule: - interval: "daily" - # Check for npm updates at 9am UTC - time: "09:00" - timezone: "America/Los_Angeles" - target-branch: "master" - versioning-strategy: auto - - # Enable version updates for Docker - - package-ecosystem: "docker" - # Look for a `Dockerfile` in the `root` directory - directory: "/" - # Check for updates once a week - schedule: - interval: "daily" - # Check for npm updates at 9am UTC - time: "09:00" - timezone: "America/Los_Angeles" - target-branch: "master" From c4bf5533a114b61cb6a2161a73dc262568789385 Mon Sep 17 00:00:00 2001 From: Daemon <109057945+Daethyra@users.noreply.github.com> Date: Wed, 16 Aug 2023 21:52:27 -0700 Subject: [PATCH 08/17] modified: .github/workflows/docker-ci.yml modified: .github/workflows/greetings.yml --- .github/workflows/docker-ci.yml | 2 +- .github/workflows/greetings.yml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/docker-ci.yml b/.github/workflows/docker-ci.yml index 271afa1..f4b7dce 100644 --- a/.github/workflows/docker-ci.yml +++ b/.github/workflows/docker-ci.yml @@ -37,7 +37,7 @@ jobs: name: Build image uses: docker/build-push-action@v3 with: - context: ./project/setup/orchestration/ + context: ./ build-args: BUILD_TYPE=${{ matrix.build-type }} tags: ${{ env.IMAGE_NAME }} load: true # save to docker images diff --git a/.github/workflows/greetings.yml b/.github/workflows/greetings.yml index 4677434..c1dfd39 100644 --- a/.github/workflows/greetings.yml +++ b/.github/workflows/greetings.yml @@ -12,5 +12,5 @@ jobs: - uses: actions/first-interaction@v1 with: repo-token: ${{ secrets.GITHUB_TOKEN }} - issue-message: "Message that will be displayed on users' first issue" - pr-message: "Message that will be displayed on users' first pull request" + issue-message: "Thanks for chippin' in, choom. We'll have this looked at right away." + pr-message: "OK! Big legend! Nice job, can't wait to see your work." From 7e6580ed4e5a7e68fbb2b023217bdc8d30dedb2b Mon Sep 17 00:00:00 2001 From: Daemon <109057945+Daethyra@users.noreply.github.com> Date: Wed, 16 Aug 2023 21:57:13 -0700 Subject: [PATCH 09/17] modified: .github/workflows/docker-ci.yml renamed: Dockerfile -> project/setup/orchestration/Dockerfile renamed: docker-compose.yml -> project/setup/orchestration/docker-compose.yml new file: project/setup/orchestration/requirements.txt --- .github/workflows/docker-ci.yml | 2 +- Dockerfile => project/setup/orchestration/Dockerfile | 0 .../setup/orchestration/docker-compose.yml | 0 project/setup/orchestration/requirements.txt | 9 +++++++++ 4 files changed, 10 insertions(+), 1 deletion(-) rename Dockerfile => project/setup/orchestration/Dockerfile (100%) rename docker-compose.yml => project/setup/orchestration/docker-compose.yml (100%) create mode 100644 project/setup/orchestration/requirements.txt diff --git a/.github/workflows/docker-ci.yml b/.github/workflows/docker-ci.yml index f4b7dce..271afa1 100644 --- a/.github/workflows/docker-ci.yml +++ b/.github/workflows/docker-ci.yml @@ -37,7 +37,7 @@ jobs: name: Build image uses: docker/build-push-action@v3 with: - context: ./ + context: ./project/setup/orchestration/ build-args: BUILD_TYPE=${{ matrix.build-type }} tags: ${{ env.IMAGE_NAME }} load: true # save to docker images diff --git a/Dockerfile b/project/setup/orchestration/Dockerfile similarity index 100% rename from Dockerfile rename to project/setup/orchestration/Dockerfile diff --git a/docker-compose.yml b/project/setup/orchestration/docker-compose.yml similarity index 100% rename from docker-compose.yml rename to project/setup/orchestration/docker-compose.yml diff --git a/project/setup/orchestration/requirements.txt b/project/setup/orchestration/requirements.txt new file mode 100644 index 0000000..969b626 --- /dev/null +++ b/project/setup/orchestration/requirements.txt @@ -0,0 +1,9 @@ +python>=3.11.3,<3.12 +numpy==1.24.3 +scikit-learn==0.24.2 +tensorflow-io-gcs-filesystem==0.31.0 +tensorflow==2.13.0 +pandas==1.5.3 +python-dotenv==0.19.2 +PyPDF2==2.12.1 +nltk==3.8.1 From cb985588d47afc6a7d5110ef5d6147c68a0b2ecd Mon Sep 17 00:00:00 2001 From: Daemon <109057945+Daethyra@users.noreply.github.com> Date: Wed, 16 Aug 2023 22:05:03 -0700 Subject: [PATCH 10/17] modified: project/setup/orchestration/requirements.txt --- project/setup/orchestration/requirements.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/project/setup/orchestration/requirements.txt b/project/setup/orchestration/requirements.txt index 969b626..2e9aff9 100644 --- a/project/setup/orchestration/requirements.txt +++ b/project/setup/orchestration/requirements.txt @@ -1,4 +1,3 @@ -python>=3.11.3,<3.12 numpy==1.24.3 scikit-learn==0.24.2 tensorflow-io-gcs-filesystem==0.31.0 From 77a71e1264946b539b0b3e6d32b0a8f3fb435166 Mon Sep 17 00:00:00 2001 From: Daemon <109057945+Daethyra@users.noreply.github.com> Date: Wed, 16 Aug 2023 22:08:03 -0700 Subject: [PATCH 11/17] modified: project/setup/orchestration/docker-compose.yml --- project/setup/orchestration/docker-compose.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/project/setup/orchestration/docker-compose.yml b/project/setup/orchestration/docker-compose.yml index 60f5913..38e4354 100644 --- a/project/setup/orchestration/docker-compose.yml +++ b/project/setup/orchestration/docker-compose.yml @@ -3,8 +3,8 @@ version: '3.9' services: opendts: build: - context: ./ + context: ./project/setup/orchestration dockerfile: Dockerfile target: builder volumes: - - ./:/app \ No newline at end of file + - ./:/app From 8b41a95e06f64355173a529cf389ac026b2c76d9 Mon Sep 17 00:00:00 2001 From: Daemon <109057945+Daethyra@users.noreply.github.com> Date: Thu, 17 Aug 2023 20:27:09 -0700 Subject: [PATCH 12/17] deleted: .github/workflows/.gitignore deleted: documents/todo.txt modified: project/modules/CyberSentinel/.env.template modified: project/modules/CyberSentinel/preprocess/preprocessing.py new file: project/modules/CyberSentinel/utilities/logging.py modified: project/setup/orchestration/requirements.txt modified: pyproject.toml --- .github/workflows/.gitignore | 4 --- documents/todo.txt | 35 ------------------- project/modules/CyberSentinel/.env.template | 2 ++ .../CyberSentinel/preprocess/preprocessing.py | 16 +++++---- .../CyberSentinel/utilities/logging.py | 10 ++++++ project/setup/orchestration/requirements.txt | 18 +++++----- pyproject.toml | 20 ++++++----- 7 files changed, 44 insertions(+), 61 deletions(-) delete mode 100644 .github/workflows/.gitignore delete mode 100644 documents/todo.txt create mode 100644 project/modules/CyberSentinel/utilities/logging.py diff --git a/.github/workflows/.gitignore b/.github/workflows/.gitignore deleted file mode 100644 index 5bde4d8..0000000 --- a/.github/workflows/.gitignore +++ /dev/null @@ -1,4 +0,0 @@ -venv -.env -__pycache__ -logs* \ No newline at end of file diff --git a/documents/todo.txt b/documents/todo.txt deleted file mode 100644 index 2d7fcfd..0000000 --- a/documents/todo.txt +++ /dev/null @@ -1,35 +0,0 @@ -1. combine multi-hooker and multi extension converter - -2. automate the creation and storage of Ada002's embeddings - -3. create langchain, Chain, for dynamic Chain generation from user -# I am creating a natural language processor that helps people understand their enterprise's threat-landscape by providing step by step guidance for vulnerability reconaissance and remediation. -3a. Create Chain to exclude certain groups, only include certain groups -Only right-wing extremist groups are eligible for data processing. - -4. Implement Chain for cybersecurity tools like https://nmap.org/download, https://www.whois.com/whois, - -5. train binary classification model (intended-violence) -a. Collect better training data. Research hateful comments online that imply the intention to commit violence. Then, find examples that are sarcastic to train against creating False Positives further down the line -b. complete preprocessor: requires[sanitization, cleaning, standardization, tokenization, splitting,] -c. revamp or upgrade embeddings.py module -d. revamp or upgrade model_creator.py module -e. test and evaluate CyberSentinel using techniques I learned from Google courses(https://developers.google.com/machine-learning/crash-course/classification/video-lecture) - -6(2a). create similarity query functionality against Pinecone index -6a(2b). create langchain for queries - -7. Integrate CyberSentinel with OpenDTS' langchain features -# Crucial questions: -- What will orchestrate the langchain Chain calls? What LLM? Or, just how!? -- How will the model be used? In what circumstances is it appropriate for the Master Agent to decide to run something against the CyberSentinel classification model? In what circumstances should I use similarity queries against Pinecone? - -8. Study AutoGPT's web browser modules; see how adaptable/useful it'd be for this project - - -0x1. Update breakdown.md file -0x2. Update readme.md file -000x1. Create Dockerfile -000x2. Create compose.yml -000x3. Automate SetUp process by writing system-level commands in setup.py -000x3a. AND provide detail manual installation guidance in the ReadMe.md \ No newline at end of file diff --git a/project/modules/CyberSentinel/.env.template b/project/modules/CyberSentinel/.env.template index fd923b4..4389573 100644 --- a/project/modules/CyberSentinel/.env.template +++ b/project/modules/CyberSentinel/.env.template @@ -1,3 +1,5 @@ +# Create a copy of this file and name it '.env' + # Model Training Configuration TRAINING_DATA_PATH=project/modules/CyberSentinel/training-data/ LEARNING_RATE=0.001 diff --git a/project/modules/CyberSentinel/preprocess/preprocessing.py b/project/modules/CyberSentinel/preprocess/preprocessing.py index 8448c18..e16207c 100644 --- a/project/modules/CyberSentinel/preprocess/preprocessing.py +++ b/project/modules/CyberSentinel/preprocess/preprocessing.py @@ -6,6 +6,7 @@ import csv import chardet import logging +from ..utilities.logging import * from PyPDF2 import PdfFileReader from nltk.stem import WordNetLemmatizer from nltk.corpus import stopwords @@ -21,13 +22,11 @@ nltk.download('stopwords') # Point to the location of the .env file relative to the script's location -env_path = os.path.join(os.path.dirname(__file__), '../../../.env') +env_path = os.path.join(os.path.dirname(__file__), '../.env') # Load the .env file load_dotenv(dotenv_path=env_path) -logging.basicConfig(filename='preprocessing_%Y%m%d_%H%M%S.log', level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') - class Preprocessor: def __init__(self): self.input_file_path = os.getenv('INPUT_FILE_PATH') @@ -35,7 +34,10 @@ def __init__(self): self.lemmatizer = WordNetLemmatizer() self.stop_words = set(stopwords.words('english')) - def validate_input_path(self, file_path: str) -> str: + def validate_input_path(self, file_path: str = None) -> str: + if file_path is None: + file_path = '../training-data' # Default path to target files in the training-data directory + # Check if the file path is an HTTPS link parsed_url = urlparse(file_path) if parsed_url.scheme == "https": @@ -75,11 +77,13 @@ def read_with_detected_encoding(self, file_path: str, reader_func) -> str: text = '' return text - def read_pdf(self, file, encoding: str) -> str: + def read_pdf(self, file) -> str: text = '' pdf_reader = PdfFileReader(file) for page in range(pdf_reader.getNumPages()): - text += pdf_reader.getPage(page).extractText() + page_text = pdf_reader.getPage(page).extractText() + encoding = chardet.detect(page_text.encode())['encoding'] + text += page_text.decode(encoding) return text def read_txt(self, file, encoding: str) -> List[str]: diff --git a/project/modules/CyberSentinel/utilities/logging.py b/project/modules/CyberSentinel/utilities/logging.py new file mode 100644 index 0000000..0c98a81 --- /dev/null +++ b/project/modules/CyberSentinel/utilities/logging.py @@ -0,0 +1,10 @@ +import logging +from datetime import datetime + +# Format the current datetime +current_time = datetime.now().strftime("%d%m%Y_%H%M%S") + +# Concatenate the datetime with the log filename +log_filename = f'preprocessing{current_time}.log' + +logging.basicConfig(filename=log_filename, level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s') diff --git a/project/setup/orchestration/requirements.txt b/project/setup/orchestration/requirements.txt index 2e9aff9..9ea8540 100644 --- a/project/setup/orchestration/requirements.txt +++ b/project/setup/orchestration/requirements.txt @@ -1,8 +1,10 @@ -numpy==1.24.3 -scikit-learn==0.24.2 -tensorflow-io-gcs-filesystem==0.31.0 -tensorflow==2.13.0 -pandas==1.5.3 -python-dotenv==0.19.2 -PyPDF2==2.12.1 -nltk==3.8.1 +cython +numpy +chardet +scikit-learn +tensorflow-io-gcs-filesystem +tensorflow +pandas +python-dotenv +PyPDF2 +nltk \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index a0be77e..9aa39b6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,5 +1,5 @@ [build-system] -requires = ["poetry-core>=1.0.0"] +requires = ["poetry-core>=1.6.1"] build-backend = "poetry.core.masonry.api" [tool.poetry] @@ -9,14 +9,18 @@ description = "Open Domestic Threat Scanner - LLM Threat-Intelligence" authors = ["Daethyra dev-daethyra@protonmail.com"] [tool.poetry.dependencies] -python = ">=3.11.3,<3.12" -numpy = ">=1.22,<=1.24.3" -scikit-learn = "^0.24.2" -tensorflow-io-gcs-filesystem = "^0.31.0" +python = ">3.9,<3.12.0" +wheel = "^0.41.1" +chardet = "^5.2.0" +setuptools = "^60.0" +numpy = "^1.25.2" +cython = "^3.0.0" +scikit-learn = "^1.3.0" +tensorflow-io-gcs-filesystem = "^0.33.0" tensorflow = "^2.13.0" -pandas = "^1.5.3" -python-dotenv = "^0.19.2" -PyPDF2 = "^2.12.1" +pandas = "^2.0.3" +python-dotenv = "^1.0.0" +PyPDF2 = "^3.15.0" nltk = "^3.8.1" [tool.poetry.dev-dependencies] From 1a34c99764e0c952c7c8af4a1dc4ceabe5995076 Mon Sep 17 00:00:00 2001 From: Daemon <109057945+Daethyra@users.noreply.github.com> Date: Thu, 17 Aug 2023 20:35:22 -0700 Subject: [PATCH 13/17] modified: project/modules/CyberSentinel/preprocess/preprocessing.py --- .../CyberSentinel/preprocess/preprocessing.py | 26 ++++++++++--------- 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/project/modules/CyberSentinel/preprocess/preprocessing.py b/project/modules/CyberSentinel/preprocess/preprocessing.py index e16207c..626ee3a 100644 --- a/project/modules/CyberSentinel/preprocess/preprocessing.py +++ b/project/modules/CyberSentinel/preprocess/preprocessing.py @@ -1,21 +1,23 @@ """ Defines functions for ingesting files, lemmatizes and removing stop words, and tokenization. """ -from dotenv import load_dotenv +import csv import os import re -import csv -import chardet -import logging -from ..utilities.logging import * -from PyPDF2 import PdfFileReader -from nltk.stem import WordNetLemmatizer -from nltk.corpus import stopwords -import nltk -from typing import List +import shutil +import urllib.request from datetime import datetime +from typing import List from urllib.parse import urlparse -import urllib.request -import shutil + +import chardet +import nltk +from dotenv import load_dotenv +from nltk.corpus import stopwords +from nltk.stem import WordNetLemmatizer +from PyPDF2 import PdfFileReader + +from utilities.logging import * + # Downloading NLTK resources if not already present nltk.download('wordnet') From 12d7bd05952f998992fe2d4e1c8523c8ac62ae4f Mon Sep 17 00:00:00 2001 From: Daemon <109057945+Daethyra@users.noreply.github.com> Date: Fri, 18 Aug 2023 17:55:59 -0700 Subject: [PATCH 14/17] Moved orchestration folder to the modules directory --- project/{setup => modules}/orchestration/Dockerfile | 0 project/{setup => modules}/orchestration/docker-compose.yml | 0 project/{setup => modules}/orchestration/requirements.txt | 0 3 files changed, 0 insertions(+), 0 deletions(-) rename project/{setup => modules}/orchestration/Dockerfile (100%) rename project/{setup => modules}/orchestration/docker-compose.yml (100%) rename project/{setup => modules}/orchestration/requirements.txt (100%) diff --git a/project/setup/orchestration/Dockerfile b/project/modules/orchestration/Dockerfile similarity index 100% rename from project/setup/orchestration/Dockerfile rename to project/modules/orchestration/Dockerfile diff --git a/project/setup/orchestration/docker-compose.yml b/project/modules/orchestration/docker-compose.yml similarity index 100% rename from project/setup/orchestration/docker-compose.yml rename to project/modules/orchestration/docker-compose.yml diff --git a/project/setup/orchestration/requirements.txt b/project/modules/orchestration/requirements.txt similarity index 100% rename from project/setup/orchestration/requirements.txt rename to project/modules/orchestration/requirements.txt From cfdbfaf14efd0e417ead74c366f6956c808301b5 Mon Sep 17 00:00:00 2001 From: Daemon <109057945+Daethyra@users.noreply.github.com> Date: Fri, 18 Aug 2023 21:37:04 -0700 Subject: [PATCH 15/17] - Moved documents folder to the project directory. - Updated dependabot files --- .github/dependabot-misc.yml | 4 ++-- .github/dependabot-python.yml | 4 ++-- {documents => project/documents}/LICENSE | 0 .../supplementary-information/Bad-Words/ListOfDirtyWords.txt | 0 .../Bad-Words/VIOLENT_TERRORIST_WORDS.txt | 0 .../supplementary-information/Bad-Words/bad_Words_list.txt | 0 .../supplementary-information/Bad-Words/badwords.txt | 0 .../supplementary-information/Bad-Words/citations.md | 0 .../supplementary-information/Bad-Words/cmu-bad-words.txt | 0 .../Conservative News Domains/CND_s1.txt | 0 .../documents}/supplementary-information/explanation.txt | 0 11 files changed, 4 insertions(+), 4 deletions(-) rename {documents => project/documents}/LICENSE (100%) rename {documents => project/documents}/supplementary-information/Bad-Words/ListOfDirtyWords.txt (100%) rename {documents => project/documents}/supplementary-information/Bad-Words/VIOLENT_TERRORIST_WORDS.txt (100%) rename {documents => project/documents}/supplementary-information/Bad-Words/bad_Words_list.txt (100%) rename {documents => project/documents}/supplementary-information/Bad-Words/badwords.txt (100%) rename {documents => project/documents}/supplementary-information/Bad-Words/citations.md (100%) rename {documents => project/documents}/supplementary-information/Bad-Words/cmu-bad-words.txt (100%) rename {documents => project/documents}/supplementary-information/Conservative News Domains/CND_s1.txt (100%) rename {documents => project/documents}/supplementary-information/explanation.txt (100%) diff --git a/.github/dependabot-misc.yml b/.github/dependabot-misc.yml index e4e964e..f14bbfe 100644 --- a/.github/dependabot-misc.yml +++ b/.github/dependabot-misc.yml @@ -1,7 +1,7 @@ # Enable version updates for npm - package-ecosystem: "npm" # Look for `package.json` and `lock` files in the `root` directory - directory: "/project/modules/" + directory: "/project/modules/web-ui" # MUST BE UPDATED UPON TYPESCRIPT PROGRAMMING # Check the npm registry for updates every day (weekdays) schedule: interval: "daily" @@ -14,7 +14,7 @@ # Enable version updates for Docker - package-ecosystem: "docker" # Look for a `Dockerfile` in the `root` directory - directory: "/" + directory: "/project/modules/orchestration" # Check for updates once a week schedule: interval: "daily" diff --git a/.github/dependabot-python.yml b/.github/dependabot-python.yml index 1e6da62..3107818 100644 --- a/.github/dependabot-python.yml +++ b/.github/dependabot-python.yml @@ -4,7 +4,7 @@ version: 2 updates: # Enable version updates for multiple branches - package-ecosystem: "pip" - directory: "/" + directory: "project/modules/orchestration" schedule: interval: "daily" # Check for pip updates at 9am UTC @@ -14,7 +14,7 @@ updates: versioning-strategy: auto - package-ecosystem: "pip" - directory: "/" + directory: "project/modules/orchestration" schedule: interval: "daily" # Check for pip updates at 9am UTC diff --git a/documents/LICENSE b/project/documents/LICENSE similarity index 100% rename from documents/LICENSE rename to project/documents/LICENSE diff --git a/documents/supplementary-information/Bad-Words/ListOfDirtyWords.txt b/project/documents/supplementary-information/Bad-Words/ListOfDirtyWords.txt similarity index 100% rename from documents/supplementary-information/Bad-Words/ListOfDirtyWords.txt rename to project/documents/supplementary-information/Bad-Words/ListOfDirtyWords.txt diff --git a/documents/supplementary-information/Bad-Words/VIOLENT_TERRORIST_WORDS.txt b/project/documents/supplementary-information/Bad-Words/VIOLENT_TERRORIST_WORDS.txt similarity index 100% rename from documents/supplementary-information/Bad-Words/VIOLENT_TERRORIST_WORDS.txt rename to project/documents/supplementary-information/Bad-Words/VIOLENT_TERRORIST_WORDS.txt diff --git a/documents/supplementary-information/Bad-Words/bad_Words_list.txt b/project/documents/supplementary-information/Bad-Words/bad_Words_list.txt similarity index 100% rename from documents/supplementary-information/Bad-Words/bad_Words_list.txt rename to project/documents/supplementary-information/Bad-Words/bad_Words_list.txt diff --git a/documents/supplementary-information/Bad-Words/badwords.txt b/project/documents/supplementary-information/Bad-Words/badwords.txt similarity index 100% rename from documents/supplementary-information/Bad-Words/badwords.txt rename to project/documents/supplementary-information/Bad-Words/badwords.txt diff --git a/documents/supplementary-information/Bad-Words/citations.md b/project/documents/supplementary-information/Bad-Words/citations.md similarity index 100% rename from documents/supplementary-information/Bad-Words/citations.md rename to project/documents/supplementary-information/Bad-Words/citations.md diff --git a/documents/supplementary-information/Bad-Words/cmu-bad-words.txt b/project/documents/supplementary-information/Bad-Words/cmu-bad-words.txt similarity index 100% rename from documents/supplementary-information/Bad-Words/cmu-bad-words.txt rename to project/documents/supplementary-information/Bad-Words/cmu-bad-words.txt diff --git a/documents/supplementary-information/Conservative News Domains/CND_s1.txt b/project/documents/supplementary-information/Conservative News Domains/CND_s1.txt similarity index 100% rename from documents/supplementary-information/Conservative News Domains/CND_s1.txt rename to project/documents/supplementary-information/Conservative News Domains/CND_s1.txt diff --git a/documents/supplementary-information/explanation.txt b/project/documents/supplementary-information/explanation.txt similarity index 100% rename from documents/supplementary-information/explanation.txt rename to project/documents/supplementary-information/explanation.txt From 6b5e485421ccb7b359d8f671453730de00676491 Mon Sep 17 00:00:00 2001 From: Daemon <109057945+Daethyra@users.noreply.github.com> Date: Fri, 18 Aug 2023 22:13:43 -0700 Subject: [PATCH 16/17] Fixed: pyproject.toml --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 9aa39b6..8ae705a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,10 +17,10 @@ numpy = "^1.25.2" cython = "^3.0.0" scikit-learn = "^1.3.0" tensorflow-io-gcs-filesystem = "^0.33.0" -tensorflow = "^2.13.0" +tensorflow = "^2.14.0rc0" pandas = "^2.0.3" python-dotenv = "^1.0.0" -PyPDF2 = "^3.15.0" +PyPDF2 = "^2.12.1" nltk = "^3.8.1" [tool.poetry.dev-dependencies] From d24d029b887c49ff10dbd455abb0945bdc8647ec Mon Sep 17 00:00:00 2001 From: Daemon <109057945+Daethyra@users.noreply.github.com> Date: Sat, 19 Aug 2023 08:18:09 -0700 Subject: [PATCH 17/17] Update readme.md Removed old dev thoughts --- readme.md | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/readme.md b/readme.md index 9d250a8..08a4ee1 100644 --- a/readme.md +++ b/readme.md @@ -4,15 +4,4 @@ TLDR: This project uses a multi-layered approach to predict whether or not someo OpenDTS (Domestic Threat Scanner) is a project intended to provide analytical insight into hostile sentiment towards a given demographic experiencing discrimination. -Users may create profiles to help them track sentiment towards one group. - ---- - -## Developer's Thoughts - -The application will do its best to accurately discern true intent to harm versus sarcasm, frustration without the intent to harm, and will make key decisions on multiple different layers comprised of entire models. Like the Ada-002 from OpenAI, for example. - -Ideally, the first actual deployment will be a heatmap serving real-time processing data for whatever I choose. - -- OpenDTS is the manifestation of my attempt to help others protect themselves -- I want it to be a comprehensive open-source cyberspace threat-intelligence platform +Users may create profiles to help them track sentiment towards one group. \ No newline at end of file