From 771b73bff11540c3bb66fdb35cc73fa6c1c7bd9b Mon Sep 17 00:00:00 2001
From: Daemon <109057945+Daethyra@users.noreply.github.com>
Date: Mon, 14 Aug 2023 15:37:00 -0700
Subject: [PATCH 01/17] Updated setup files

---
 project/setup/orchestration/requirements.txt | 26 +-----
 project/setup/pyproject.toml                 | 85 ++------------------
 2 files changed, 10 insertions(+), 101 deletions(-)

diff --git a/project/setup/orchestration/requirements.txt b/project/setup/orchestration/requirements.txt
index 798db88..1dc3f15 100644
--- a/project/setup/orchestration/requirements.txt
+++ b/project/setup/orchestration/requirements.txt
@@ -1,27 +1,5 @@
-python-dotenv
-threadpoolctl
-numpy
-joblib
-scipy
-scikit-learn==1.3.0
-pytz
-python-stdnum
-phonenumbers
-tzdata
-tqdm
-python-dateutil
-click
-nltk
-faker
-dateparser
-textblob
-scrubadub
-chardet
+scikit-learn
 tensorflow
-keras
-pinecone-client
 pandas
-openai
-langchain
 PyPDF2
-xlrd
\ No newline at end of file
+nltk
\ No newline at end of file
diff --git a/project/setup/pyproject.toml b/project/setup/pyproject.toml
index 200bc17..cfc9aae 100644
--- a/project/setup/pyproject.toml
+++ b/project/setup/pyproject.toml
@@ -9,83 +9,14 @@ description = "Description"
 authors = ["Daethyra dev-daethyra@protonmail.com"]
 
 [tool.poetry.dependencies]
-python = "^3.8"
-python-dotenv = "*"
-threadpoolctl = "*"
-numpy = "*"
-joblib = "*"
-scipy = "*"
-scikit-learn = "1.3.0"
-pytz = "*"
-python-stdnum = "*"
-phonenumbers = "*"
-tzdata = "*"
-tqdm = "*"
-python-dateutil = "*"
-click = "*"
-nltk = "*"
-faker = "*"
-dateparser = "*"
-textblob = "*"
-scrubadub = "*"
-chardet = "*"
-tensorflow = "*"
-keras = "*"
-pinecone-client = "*"
-pandas = "*"
-openai = "*"
-langchain = "*"
-PyPDF2 = "*"
-xlrd = "*"
-beautifulsoup4 = ">=4.12.2"
-colorama = "0.4.6"
-distro = "1.8.0"
-playsound = "1.2.2"
-pyyaml = "6.0.1"
-python-docx = "*"
-markdown = "*"
-pylatexenc = "*"
-readability-lxml = "0.8.1"
-requests = "*"
-tiktoken = "0.4.0"
-gTTS = "2.3.2"
-docker = "*"
-duckduckgo-search = "^3.8.4"
-google-api-python-client = "*"
-redis = "*"
-orjson = "3.8.10"
-Pillow = "*"
-selenium = "4.11.2"
-webdriver-manager = "*"
-jsonschema = "*"
-charset-normalizer = ">=3.1.0"
-spacy = ">=3.0.0,<4.0.0"
-prompt-toolkit = ">=3.0.38"
-pydantic = "*"
-inflection = "*"
-fastapi = "*"
-uvicorn = "*"
-coverage = "*"
-flake8 = "*"
-pre-commit = "*"
-black = "*"
-isort = "*"
-gitpython = "3.1.32"
-mkdocs = "*"
-pymdown-extensions = "*"
-mypy = "*"
-types-Markdown = "*"
-types-beautifulsoup4 = "*"
-types-colorama = "*"
-types-Pillow = "*"
-openapi-python-client = "0.15.0"
-pytest = "*"
-asynctest = "*"
-pytest-asyncio = "*"
-pytest-benchmark = "*"
-pytest-cov = "*"
-pytest-integration = "*"
-pytest-mock = "*"
+[tool.poetry.dependencies]
+python = "^3.11.3"
+scikit-learn = "^0.24.2"
+tensorflow = "^2.6.0"
+pandas = "^1.3.3"
+python-dotenv = "^0.19.1"
+PyPDF2 = "^2.3.1"
+nltk = "^3.8.1"
 
 [tool.poetry.dev-dependencies]
 

From eba9bf4d2b6c5a3d1aaa263767475ecc0933e23c Mon Sep 17 00:00:00 2001
From: Daemon <109057945+Daethyra@users.noreply.github.com>
Date: Mon, 14 Aug 2023 18:24:11 -0700
Subject: [PATCH 02/17] 	modified:  
 project/modules/CyberSentinel/.env.template 	new file:  
 project/modules/CyberSentinel/preprocess/__init__.py 	modified:  
 project/modules/CyberSentinel/preprocess/data_labeler.py 	modified:  
 project/modules/CyberSentinel/preprocess/preprocessing.py

---
 project/modules/CyberSentinel/.env.template   | 26 ++++++++++++++++++-
 .../CyberSentinel/preprocess/__init__.py      |  0
 .../CyberSentinel/preprocess/data_labeler.py  | 26 ++++++++++++-------
 .../CyberSentinel/preprocess/preprocessing.py | 15 +++++++++--
 4 files changed, 55 insertions(+), 12 deletions(-)
 create mode 100644 project/modules/CyberSentinel/preprocess/__init__.py

diff --git a/project/modules/CyberSentinel/.env.template b/project/modules/CyberSentinel/.env.template
index 44f9c90..fe186ea 100644
--- a/project/modules/CyberSentinel/.env.template
+++ b/project/modules/CyberSentinel/.env.template
@@ -1,5 +1,29 @@
-TRAINING_DATA_PATH=
+# Model Training Configuration
+TRAINING_DATA_PATH=project/training-data/
 LEARNING_RATE=0.001
 BATCH_SIZE=32
 EPOCHS=10
 L2_REG=0.01
+
+# Temporary file paths for DataLabeler
+TEMP_PDF_FILE_PATH=temp_pdf_data.csv
+TEMP_TXT_FILE_PATH=temp_txt_data.csv
+
+# Path to save labeled data
+LABELED_DATA_FILE_PATH=
+
+# Preprocessor Configuration (Use '.' for current working directory)
+INPUT_FILE_PATH=..
+PREPROCESSED_DATA_FILE_PATH=
+
+
+TRAINING_DATA_PATH=/absolute/path/to/training_data.csv
+LEARNING_RATE=0.001
+BATCH_SIZE=32
+EPOCHS=10
+L2_REG=0.01
+TEMP_PDF_FILE_PATH=/absolute/path/to/temp_pdf_data.csv
+TEMP_TXT_FILE_PATH=/absolute/path/to/temp_txt_data.csv
+LABELED_DATA_FILE_PATH=/absolute/path/to/labeled_data.csv
+INPUT_FILE_PATH=/absolute/path/to/input_files
+PREPROCESSED_DATA_FILE_PATH=/absolute/path/to/preprocessed_data
diff --git a/project/modules/CyberSentinel/preprocess/__init__.py b/project/modules/CyberSentinel/preprocess/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/project/modules/CyberSentinel/preprocess/data_labeler.py b/project/modules/CyberSentinel/preprocess/data_labeler.py
index c230deb..c4bae7e 100644
--- a/project/modules/CyberSentinel/preprocess/data_labeler.py
+++ b/project/modules/CyberSentinel/preprocess/data_labeler.py
@@ -1,11 +1,20 @@
+from dotenv import load_dotenv
 import os
 import csv
 from typing import List, Tuple
 
+# Point to the location of the .env file relative to the script's location
+env_path = os.path.join(os.path.dirname(__file__), '../../../.env')
+
+# Load the .env file
+load_dotenv(dotenv_path=env_path)
+
 class DataLabeler:
-    def __init__(self, temp_pdf_file_path: str = "temp_pdf_data.csv", temp_txt_file_path: str = "temp_txt_data.csv"):
-        self.temp_pdf_file_path = temp_pdf_file_path
-        self.temp_txt_file_path = temp_txt_file_path
+    def __init__(self):
+        default_temp_path = os.path.dirname(__file__)
+        self.temp_pdf_file_path = os.getenv('TEMP_PDF_FILE_PATH', os.path.join(default_temp_path, 'temp_pdf_data.csv'))
+        self.temp_txt_file_path = os.getenv('TEMP_TXT_FILE_PATH', os.path.join(default_temp_path, 'temp_txt_data.csv'))
+        self.output_file_path = os.getenv('LABELED_DATA_FILE_PATH')
         self.labeled_pdf_data = self.load_temp_data(self.temp_pdf_file_path)
         self.labeled_txt_data = self.load_temp_data(self.temp_txt_file_path)
 
@@ -36,13 +45,13 @@ def load_temp_data(self, file_path: str) -> List[Tuple[str, bool]]:
     def label_data(self, data: List[str]) -> List[Tuple[str, bool]]:
         labeled_data = []
         for text in data:
-            print("\nSample:")
-            print(text)
+            print(f"\nSample:\n{text}")  # real newlines: "\\n" would print a literal backslash-n and glue the text to the label
             label = self.get_user_input("Does this text indicate the intention to commit acts of hate-based violence? (True/False): ")
             labeled_data.append((text, label))
         return labeled_data
 
-    def save_labeled_data_to_csv(self, labeled_data: List[Tuple[str, bool]], file_path: str):
+    def save_labeled_data_to_csv(self, labeled_data: List[Tuple[str, bool]]):
+        file_path = self.output_file_path or input("Enter the path to save the labeled data: ")
         with open(file_path, 'w', newline='', encoding='utf-8') as file:
             writer = csv.writer(file)
             writer.writerow(['text', 'label'])
@@ -52,12 +61,11 @@ def save_labeled_data_to_csv(self, labeled_data: List[Tuple[str, bool]], file_pa
 if __name__ == "__main__":
     data_labeler = DataLabeler()
     # Load the preprocessed data from the file saved by the Preprocessor
-    file_path = input("Enter the path to the preprocessed data file: ")
+    file_path = os.getenv('PREPROCESSED_DATA_FILE_PATH') or input("Enter the path to the preprocessed data file: ")
     with open(file_path, 'r', encoding='utf-8') as file:
         reader = csv.reader(file)
         next(reader)  # Skip the header
         data = [row[0] for row in reader]
 
     labeled_data = data_labeler.label_data(data)
-    output_file_path = input("Enter the path to save the labeled data: ")
-    data_labeler.save_labeled_data_to_csv(labeled_data, output_file_path)
+    data_labeler.save_labeled_data_to_csv(labeled_data)
diff --git a/project/modules/CyberSentinel/preprocess/preprocessing.py b/project/modules/CyberSentinel/preprocess/preprocessing.py
index 8ddfe5b..3eba7bd 100644
--- a/project/modules/CyberSentinel/preprocess/preprocessing.py
+++ b/project/modules/CyberSentinel/preprocess/preprocessing.py
@@ -1,7 +1,9 @@
 """ Defines functions for ingesting files, lemmatizes and removeing stop words, and tokenization. """
 
+from dotenv import load_dotenv
 import os
 import re
+import csv
 from PyPDF2 import PdfFileReader
 from nltk.stem import WordNetLemmatizer
 from nltk.corpus import stopwords
@@ -13,8 +15,17 @@
 nltk.download('wordnet')
 nltk.download('stopwords')
 
+# Point to the location of the .env file relative to the script's location
+env_path = os.path.join(os.path.dirname(__file__), '../../../.env')
+
+# Load the .env file
+load_dotenv(dotenv_path=env_path)
+
 class Preprocessor:
     def __init__(self):
+        default_temp_path = os.path.dirname(__file__)
+        self.input_file_path = os.getenv('INPUT_FILE_PATH')
+        self.output_file_path = os.getenv('PREPROCESSED_DATA_FILE_PATH', f"processed_data_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv")
         self.lemmatizer = WordNetLemmatizer()
         self.stop_words = set(stopwords.words('english'))
 
@@ -75,7 +86,7 @@ def load_data(self, file_path: str) -> List[str]:
 
     def save_processed_data(self, processed_data: List[str], file_type: str = "csv"):
         timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-        file_path = f"processed_data_{timestamp}.{file_type}"
+        file_path = os.getenv('PREPROCESSED_DATA_FILE_PATH', f"processed_data_{timestamp}.{file_type}")
         with open(file_path, 'w', newline='', encoding='utf-8') as file:
             writer = csv.writer(file)
             writer.writerow(['text'])
@@ -85,6 +96,6 @@ def save_processed_data(self, processed_data: List[str], file_type: str = "csv")
 
 if __name__ == "__main__":
     preprocessor = Preprocessor()
-    file_path = input("Enter the path to the file or '.' to process all PDF and TXT files in the current directory: ")
+    file_path = os.getenv('INPUT_FILE_PATH') or input("Enter the path to the file or '.' to process all PDF and TXT files in the current directory: ")
     processed_data = preprocessor.load_data(file_path)
     preprocessor.save_processed_data(processed_data)
\ No newline at end of file

From e9497582a0b9371525d25b934a769fb63d7d5962 Mon Sep 17 00:00:00 2001
From: Daemon <109057945+Daethyra@users.noreply.github.com>
Date: Mon, 14 Aug 2023 18:30:38 -0700
Subject: [PATCH 03/17] 	modified:  
 project/modules/CyberSentinel/.env.template

---
 project/modules/CyberSentinel/.env.template | 24 ++++++---------------
 1 file changed, 6 insertions(+), 18 deletions(-)

diff --git a/project/modules/CyberSentinel/.env.template b/project/modules/CyberSentinel/.env.template
index fe186ea..fd923b4 100644
--- a/project/modules/CyberSentinel/.env.template
+++ b/project/modules/CyberSentinel/.env.template
@@ -1,29 +1,17 @@
 # Model Training Configuration
-TRAINING_DATA_PATH=project/training-data/
+TRAINING_DATA_PATH=project/modules/CyberSentinel/training-data/
 LEARNING_RATE=0.001
 BATCH_SIZE=32
 EPOCHS=10
 L2_REG=0.01
 
+# Preprocessor Configuration (Use '.' for current working directory)
+INPUT_FILE_PATH=
+PREPROCESSED_DATA_FILE_PATH=project/modules/CyberSentinel/training-data/Processed-Data/
+
 # Temporary file paths for DataLabeler
 TEMP_PDF_FILE_PATH=temp_pdf_data.csv
 TEMP_TXT_FILE_PATH=temp_txt_data.csv
 
 # Path to save labeled data
-LABELED_DATA_FILE_PATH=
-
-# Preprocessor Configuration (Use '.' for current working directory)
-INPUT_FILE_PATH=..
-PREPROCESSED_DATA_FILE_PATH=
-
-
-TRAINING_DATA_PATH=/absolute/path/to/training_data.csv
-LEARNING_RATE=0.001
-BATCH_SIZE=32
-EPOCHS=10
-L2_REG=0.01
-TEMP_PDF_FILE_PATH=/absolute/path/to/temp_pdf_data.csv
-TEMP_TXT_FILE_PATH=/absolute/path/to/temp_txt_data.csv
-LABELED_DATA_FILE_PATH=/absolute/path/to/labeled_data.csv
-INPUT_FILE_PATH=/absolute/path/to/input_files
-PREPROCESSED_DATA_FILE_PATH=/absolute/path/to/preprocessed_data
+LABELED_DATA_FILE_PATH=project/modules/CyberSentinel/preprocess/
\ No newline at end of file

From b78c9e4a234a2a38daf5721e5e1cb8a6792b04ad Mon Sep 17 00:00:00 2001
From: Daemon <109057945+Daethyra@users.noreply.github.com>
Date: Mon, 14 Aug 2023 21:52:38 -0700
Subject: [PATCH 04/17] 	modified:   project/setup/orchestration/Dockerfile

---
 project/setup/orchestration/Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/project/setup/orchestration/Dockerfile b/project/setup/orchestration/Dockerfile
index 5928fbb..6a07a02 100644
--- a/project/setup/orchestration/Dockerfile
+++ b/project/setup/orchestration/Dockerfile
@@ -18,7 +18,7 @@ COPY requirements.txt /app/requirements.txt
 
 # Update pip and install any needed packages specified in requirements.txt
 RUN pip install --upgrade pip
-RUN pip install --trusted-host pypi.python.org -r requirements.txt
+RUN pip install --no-cache-dir -r requirements.txt
 
 # Second stage: Create the final image
 FROM python:3.11-slim

From 7d0157c1640f142f51f298b9d3fd5cd0b1a709a6 Mon Sep 17 00:00:00 2001
From: Daemon <109057945+Daethyra@users.noreply.github.com>
Date: Wed, 16 Aug 2023 21:29:42 -0700
Subject: [PATCH 05/17] 	modified:   .github/dependabot.yml 	renamed:   
 project/setup/orchestration/Dockerfile -> Dockerfile 	new file:  
 docker-compose.yml 	renamed:   
 documents/supplementary-docs/Bad-Words/ListOfDirtyWords.txt ->
 documents/supplementary-information/Bad-Words/ListOfDirtyWords.txt 
 renamed:   
 documents/supplementary-docs/Bad-Words/VIOLENT_TERRORIST_WORDS.txt ->
 documents/supplementary-information/Bad-Words/VIOLENT_TERRORIST_WORDS.txt 
 renamed:    documents/supplementary-docs/Bad-Words/bad_Words_list.txt ->
 documents/supplementary-information/Bad-Words/bad_Words_list.txt 
 renamed:    documents/supplementary-docs/Bad-Words/badwords.txt ->
 documents/supplementary-information/Bad-Words/badwords.txt 	renamed:   
 documents/supplementary-docs/Bad-Words/citations.md ->
 documents/supplementary-information/Bad-Words/citations.md 	renamed:   
 documents/supplementary-docs/Bad-Words/cmu-bad-words.txt ->
 documents/supplementary-information/Bad-Words/cmu-bad-words.txt     renamed: 
   documents/supplementary-docs/Conservative News Domains/CND_s1.txt ->
 documents/supplementary-information/Conservative News Domains/CND_s1.txt 
 new file:   documents/supplementary-information/explanation.txt 
 modified:   project/modules/CyberSentinel/preprocess/preprocessing.py 
 deleted:    project/modules/CyberSentinel/web-ui 	deleted:   
 project/setup/orchestration/docker-compose.yml 	deleted:   
 project/setup/orchestration/requirements.txt 	deleted:   
 project/setup/setup.py 	deleted:    project/tests/not_inprogress.txt 
 renamed:    project/setup/pyproject.toml -> pyproject.toml

---
 .github/dependabot.yml                        |  4 +-
 .../orchestration/Dockerfile => Dockerfile    |  0
 docker-compose.yml                            | 10 +++
 .../Bad-Words/ListOfDirtyWords.txt            |  0
 .../Bad-Words/VIOLENT_TERRORIST_WORDS.txt     |  0
 .../Bad-Words/bad_Words_list.txt              |  0
 .../Bad-Words/badwords.txt                    |  0
 .../Bad-Words/citations.md                    |  0
 .../Bad-Words/cmu-bad-words.txt               |  0
 .../Conservative News Domains/CND_s1.txt      |  0
 .../supplementary-information/explanation.txt |  2 +
 .../CyberSentinel/preprocess/preprocessing.py | 80 ++++++++++++-------
 project/modules/CyberSentinel/web-ui          |  1 -
 .../setup/orchestration/docker-compose.yml    | 44 ----------
 project/setup/orchestration/requirements.txt  |  5 --
 project/setup/setup.py                        |  5 --
 project/tests/not_inprogress.txt              |  1 -
 .../setup/pyproject.toml => pyproject.toml    | 18 +++--
 18 files changed, 73 insertions(+), 97 deletions(-)
 rename project/setup/orchestration/Dockerfile => Dockerfile (100%)
 create mode 100644 docker-compose.yml
 rename documents/{supplementary-docs => supplementary-information}/Bad-Words/ListOfDirtyWords.txt (100%)
 rename documents/{supplementary-docs => supplementary-information}/Bad-Words/VIOLENT_TERRORIST_WORDS.txt (100%)
 rename documents/{supplementary-docs => supplementary-information}/Bad-Words/bad_Words_list.txt (100%)
 rename documents/{supplementary-docs => supplementary-information}/Bad-Words/badwords.txt (100%)
 rename documents/{supplementary-docs => supplementary-information}/Bad-Words/citations.md (100%)
 rename documents/{supplementary-docs => supplementary-information}/Bad-Words/cmu-bad-words.txt (100%)
 rename documents/{supplementary-docs => supplementary-information}/Conservative News Domains/CND_s1.txt (100%)
 create mode 100644 documents/supplementary-information/explanation.txt
 delete mode 160000 project/modules/CyberSentinel/web-ui
 delete mode 100644 project/setup/orchestration/docker-compose.yml
 delete mode 100644 project/setup/orchestration/requirements.txt
 delete mode 100644 project/setup/setup.py
 delete mode 100644 project/tests/not_inprogress.txt
 rename project/setup/pyproject.toml => pyproject.toml (52%)

diff --git a/.github/dependabot.yml b/.github/dependabot.yml
index ceba556..d831a2f 100644
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -20,7 +20,7 @@ updates:
       # Check for npm updates at 9am UTC
       time: "09:00"
       timezone: "America/Los_Angeles"
-    target-branch: "v1.3.2"
+    target-branch: "v1.5"
     versioning-strategy: auto
 
   - package-ecosystem: "pip" 
@@ -30,7 +30,7 @@ updates:
       # Check for npm updates at 9am UTC
       time: "09:00"
       timezone: "America/Los_Angeles"
-    target-branch: "v1.3.3"
+    target-branch: "v1.5.1"
     versioning-strategy: auto
 
   
diff --git a/project/setup/orchestration/Dockerfile b/Dockerfile
similarity index 100%
rename from project/setup/orchestration/Dockerfile
rename to Dockerfile
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000..60f5913
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,10 @@
+version: '3.9'
+
+services:
+  opendts:
+    build:
+      context: ./
+      dockerfile: Dockerfile
+      target: builder
+    volumes:
+      - ./:/app
\ No newline at end of file
diff --git a/documents/supplementary-docs/Bad-Words/ListOfDirtyWords.txt b/documents/supplementary-information/Bad-Words/ListOfDirtyWords.txt
similarity index 100%
rename from documents/supplementary-docs/Bad-Words/ListOfDirtyWords.txt
rename to documents/supplementary-information/Bad-Words/ListOfDirtyWords.txt
diff --git a/documents/supplementary-docs/Bad-Words/VIOLENT_TERRORIST_WORDS.txt b/documents/supplementary-information/Bad-Words/VIOLENT_TERRORIST_WORDS.txt
similarity index 100%
rename from documents/supplementary-docs/Bad-Words/VIOLENT_TERRORIST_WORDS.txt
rename to documents/supplementary-information/Bad-Words/VIOLENT_TERRORIST_WORDS.txt
diff --git a/documents/supplementary-docs/Bad-Words/bad_Words_list.txt b/documents/supplementary-information/Bad-Words/bad_Words_list.txt
similarity index 100%
rename from documents/supplementary-docs/Bad-Words/bad_Words_list.txt
rename to documents/supplementary-information/Bad-Words/bad_Words_list.txt
diff --git a/documents/supplementary-docs/Bad-Words/badwords.txt b/documents/supplementary-information/Bad-Words/badwords.txt
similarity index 100%
rename from documents/supplementary-docs/Bad-Words/badwords.txt
rename to documents/supplementary-information/Bad-Words/badwords.txt
diff --git a/documents/supplementary-docs/Bad-Words/citations.md b/documents/supplementary-information/Bad-Words/citations.md
similarity index 100%
rename from documents/supplementary-docs/Bad-Words/citations.md
rename to documents/supplementary-information/Bad-Words/citations.md
diff --git a/documents/supplementary-docs/Bad-Words/cmu-bad-words.txt b/documents/supplementary-information/Bad-Words/cmu-bad-words.txt
similarity index 100%
rename from documents/supplementary-docs/Bad-Words/cmu-bad-words.txt
rename to documents/supplementary-information/Bad-Words/cmu-bad-words.txt
diff --git a/documents/supplementary-docs/Conservative News Domains/CND_s1.txt b/documents/supplementary-information/Conservative News Domains/CND_s1.txt
similarity index 100%
rename from documents/supplementary-docs/Conservative News Domains/CND_s1.txt
rename to documents/supplementary-information/Conservative News Domains/CND_s1.txt
diff --git a/documents/supplementary-information/explanation.txt b/documents/supplementary-information/explanation.txt
new file mode 100644
index 0000000..1665214
--- /dev/null
+++ b/documents/supplementary-information/explanation.txt
@@ -0,0 +1,2 @@
+These subdirectories contain contextual information for the project.
+- The AI may make use of everything inside 'supplementary-information/'
\ No newline at end of file
diff --git a/project/modules/CyberSentinel/preprocess/preprocessing.py b/project/modules/CyberSentinel/preprocess/preprocessing.py
index 3eba7bd..9d4ac11 100644
--- a/project/modules/CyberSentinel/preprocess/preprocessing.py
+++ b/project/modules/CyberSentinel/preprocess/preprocessing.py
@@ -1,9 +1,11 @@
-""" Defines functions for ingesting files, lemmatizes and removeing stop words, and tokenization. """
+""" Defines functions for ingesting files, lemmatizes and removing stop words, and tokenization. """
 
 from dotenv import load_dotenv
 import os
 import re
 import csv
+import chardet
+import logging
 from PyPDF2 import PdfFileReader
 from nltk.stem import WordNetLemmatizer
 from nltk.corpus import stopwords
@@ -21,41 +23,48 @@
 # Load the .env file
 load_dotenv(dotenv_path=env_path)
 
+logging.basicConfig(filename=datetime.now().strftime('preprocessing_%Y%m%d_%H%M%S.log'), level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')  # logging does not expand strftime codes in filename, so format the timestamp here
+
 class Preprocessor:
     def __init__(self):
-        default_temp_path = os.path.dirname(__file__)
         self.input_file_path = os.getenv('INPUT_FILE_PATH')
         self.output_file_path = os.getenv('PREPROCESSED_DATA_FILE_PATH', f"processed_data_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv")
         self.lemmatizer = WordNetLemmatizer()
         self.stop_words = set(stopwords.words('english'))
 
-    def read_pdf(self, file_path: str) -> str:
-        text = ''
+    def read_file(self, file_path: str, reader_func) -> str:
+        result = ''
         if file_path == '.':
-            pdf_files = [f for f in os.listdir() if os.path.isfile(f) and f.lower().endswith('.pdf')]
-            for pdf_file in pdf_files:
-                with open(pdf_file, 'rb') as file:
-                    pdf_reader = PdfFileReader(file)
-                    for page in range(pdf_reader.getNumPages()):
-                        text += pdf_reader.getPage(page).extractText()
+            for root, _, files in os.walk(file_path):
+                for file_name in files:
+                    full_path = os.path.join(root, file_name)
+                    result += self.read_with_detected_encoding(full_path, reader_func)
         else:
-            with open(file_path, 'rb') as file:
-                pdf_reader = PdfFileReader(file)
-                for page in range(pdf_reader.getNumPages()):
-                    text += pdf_reader.getPage(page).extractText()
+            result = self.read_with_detected_encoding(file_path, reader_func)
+        return result
+
+    def read_with_detected_encoding(self, file_path: str, reader_func) -> str:
+        with open(file_path, 'rb') as file:
+            rawdata = file.read()
+            result = chardet.detect(rawdata)
+            encoding = result['encoding']
+            file.seek(0)  # Reset the file pointer to the beginning
+            try:
+                text = reader_func(file, encoding)
+            except Exception as e:
+                logging.warning(f"Failed to process {file_path} with encoding {encoding}: {e}")
+                text = ''
         return text
 
-    def read_txt(self, file_path: str) -> List[str]:
-        lines = []
-        if file_path == '.':
-            txt_files = [f for f in os.listdir() if os.path.isfile(f) and f.lower().endswith('.txt')]
-            for txt_file in txt_files:
-                with open(txt_file, 'r', encoding='utf-8') as file:
-                    lines += file.readlines()
-        else:
-            with open(file_path, 'r', encoding='utf-8') as file:
-                lines = file.readlines()
-        return [line.strip() for line in lines if line.strip()]
+    def read_pdf(self, file, encoding: str) -> str:
+        text = ''
+        pdf_reader = PdfFileReader(file)
+        for page in range(pdf_reader.getNumPages()):
+            text += pdf_reader.getPage(page).extractText()
+        return text
+
+    def read_txt(self, file, encoding: str) -> List[str]:
+        return [line.strip() for line in file.read().decode(encoding).splitlines() if line.strip()]
 
     def preprocess_text_data(self, text: str) -> str:
         # Tokenization
@@ -70,14 +79,23 @@ def preprocess_text_data(self, text: str) -> str:
         tokenized_text = " ".join(tokens)
         return tokenized_text
 
-    def preprocess_txt_implicit_hate_comments(self, comments: List[str]) -> List[str]:
-        return [re.split(r':', comment, maxsplit=2)[-1] for comment in comments]
-
     def load_data(self, file_path: str) -> List[str]:
+        if not os.path.exists(file_path) and file_path != '.':
+            raise ValueError("File path does not exist.")
+        
         if file_path.lower().endswith('.pdf'):
-            raw_data = self.read_pdf(file_path)
+            raw_data = self.read_file(file_path, self.read_pdf)
         elif file_path.lower().endswith('.txt'):
-            raw_data = self.read_txt(file_path)
+            raw_data = self.read_file(file_path, self.read_txt)
+        elif os.path.isdir(file_path):
+            raw_data = ''
+            for root, _, files in os.walk(file_path):
+                for file_name in files:
+                    full_path = os.path.join(root, file_name)
+                    if full_path.lower().endswith('.pdf'):
+                        raw_data += self.read_pdf(full_path)
+                    elif full_path.lower().endswith('.txt'):
+                        raw_data += self.read_txt(full_path)
         else:
             raise ValueError("Unsupported file format.")
 
@@ -98,4 +116,4 @@ def save_processed_data(self, processed_data: List[str], file_type: str = "csv")
     preprocessor = Preprocessor()
     file_path = os.getenv('INPUT_FILE_PATH') or input("Enter the path to the file or '.' to process all PDF and TXT files in the current directory: ")
     processed_data = preprocessor.load_data(file_path)
-    preprocessor.save_processed_data(processed_data)
\ No newline at end of file
+    preprocessor.save_processed_data(processed_data)
diff --git a/project/modules/CyberSentinel/web-ui b/project/modules/CyberSentinel/web-ui
deleted file mode 160000
index 793a768..0000000
--- a/project/modules/CyberSentinel/web-ui
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 793a768d5390f4af3146cb8fd5504c4d7b4e1511
diff --git a/project/setup/orchestration/docker-compose.yml b/project/setup/orchestration/docker-compose.yml
deleted file mode 100644
index 9d5bedf..0000000
--- a/project/setup/orchestration/docker-compose.yml
+++ /dev/null
@@ -1,44 +0,0 @@
-#outdated as of 8-2-23
-version: '3.9'
-
-services:
-  opendts:
-    build:
-      context: ./
-      dockerfile: Dockerfile
-      target: builder
-    volumes:
-      - ./:/app
-
-  threader:
-    build: 
-      context: ./project/modules/AutoThreader
-      dockerfile: Dockerfile
-      target: threader-${BUILD_TYPE} # Configurate your build type (-dev | -release)
-      args:
-        - BUILD_TYPE=release
-    env_file:
-      - .env # Ensure your variables have been set before running 'Docker-Compose'
-    environment:
-      - THREADS_USERNAME=${THREADS_USERNAME}
-      - THREADS_PASSWORD=${THREADS_PASSWORD}
-      - WEBHOOK_SECRET=${THREADS_WEBHOOK_SECRET}
-    volumes:
-      - ./project/modules/AutoThreader:/app
-      - ./project/modules/AutoThreader/docker-compose.yml:/app/docker-compose.yml:ro
-      - ./project/modules/AutoThreader/Dockerfile:/app/Dockerfile:ro
-
-  auto-gpt:
-    build: 
-      context: ./project/modules/AutoThreader
-      dockerfile: Dockerfile
-      target: auto-gpt-${BUILD_TYPE} # Configurate your build type (-dev | -release)
-      args:
-        - BUILD_TYPE=dev
-    env_file:
-      - .env
-    volumes:
-      - ./project/modules/AutoThreader:/app
-      - ./project/modules/AutoThreader/docker-compose.yml:/app/docker-compose.yml:ro
-      - ./project/modules/AutoThreader/Dockerfile:/app/Dockerfile:ro
-    profiles: ["exclude-from-up"]
diff --git a/project/setup/orchestration/requirements.txt b/project/setup/orchestration/requirements.txt
deleted file mode 100644
index 1dc3f15..0000000
--- a/project/setup/orchestration/requirements.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-scikit-learn
-tensorflow
-pandas
-PyPDF2
-nltk
\ No newline at end of file
diff --git a/project/setup/setup.py b/project/setup/setup.py
deleted file mode 100644
index 120de9c..0000000
--- a/project/setup/setup.py
+++ /dev/null
@@ -1,5 +0,0 @@
-import os
-from project.modules.autogpt import main
-from project.modules.AutoThreader.threads_py import ThreadsAPI
-
-data = "../../documents/Database/CyberSentinel_Training-Data"
\ No newline at end of file
diff --git a/project/tests/not_inprogress.txt b/project/tests/not_inprogress.txt
deleted file mode 100644
index eabb290..0000000
--- a/project/tests/not_inprogress.txt
+++ /dev/null
@@ -1 +0,0 @@
-nope.
\ No newline at end of file
diff --git a/project/setup/pyproject.toml b/pyproject.toml
similarity index 52%
rename from project/setup/pyproject.toml
rename to pyproject.toml
index cfc9aae..a0be77e 100644
--- a/project/setup/pyproject.toml
+++ b/pyproject.toml
@@ -4,20 +4,22 @@ build-backend = "poetry.core.masonry.api"
 
 [tool.poetry]
 name = "OpenDTS"
-version = "1.3.2"
-description = "Description"
+version = "1.5.1"
+description = "Open Domestic Threat Scanner - LLM Threat-Intelligence"
 authors = ["Daethyra dev-daethyra@protonmail.com"]
 
 [tool.poetry.dependencies]
-[tool.poetry.dependencies]
-python = "^3.11.3"
+python = ">=3.11.3,<3.12"
+numpy = ">=1.22,<=1.24.3"
 scikit-learn = "^0.24.2"
-tensorflow = "^2.6.0"
-pandas = "^1.3.3"
-python-dotenv = "^0.19.1"
-PyPDF2 = "^2.3.1"
+tensorflow-io-gcs-filesystem = "^0.31.0"
+tensorflow = "^2.13.0"
+pandas = "^1.5.3"
+python-dotenv = "^0.19.2"
+PyPDF2 = "^2.12.1"
 nltk = "^3.8.1"
 
 [tool.poetry.dev-dependencies]
 
+
 [tool.poetry.scripts]

From 1579c96b409ab1fd75aec5de4b086a5cf5fbe22d Mon Sep 17 00:00:00 2001
From: Daemon <109057945+Daethyra@users.noreply.github.com>
Date: Wed, 16 Aug 2023 21:43:50 -0700
Subject: [PATCH 06/17] 	modified:  
 project/modules/CyberSentinel/preprocess/preprocessing.py

---
 .../CyberSentinel/preprocess/preprocessing.py | 20 +++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/project/modules/CyberSentinel/preprocess/preprocessing.py b/project/modules/CyberSentinel/preprocess/preprocessing.py
index 9d4ac11..8448c18 100644
--- a/project/modules/CyberSentinel/preprocess/preprocessing.py
+++ b/project/modules/CyberSentinel/preprocess/preprocessing.py
@@ -12,6 +12,9 @@
 import nltk
 from typing import List
 from datetime import datetime
+from urllib.parse import urlparse
+import urllib.request
+import shutil
 
 # Downloading NLTK resources if not already present
 nltk.download('wordnet')
@@ -32,6 +35,22 @@ def __init__(self):
         self.lemmatizer = WordNetLemmatizer()
         self.stop_words = set(stopwords.words('english'))
 
+    def validate_input_path(self, file_path: str) -> str:
+        # Check if the file path is an HTTPS link
+        parsed_url = urlparse(file_path)
+        if parsed_url.scheme == "https":
+            # Download the file to a temporary location
+            temp_file_path = "temp_file"
+            with urllib.request.urlopen(file_path) as response, open(temp_file_path, 'wb') as out_file:
+                shutil.copyfileobj(response, out_file)
+            return temp_file_path
+
+        # Check if the file path exists, if not create the directories
+        if not os.path.exists(file_path) and file_path != '.':
+            os.makedirs(os.path.dirname(file_path), exist_ok=True)
+
+        return file_path
+
     def read_file(self, file_path: str, reader_func) -> str:
         result = ''
         if file_path == '.':
@@ -80,6 +99,7 @@ def preprocess_text_data(self, text: str) -> str:
         return tokenized_text
 
     def load_data(self, file_path: str) -> List[str]:
+        file_path = self.validate_input_path(file_path)
         if not os.path.exists(file_path) and file_path != '.':
             raise ValueError("File path does not exist.")
         

From f804ace5c7f4923ba2416e5e9b39aec6a3dba34b Mon Sep 17 00:00:00 2001
From: Daemon <109057945+Daethyra@users.noreply.github.com>
Date: Wed, 16 Aug 2023 21:47:38 -0700
Subject: [PATCH 07/17] Updated dependabot

---
 .github/dependabot-misc.yml   | 24 ++++++++++++++
 .github/dependabot-python.yml | 24 ++++++++++++++
 .github/dependabot.yml        | 60 -----------------------------------
 3 files changed, 48 insertions(+), 60 deletions(-)
 create mode 100644 .github/dependabot-misc.yml
 create mode 100644 .github/dependabot-python.yml
 delete mode 100644 .github/dependabot.yml

diff --git a/.github/dependabot-misc.yml b/.github/dependabot-misc.yml
new file mode 100644
index 0000000..e4e964e
--- /dev/null
+++ b/.github/dependabot-misc.yml
@@ -0,0 +1,24 @@
+  # Enable version updates for npm
+  - package-ecosystem: "npm"
+    # Look for `package.json` and `lock` files in the `root` directory
+    directory: "/project/modules/"
+    # Check the npm registry for updates every day (weekdays)
+    schedule:
+      interval: "daily"
+      # Check for npm updates at 09:00 Pacific (America/Los_Angeles)
+      time: "09:00"
+      timezone: "America/Los_Angeles"
+    target-branch: "master"
+    versioning-strategy: auto
+
+  # Enable version updates for Docker
+  - package-ecosystem: "docker"
+    # Look for a `Dockerfile` in the `root` directory
+    directory: "/"
+    # Check for updates once a week
+    schedule:
+      interval: "daily"
+      # Check for Docker updates at 09:00 Pacific (America/Los_Angeles)
+      time: "09:00"
+      timezone: "America/Los_Angeles"
+    target-branch: "master"
\ No newline at end of file
diff --git a/.github/dependabot-python.yml b/.github/dependabot-python.yml
new file mode 100644
index 0000000..1e6da62
--- /dev/null
+++ b/.github/dependabot-python.yml
@@ -0,0 +1,24 @@
+# Please see the documentation for all configuration options:
+# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
+version: 2
+updates:
+  # Enable version updates for multiple branches
+  - package-ecosystem: "pip" 
+    directory: "/"
+    schedule:
+      interval: "daily"
+      # Check for pip updates at 9am UTC
+      time: "09:00"
+      timezone: "America/Los_Angeles"
+    target-branch: "master"
+    versioning-strategy: auto
+    
+  - package-ecosystem: "pip" 
+    directory: "/"
+    schedule:
+      interval: "daily"
+      # Check for pip updates at 9am UTC
+      time: "09:00"
+      timezone: "America/Los_Angeles"
+    target-branch: "v1.5*"
+    versioning-strategy: auto
\ No newline at end of file
diff --git a/.github/dependabot.yml b/.github/dependabot.yml
deleted file mode 100644
index d831a2f..0000000
--- a/.github/dependabot.yml
+++ /dev/null
@@ -1,60 +0,0 @@
-# Please see the documentation for all configuration options:
-# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
-version: 2
-updates:
-  # Enable version updates for multiple branches
-  - package-ecosystem: "pip" 
-    directory: "/"
-    schedule:
-      interval: "daily"
-      # Check for npm updates at 9am UTC
-      time: "09:00"
-      timezone: "America/Los_Angeles"
-    target-branch: "master"
-    versioning-strategy: auto
-    
-  - package-ecosystem: "pip" 
-    directory: "/"
-    schedule:
-      interval: "daily"
-      # Check for npm updates at 9am UTC
-      time: "09:00"
-      timezone: "America/Los_Angeles"
-    target-branch: "v1.5"
-    versioning-strategy: auto
-
-  - package-ecosystem: "pip" 
-    directory: "/"
-    schedule:
-      interval: "daily"
-      # Check for npm updates at 9am UTC
-      time: "09:00"
-      timezone: "America/Los_Angeles"
-    target-branch: "v1.5.1"
-    versioning-strategy: auto
-
-  
-  # Enable version updates for npm
-  - package-ecosystem: "npm"
-    # Look for `package.json` and `lock` files in the `root` directory
-    directory: "/"
-    # Check the npm registry for updates every day (weekdays)
-    schedule:
-      interval: "daily"
-      # Check for npm updates at 9am UTC
-      time: "09:00"
-      timezone: "America/Los_Angeles"
-    target-branch: "master"
-    versioning-strategy: auto
-
-  # Enable version updates for Docker
-  - package-ecosystem: "docker"
-    # Look for a `Dockerfile` in the `root` directory
-    directory: "/"
-    # Check for updates once a week
-    schedule:
-      interval: "daily"
-      # Check for npm updates at 9am UTC
-      time: "09:00"
-      timezone: "America/Los_Angeles"
-    target-branch: "master"

From c4bf5533a114b61cb6a2161a73dc262568789385 Mon Sep 17 00:00:00 2001
From: Daemon <109057945+Daethyra@users.noreply.github.com>
Date: Wed, 16 Aug 2023 21:52:27 -0700
Subject: [PATCH 08/17] 	modified:   .github/workflows/docker-ci.yml 
 modified:   .github/workflows/greetings.yml

---
 .github/workflows/docker-ci.yml | 2 +-
 .github/workflows/greetings.yml | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/docker-ci.yml b/.github/workflows/docker-ci.yml
index 271afa1..f4b7dce 100644
--- a/.github/workflows/docker-ci.yml
+++ b/.github/workflows/docker-ci.yml
@@ -37,7 +37,7 @@ jobs:
       name: Build image
       uses: docker/build-push-action@v3
       with:
-        context: ./project/setup/orchestration/
+        context: ./
         build-args: BUILD_TYPE=${{ matrix.build-type }}
         tags: ${{ env.IMAGE_NAME }}
         load: true    # save to docker images
diff --git a/.github/workflows/greetings.yml b/.github/workflows/greetings.yml
index 4677434..c1dfd39 100644
--- a/.github/workflows/greetings.yml
+++ b/.github/workflows/greetings.yml
@@ -12,5 +12,5 @@ jobs:
     - uses: actions/first-interaction@v1
       with:
         repo-token: ${{ secrets.GITHUB_TOKEN }}
-        issue-message: "Message that will be displayed on users' first issue"
-        pr-message: "Message that will be displayed on users' first pull request"
+        issue-message: "Thanks for chippin' in, choom. We'll have this looked at right away."
+        pr-message: "OK! Big legend! Nice job, can't wait to see your work."

From 7e6580ed4e5a7e68fbb2b023217bdc8d30dedb2b Mon Sep 17 00:00:00 2001
From: Daemon <109057945+Daethyra@users.noreply.github.com>
Date: Wed, 16 Aug 2023 21:57:13 -0700
Subject: [PATCH 09/17] 	modified:   .github/workflows/docker-ci.yml 
 renamed:    Dockerfile -> project/setup/orchestration/Dockerfile 
 renamed:    docker-compose.yml ->
 project/setup/orchestration/docker-compose.yml 	new file:  
 project/setup/orchestration/requirements.txt

---
 .github/workflows/docker-ci.yml                          | 2 +-
 Dockerfile => project/setup/orchestration/Dockerfile     | 0
 .../setup/orchestration/docker-compose.yml               | 0
 project/setup/orchestration/requirements.txt             | 9 +++++++++
 4 files changed, 10 insertions(+), 1 deletion(-)
 rename Dockerfile => project/setup/orchestration/Dockerfile (100%)
 rename docker-compose.yml => project/setup/orchestration/docker-compose.yml (100%)
 create mode 100644 project/setup/orchestration/requirements.txt

diff --git a/.github/workflows/docker-ci.yml b/.github/workflows/docker-ci.yml
index f4b7dce..271afa1 100644
--- a/.github/workflows/docker-ci.yml
+++ b/.github/workflows/docker-ci.yml
@@ -37,7 +37,7 @@ jobs:
       name: Build image
       uses: docker/build-push-action@v3
       with:
-        context: ./
+        context: ./project/setup/orchestration/
         build-args: BUILD_TYPE=${{ matrix.build-type }}
         tags: ${{ env.IMAGE_NAME }}
         load: true    # save to docker images
diff --git a/Dockerfile b/project/setup/orchestration/Dockerfile
similarity index 100%
rename from Dockerfile
rename to project/setup/orchestration/Dockerfile
diff --git a/docker-compose.yml b/project/setup/orchestration/docker-compose.yml
similarity index 100%
rename from docker-compose.yml
rename to project/setup/orchestration/docker-compose.yml
diff --git a/project/setup/orchestration/requirements.txt b/project/setup/orchestration/requirements.txt
new file mode 100644
index 0000000..969b626
--- /dev/null
+++ b/project/setup/orchestration/requirements.txt
@@ -0,0 +1,9 @@
+python>=3.11.3,<3.12
+numpy==1.24.3
+scikit-learn==0.24.2
+tensorflow-io-gcs-filesystem==0.31.0
+tensorflow==2.13.0
+pandas==1.5.3
+python-dotenv==0.19.2
+PyPDF2==2.12.1
+nltk==3.8.1

From cb985588d47afc6a7d5110ef5d6147c68a0b2ecd Mon Sep 17 00:00:00 2001
From: Daemon <109057945+Daethyra@users.noreply.github.com>
Date: Wed, 16 Aug 2023 22:05:03 -0700
Subject: [PATCH 10/17] 	modified:  
 project/setup/orchestration/requirements.txt

---
 project/setup/orchestration/requirements.txt | 1 -
 1 file changed, 1 deletion(-)

diff --git a/project/setup/orchestration/requirements.txt b/project/setup/orchestration/requirements.txt
index 969b626..2e9aff9 100644
--- a/project/setup/orchestration/requirements.txt
+++ b/project/setup/orchestration/requirements.txt
@@ -1,4 +1,3 @@
-python>=3.11.3,<3.12
 numpy==1.24.3
 scikit-learn==0.24.2
 tensorflow-io-gcs-filesystem==0.31.0

From 77a71e1264946b539b0b3e6d32b0a8f3fb435166 Mon Sep 17 00:00:00 2001
From: Daemon <109057945+Daethyra@users.noreply.github.com>
Date: Wed, 16 Aug 2023 22:08:03 -0700
Subject: [PATCH 11/17] 	modified:  
 project/setup/orchestration/docker-compose.yml

---
 project/setup/orchestration/docker-compose.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/project/setup/orchestration/docker-compose.yml b/project/setup/orchestration/docker-compose.yml
index 60f5913..38e4354 100644
--- a/project/setup/orchestration/docker-compose.yml
+++ b/project/setup/orchestration/docker-compose.yml
@@ -3,8 +3,8 @@ version: '3.9'
 services:
   opendts:
     build:
-      context: ./
+      context: ./project/setup/orchestration
       dockerfile: Dockerfile
       target: builder
     volumes:
-      - ./:/app
\ No newline at end of file
+      - ./:/app

From 8b41a95e06f64355173a529cf389ac026b2c76d9 Mon Sep 17 00:00:00 2001
From: Daemon <109057945+Daethyra@users.noreply.github.com>
Date: Thu, 17 Aug 2023 20:27:09 -0700
Subject: [PATCH 12/17] 	deleted:    .github/workflows/.gitignore 
 deleted:    documents/todo.txt 	modified:  
 project/modules/CyberSentinel/.env.template 	modified:  
 project/modules/CyberSentinel/preprocess/preprocessing.py 	new file:  
 project/modules/CyberSentinel/utilities/logging.py 	modified:  
 project/setup/orchestration/requirements.txt 	modified:   pyproject.toml

---
 .github/workflows/.gitignore                  |  4 ---
 documents/todo.txt                            | 35 -------------------
 project/modules/CyberSentinel/.env.template   |  2 ++
 .../CyberSentinel/preprocess/preprocessing.py | 16 +++++----
 .../CyberSentinel/utilities/logging.py        | 10 ++++++
 project/setup/orchestration/requirements.txt  | 18 +++++-----
 pyproject.toml                                | 20 ++++++-----
 7 files changed, 44 insertions(+), 61 deletions(-)
 delete mode 100644 .github/workflows/.gitignore
 delete mode 100644 documents/todo.txt
 create mode 100644 project/modules/CyberSentinel/utilities/logging.py

diff --git a/.github/workflows/.gitignore b/.github/workflows/.gitignore
deleted file mode 100644
index 5bde4d8..0000000
--- a/.github/workflows/.gitignore
+++ /dev/null
@@ -1,4 +0,0 @@
-venv
-.env
-__pycache__
-logs*
\ No newline at end of file
diff --git a/documents/todo.txt b/documents/todo.txt
deleted file mode 100644
index 2d7fcfd..0000000
--- a/documents/todo.txt
+++ /dev/null
@@ -1,35 +0,0 @@
-1. combine multi-hooker and multi extension converter
-
-2. automate the creation and storage of Ada002's embeddings
-
-3. create langchain, Chain, for dynamic Chain generation from user
-# I am creating a natural language processor that helps people understand their enterprise's threat-landscape by providing step by step guidance for vulnerability reconaissance and remediation. 
-3a. Create Chain to exclude certain groups, only include certain groups
-Only right-wing extremist groups are eligible for data processing.
-
-4. Implement Chain for cybersecurity tools like https://nmap.org/download, https://www.whois.com/whois, 
-
-5. train binary classification model (intended-violence)
-a. Collect better training data. Research hateful comments online that imply the intention to commit violence. Then, find examples that are sarcastic to train against creating False Positives further down the line
-b. complete preprocessor: requires[sanitization, cleaning, standardization, tokenization, splitting,]
-c. revamp or upgrade embeddings.py module
-d. revamp or upgrade model_creator.py module
-e. test and evaluate CyberSentinel using techniques I learned from Google courses(https://developers.google.com/machine-learning/crash-course/classification/video-lecture)
-
-6(2a). create similarity query functionality against Pinecone index
-6a(2b). create langchain for queries
-
-7. Integrate CyberSentinel with OpenDTS' langchain features
-# Crucial questions:
-- What will orchestrate the langchain Chain calls? What LLM? Or, just how!?
-- How will the model be used? In what circumstances is it appropriate for the Master Agent to decide to run something against the CyberSentinel classification model? In what circumstances should I use similarity queries against Pinecone?
-
-8. Study AutoGPT's web browser modules; see how adaptable/useful it'd be for this project
-
-
-0x1. Update breakdown.md file
-0x2. Update readme.md file
-000x1. Create Dockerfile
-000x2. Create compose.yml
-000x3. Automate SetUp process by writing system-level commands in setup.py
-000x3a. AND provide detail manual installation guidance in the ReadMe.md
\ No newline at end of file
diff --git a/project/modules/CyberSentinel/.env.template b/project/modules/CyberSentinel/.env.template
index fd923b4..4389573 100644
--- a/project/modules/CyberSentinel/.env.template
+++ b/project/modules/CyberSentinel/.env.template
@@ -1,3 +1,5 @@
+# Create a copy of this file and name it '.env'
+
 # Model Training Configuration
 TRAINING_DATA_PATH=project/modules/CyberSentinel/training-data/
 LEARNING_RATE=0.001
diff --git a/project/modules/CyberSentinel/preprocess/preprocessing.py b/project/modules/CyberSentinel/preprocess/preprocessing.py
index 8448c18..e16207c 100644
--- a/project/modules/CyberSentinel/preprocess/preprocessing.py
+++ b/project/modules/CyberSentinel/preprocess/preprocessing.py
@@ -6,6 +6,7 @@
 import csv
 import chardet
 import logging
+from ..utilities.logging import *
 from PyPDF2 import PdfFileReader
 from nltk.stem import WordNetLemmatizer
 from nltk.corpus import stopwords
@@ -21,13 +22,11 @@
 nltk.download('stopwords')
 
 # Point to the location of the .env file relative to the script's location
-env_path = os.path.join(os.path.dirname(__file__), '../../../.env')
+env_path = os.path.join(os.path.dirname(__file__), '../.env')
 
 # Load the .env file
 load_dotenv(dotenv_path=env_path)
 
-logging.basicConfig(filename='preprocessing_%Y%m%d_%H%M%S.log', level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
-
 class Preprocessor:
     def __init__(self):
         self.input_file_path = os.getenv('INPUT_FILE_PATH')
@@ -35,7 +34,10 @@ def __init__(self):
         self.lemmatizer = WordNetLemmatizer()
         self.stop_words = set(stopwords.words('english'))
 
-    def validate_input_path(self, file_path: str) -> str:
+    def validate_input_path(self, file_path: str = None) -> str:
+        if file_path is None:
+            file_path = '../training-data'  # Default path to target files in the training-data directory
+
         # Check if the file path is an HTTPS link
         parsed_url = urlparse(file_path)
         if parsed_url.scheme == "https":
@@ -75,11 +77,13 @@ def read_with_detected_encoding(self, file_path: str, reader_func) -> str:
                 text = ''
         return text
 
-    def read_pdf(self, file, encoding: str) -> str:
+    def read_pdf(self, file) -> str:
         text = ''
         pdf_reader = PdfFileReader(file)
         for page in range(pdf_reader.getNumPages()):
-            text += pdf_reader.getPage(page).extractText()
+            page_text = pdf_reader.getPage(page).extractText()
+            encoding = chardet.detect(page_text.encode())['encoding']
+            text += page_text.decode(encoding)
         return text
 
     def read_txt(self, file, encoding: str) -> List[str]:
diff --git a/project/modules/CyberSentinel/utilities/logging.py b/project/modules/CyberSentinel/utilities/logging.py
new file mode 100644
index 0000000..0c98a81
--- /dev/null
+++ b/project/modules/CyberSentinel/utilities/logging.py
@@ -0,0 +1,10 @@
+import logging
+from datetime import datetime
+
+# Format the current datetime
+current_time = datetime.now().strftime("%d%m%Y_%H%M%S")
+
+# Concatenate the datetime with the log filename
+log_filename = f'preprocessing{current_time}.log'
+
+logging.basicConfig(filename=log_filename, level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
diff --git a/project/setup/orchestration/requirements.txt b/project/setup/orchestration/requirements.txt
index 2e9aff9..9ea8540 100644
--- a/project/setup/orchestration/requirements.txt
+++ b/project/setup/orchestration/requirements.txt
@@ -1,8 +1,10 @@
-numpy==1.24.3
-scikit-learn==0.24.2
-tensorflow-io-gcs-filesystem==0.31.0
-tensorflow==2.13.0
-pandas==1.5.3
-python-dotenv==0.19.2
-PyPDF2==2.12.1
-nltk==3.8.1
+cython
+numpy
+chardet
+scikit-learn
+tensorflow-io-gcs-filesystem
+tensorflow
+pandas
+python-dotenv
+PyPDF2
+nltk
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index a0be77e..9aa39b6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,5 +1,5 @@
 [build-system]
-requires = ["poetry-core>=1.0.0"]
+requires = ["poetry-core>=1.6.1"]
 build-backend = "poetry.core.masonry.api"
 
 [tool.poetry]
@@ -9,14 +9,18 @@ description = "Open Domestic Threat Scanner - LLM Threat-Intelligence"
 authors = ["Daethyra dev-daethyra@protonmail.com"]
 
 [tool.poetry.dependencies]
-python = ">=3.11.3,<3.12"
-numpy = ">=1.22,<=1.24.3"
-scikit-learn = "^0.24.2"
-tensorflow-io-gcs-filesystem = "^0.31.0"
+python = ">3.9,<3.12.0"
+wheel = "^0.41.1"
+chardet = "^5.2.0"
+setuptools = "^60.0"
+numpy = "^1.25.2"
+cython = "^3.0.0"
+scikit-learn = "^1.3.0"
+tensorflow-io-gcs-filesystem = "^0.33.0"
 tensorflow = "^2.13.0"
-pandas = "^1.5.3"
-python-dotenv = "^0.19.2"
-PyPDF2 = "^2.12.1"
+pandas = "^2.0.3"
+python-dotenv = "^1.0.0"
+PyPDF2 = "^3.15.0"
 nltk = "^3.8.1"
 
 [tool.poetry.dev-dependencies]

From 1a34c99764e0c952c7c8af4a1dc4ceabe5995076 Mon Sep 17 00:00:00 2001
From: Daemon <109057945+Daethyra@users.noreply.github.com>
Date: Thu, 17 Aug 2023 20:35:22 -0700
Subject: [PATCH 13/17] 	modified:  
 project/modules/CyberSentinel/preprocess/preprocessing.py

---
 .../CyberSentinel/preprocess/preprocessing.py | 26 ++++++++++---------
 1 file changed, 14 insertions(+), 12 deletions(-)

diff --git a/project/modules/CyberSentinel/preprocess/preprocessing.py b/project/modules/CyberSentinel/preprocess/preprocessing.py
index e16207c..626ee3a 100644
--- a/project/modules/CyberSentinel/preprocess/preprocessing.py
+++ b/project/modules/CyberSentinel/preprocess/preprocessing.py
@@ -1,21 +1,23 @@
 """ Defines functions for ingesting files, lemmatizes and removing stop words, and tokenization. """
 
-from dotenv import load_dotenv
+import csv
 import os
 import re
-import csv
-import chardet
-import logging
-from ..utilities.logging import *
-from PyPDF2 import PdfFileReader
-from nltk.stem import WordNetLemmatizer
-from nltk.corpus import stopwords
-import nltk
-from typing import List
+import shutil
+import urllib.request
 from datetime import datetime
+from typing import List
 from urllib.parse import urlparse
-import urllib.request
-import shutil
+
+import chardet
+import nltk
+from dotenv import load_dotenv
+from nltk.corpus import stopwords
+from nltk.stem import WordNetLemmatizer
+from PyPDF2 import PdfFileReader
+
+from utilities.logging import *
+
 
 # Downloading NLTK resources if not already present
 nltk.download('wordnet')

From 12d7bd05952f998992fe2d4e1c8523c8ac62ae4f Mon Sep 17 00:00:00 2001
From: Daemon <109057945+Daethyra@users.noreply.github.com>
Date: Fri, 18 Aug 2023 17:55:59 -0700
Subject: [PATCH 14/17] Moved orchestration folder to the modules directory

---
 project/{setup => modules}/orchestration/Dockerfile         | 0
 project/{setup => modules}/orchestration/docker-compose.yml | 0
 project/{setup => modules}/orchestration/requirements.txt   | 0
 3 files changed, 0 insertions(+), 0 deletions(-)
 rename project/{setup => modules}/orchestration/Dockerfile (100%)
 rename project/{setup => modules}/orchestration/docker-compose.yml (100%)
 rename project/{setup => modules}/orchestration/requirements.txt (100%)

diff --git a/project/setup/orchestration/Dockerfile b/project/modules/orchestration/Dockerfile
similarity index 100%
rename from project/setup/orchestration/Dockerfile
rename to project/modules/orchestration/Dockerfile
diff --git a/project/setup/orchestration/docker-compose.yml b/project/modules/orchestration/docker-compose.yml
similarity index 100%
rename from project/setup/orchestration/docker-compose.yml
rename to project/modules/orchestration/docker-compose.yml
diff --git a/project/setup/orchestration/requirements.txt b/project/modules/orchestration/requirements.txt
similarity index 100%
rename from project/setup/orchestration/requirements.txt
rename to project/modules/orchestration/requirements.txt

From cfdbfaf14efd0e417ead74c366f6956c808301b5 Mon Sep 17 00:00:00 2001
From: Daemon <109057945+Daethyra@users.noreply.github.com>
Date: Fri, 18 Aug 2023 21:37:04 -0700
Subject: [PATCH 15/17] - Moved documents folder to the project directory. -
 Updated dependabot files

---
 .github/dependabot-misc.yml                                   | 4 ++--
 .github/dependabot-python.yml                                 | 4 ++--
 {documents => project/documents}/LICENSE                      | 0
 .../supplementary-information/Bad-Words/ListOfDirtyWords.txt  | 0
 .../Bad-Words/VIOLENT_TERRORIST_WORDS.txt                     | 0
 .../supplementary-information/Bad-Words/bad_Words_list.txt    | 0
 .../supplementary-information/Bad-Words/badwords.txt          | 0
 .../supplementary-information/Bad-Words/citations.md          | 0
 .../supplementary-information/Bad-Words/cmu-bad-words.txt     | 0
 .../Conservative News Domains/CND_s1.txt                      | 0
 .../documents}/supplementary-information/explanation.txt      | 0
 11 files changed, 4 insertions(+), 4 deletions(-)
 rename {documents => project/documents}/LICENSE (100%)
 rename {documents => project/documents}/supplementary-information/Bad-Words/ListOfDirtyWords.txt (100%)
 rename {documents => project/documents}/supplementary-information/Bad-Words/VIOLENT_TERRORIST_WORDS.txt (100%)
 rename {documents => project/documents}/supplementary-information/Bad-Words/bad_Words_list.txt (100%)
 rename {documents => project/documents}/supplementary-information/Bad-Words/badwords.txt (100%)
 rename {documents => project/documents}/supplementary-information/Bad-Words/citations.md (100%)
 rename {documents => project/documents}/supplementary-information/Bad-Words/cmu-bad-words.txt (100%)
 rename {documents => project/documents}/supplementary-information/Conservative News Domains/CND_s1.txt (100%)
 rename {documents => project/documents}/supplementary-information/explanation.txt (100%)

diff --git a/.github/dependabot-misc.yml b/.github/dependabot-misc.yml
index e4e964e..f14bbfe 100644
--- a/.github/dependabot-misc.yml
+++ b/.github/dependabot-misc.yml
@@ -1,7 +1,7 @@
   # Enable version updates for npm
   - package-ecosystem: "npm"
     # Look for `package.json` and `lock` files in the `root` directory
-    directory: "/project/modules/"
+    directory: "/project/modules/web-ui" # MUST BE UPDATED UPON TYPESCRIPT PROGRAMMING
     # Check the npm registry for updates every day (weekdays)
     schedule:
       interval: "daily"
@@ -14,7 +14,7 @@
   # Enable version updates for Docker
   - package-ecosystem: "docker"
     # Look for a `Dockerfile` in the `root` directory
-    directory: "/"
+    directory: "/project/modules/orchestration"
     # Check for updates once a week
     schedule:
       interval: "daily"
diff --git a/.github/dependabot-python.yml b/.github/dependabot-python.yml
index 1e6da62..3107818 100644
--- a/.github/dependabot-python.yml
+++ b/.github/dependabot-python.yml
@@ -4,7 +4,7 @@ version: 2
 updates:
   # Enable version updates for multiple branches
   - package-ecosystem: "pip" 
-    directory: "/"
+    directory: "project/modules/orchestration"
     schedule:
       interval: "daily"
       # Check for pip updates at 9am UTC
@@ -14,7 +14,7 @@ updates:
     versioning-strategy: auto
     
   - package-ecosystem: "pip" 
-    directory: "/"
+    directory: "project/modules/orchestration"
     schedule:
       interval: "daily"
       # Check for pip updates at 9am UTC
diff --git a/documents/LICENSE b/project/documents/LICENSE
similarity index 100%
rename from documents/LICENSE
rename to project/documents/LICENSE
diff --git a/documents/supplementary-information/Bad-Words/ListOfDirtyWords.txt b/project/documents/supplementary-information/Bad-Words/ListOfDirtyWords.txt
similarity index 100%
rename from documents/supplementary-information/Bad-Words/ListOfDirtyWords.txt
rename to project/documents/supplementary-information/Bad-Words/ListOfDirtyWords.txt
diff --git a/documents/supplementary-information/Bad-Words/VIOLENT_TERRORIST_WORDS.txt b/project/documents/supplementary-information/Bad-Words/VIOLENT_TERRORIST_WORDS.txt
similarity index 100%
rename from documents/supplementary-information/Bad-Words/VIOLENT_TERRORIST_WORDS.txt
rename to project/documents/supplementary-information/Bad-Words/VIOLENT_TERRORIST_WORDS.txt
diff --git a/documents/supplementary-information/Bad-Words/bad_Words_list.txt b/project/documents/supplementary-information/Bad-Words/bad_Words_list.txt
similarity index 100%
rename from documents/supplementary-information/Bad-Words/bad_Words_list.txt
rename to project/documents/supplementary-information/Bad-Words/bad_Words_list.txt
diff --git a/documents/supplementary-information/Bad-Words/badwords.txt b/project/documents/supplementary-information/Bad-Words/badwords.txt
similarity index 100%
rename from documents/supplementary-information/Bad-Words/badwords.txt
rename to project/documents/supplementary-information/Bad-Words/badwords.txt
diff --git a/documents/supplementary-information/Bad-Words/citations.md b/project/documents/supplementary-information/Bad-Words/citations.md
similarity index 100%
rename from documents/supplementary-information/Bad-Words/citations.md
rename to project/documents/supplementary-information/Bad-Words/citations.md
diff --git a/documents/supplementary-information/Bad-Words/cmu-bad-words.txt b/project/documents/supplementary-information/Bad-Words/cmu-bad-words.txt
similarity index 100%
rename from documents/supplementary-information/Bad-Words/cmu-bad-words.txt
rename to project/documents/supplementary-information/Bad-Words/cmu-bad-words.txt
diff --git a/documents/supplementary-information/Conservative News Domains/CND_s1.txt b/project/documents/supplementary-information/Conservative News Domains/CND_s1.txt
similarity index 100%
rename from documents/supplementary-information/Conservative News Domains/CND_s1.txt
rename to project/documents/supplementary-information/Conservative News Domains/CND_s1.txt
diff --git a/documents/supplementary-information/explanation.txt b/project/documents/supplementary-information/explanation.txt
similarity index 100%
rename from documents/supplementary-information/explanation.txt
rename to project/documents/supplementary-information/explanation.txt

From 6b5e485421ccb7b359d8f671453730de00676491 Mon Sep 17 00:00:00 2001
From: Daemon <109057945+Daethyra@users.noreply.github.com>
Date: Fri, 18 Aug 2023 22:13:43 -0700
Subject: [PATCH 16/17] 	Fixed:   pyproject.toml

---
 pyproject.toml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 9aa39b6..8ae705a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -17,10 +17,10 @@ numpy = "^1.25.2"
 cython = "^3.0.0"
 scikit-learn = "^1.3.0"
 tensorflow-io-gcs-filesystem = "^0.33.0"
-tensorflow = "^2.13.0"
+tensorflow = "^2.14.0rc0"
 pandas = "^2.0.3"
 python-dotenv = "^1.0.0"
-PyPDF2 = "^3.15.0"
+PyPDF2 = "^2.12.1"
 nltk = "^3.8.1"
 
 [tool.poetry.dev-dependencies]

From d24d029b887c49ff10dbd455abb0945bdc8647ec Mon Sep 17 00:00:00 2001
From: Daemon <109057945+Daethyra@users.noreply.github.com>
Date: Sat, 19 Aug 2023 08:18:09 -0700
Subject: [PATCH 17/17] Update readme.md

Removed old dev thoughts
---
 readme.md | 13 +------------
 1 file changed, 1 insertion(+), 12 deletions(-)

diff --git a/readme.md b/readme.md
index 9d250a8..08a4ee1 100644
--- a/readme.md
+++ b/readme.md
@@ -4,15 +4,4 @@ TLDR: This project uses a multi-layered approach to predict whether or not someo
 
 OpenDTS (Domestic Threat Scanner) is a project intended to provide analytical insight into hostile sentiment towards a given demographic experiencing discrimination.
 
-Users may create profiles to help them track sentiment towards one group.
-
----
-
-## Developer's Thoughts
-
-The application will do its best to accurately discern true intent to harm versus sarcasm, frustration without the intent to harm, and will make key decisions on multiple different layers comprised of entire models. Like the Ada-002 from OpenAI, for example.
-
-Ideally, the first actual deployment will be a heatmap serving real-time processing data for whatever I choose.
-
-- OpenDTS is the manifestation of my attempt to help others protect themselves
-- I want it to be a comprehensive open-source cyberspace threat-intelligence platform
+Users may create profiles to help them track sentiment towards one group.
\ No newline at end of file