From 22f04ed8de376640909bf9b9a11762ff4aea82c4 Mon Sep 17 00:00:00 2001
From: "yuliia.t"
Date: Mon, 5 Feb 2024 20:53:27 +0200
Subject: [PATCH 1/3] Create script for adding and downloading new repos for labeling.

---
 dataset_extension/__init__.py |  0
 dataset_extension/main.py     | 87 +++++++++++++++++++++++++++++++++++
 2 files changed, 87 insertions(+)
 create mode 100644 dataset_extension/__init__.py
 create mode 100644 dataset_extension/main.py

diff --git a/dataset_extension/__init__.py b/dataset_extension/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/dataset_extension/main.py b/dataset_extension/main.py
new file mode 100644
index 000000000..4e544002d
--- /dev/null
+++ b/dataset_extension/main.py
@@ -0,0 +1,87 @@
+import base64
+import random
+import string
+import subprocess
+from argparse import ArgumentParser
+import logging
+import pathlib
+
+import yaml
+
+logging.basicConfig(
+    format="%(asctime)s | %(levelname)s | %(filename)s:%(lineno)s | %(message)s",
+    level="INFO")
+logger = logging.getLogger(__file__)
+
+current_path = pathlib.Path(__file__).parent.absolute()
+project_path = current_path.parent.absolute()
+
+temp_dir = current_path / 'tmp'
+
+
+def load_ids():
+    snapshot_file = "snapshot.yaml"
+    with open(project_path / snapshot_file, encoding="utf-8") as f:
+        snapshot_data = yaml.load(f, Loader=yaml.FullLoader)
+    return [data['id'] for data in snapshot_data]
+
+
+existing_ids = load_ids()
+
+
+def save_to_yaml(repos_dict, yaml_file):
+    pathlib.Path(yaml_file).parent.mkdir(parents=True, exist_ok=True)
+    with open(yaml_file, "w", encoding="utf-8") as f:
+        yaml.dump(repos_dict, f)
+
+
+def generate_unique_id():
+    while True:
+        unique_id = ''.join(random.choice(string.printable) for _ in range(6))
+        encoded_id = base64.b64encode(unique_id.encode()).decode('utf-8')
+        if encoded_id not in existing_ids:
+            existing_ids.append(encoded_id)
+            return encoded_id
+
+
+def download(repo_url):
+    logger.info(f"Download {repo_url}")
+    repo_url = repo_url.strip()
+    ownername, reponame = repo_url.split("/")[-2:]
+    reponame = reponame.split(".")[0]
+    pathlib.Path(f"{temp_dir}/{ownername}").mkdir(parents=True, exist_ok=True)
+    try:
+        subprocess.check_call(['git', 'clone', repo_url, f"{temp_dir}/{ownername}/{reponame}"])
+        commit_sha = (subprocess.check_output(['git', '-C', f'{temp_dir}/{ownername}/{reponame}', 'rev-parse', 'HEAD'])
+                      .decode('ascii').strip())
+        try:
+            tag = subprocess.check_output(['git', '-C', f'{temp_dir}/{ownername}/{reponame}', 'describe', '--long',
+                                           '--dirty', '--tags']).decode('ascii').strip()
+        except subprocess.CalledProcessError:
+            tag = None
+        id = generate_unique_id()
+        logger.info(f"Downloaded {repo_url} {commit_sha}")
+        return {'id': id, 'url': repo_url, 'sha': commit_sha, 'tag': tag}
+    except subprocess.CalledProcessError as e:
+        logger.error(f"Couldn't download repo {temp_dir}/{ownername}/{reponame}. {e}")
+
+
+def download_repos(input_repo_file):
+    with open(input_repo_file, 'r', encoding='utf-8') as file:
+        lines = file.readlines()
+    downloaded_repos = []
+    for line in lines:
+        repo = download(line)
+        if repo:
+            downloaded_repos.append(repo)
+    return downloaded_repos
+
+
+if __name__ == "__main__":
+    parser = ArgumentParser()
+    parser.add_argument("--input_repo_file", dest="input_repo_file", required=True,
+                        help="File with list of GitHub repos to be analyzed")
+    args = parser.parse_args()
+    logger.info("Start download")
+    repos = download_repos(args.input_repo_file)
+    save_to_yaml(repos, current_path / 'result' / 'repos.yaml')
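
Usage sketch for the script added above, assuming a plain-text input file with
one git clone URL per line (the file name repos.txt is illustrative):

    python dataset_extension/main.py --input_repo_file repos.txt

Each repo that clones successfully is recorded in result/repos.yaml as a dict
of the form {'id': <base64 id>, 'url': <clone URL>, 'sha': <HEAD commit>,
'tag': <git describe output, or None if the repo has no tags>}.
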
{e}") + + +def download_repos(input_repo_file): + with open(input_repo_file, 'r', encoding='utf-8') as file: + lines = file.readlines() + downloaded_repos = [] + for line in lines: + repo = download(line) + if repo: + downloaded_repos.append(repo) + return downloaded_repos + + +if __name__ == "__main__": + parser = ArgumentParser() + parser.add_argument("--input_repo_file", dest="input_repo_file", required=True, + help="File with list of GitHub repos to be analyzed") + args = parser.parse_args() + logger.info("Start download") + repos = download_repos(args.input_repo_file) + save_to_yaml(repos, current_path / 'result' / 'repos.yaml') From 6ecf52847d6ba45f3e86cf703ffb33ebf493d179 Mon Sep 17 00:00:00 2001 From: "yuliia.t" Date: Mon, 5 Feb 2024 21:38:45 +0200 Subject: [PATCH 2/3] Change tag value --- dataset_extension/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dataset_extension/main.py b/dataset_extension/main.py index 4e544002d..40869cdb3 100644 --- a/dataset_extension/main.py +++ b/dataset_extension/main.py @@ -58,7 +58,7 @@ def download(repo_url): tag = subprocess.check_output(['git', '-C', f'{temp_dir}/{ownername}/{reponame}', 'describe', '--long', '--dirty', '--tags']).decode('ascii').strip() except subprocess.CalledProcessError: - tag = None + tag = 'None' id = generate_unique_id() logger.info(f"Downloaded {repo_url} {commit_sha}") return {'id': id, 'url': repo_url, 'sha': commit_sha, 'tag': tag} From ae7444a29e998072c7c7d9a2eea4cad1285606c5 Mon Sep 17 00:00:00 2001 From: "yuliia.t" Date: Wed, 7 Feb 2024 09:49:31 +0200 Subject: [PATCH 3/3] Add run scanners process. Refactoring --- dataset_extension/main.py | 128 ++++++++++++++---- .../reqiurements_dataset_extension.txt | 3 + 2 files changed, 101 insertions(+), 30 deletions(-) create mode 100644 dataset_extension/reqiurements_dataset_extension.txt diff --git a/dataset_extension/main.py b/dataset_extension/main.py index 40869cdb3..3ec63e1b0 100644 --- a/dataset_extension/main.py +++ b/dataset_extension/main.py @@ -1,27 +1,32 @@ import base64 +import hashlib +import os import random +import shutil import string import subprocess -from argparse import ArgumentParser import logging import pathlib - import yaml +from argparse import ArgumentParser +from download_data import get_file_type + logging.basicConfig( - format="%(asctime)s | %(levelname)s | %(filename)s:%(lineno)s | %(message)s", - level="INFO") + format='%(asctime)s | %(levelname)s | %(filename)s:%(lineno)s | %(message)s', + level='INFO') logger = logging.getLogger(__file__) current_path = pathlib.Path(__file__).parent.absolute() project_path = current_path.parent.absolute() - -temp_dir = current_path / 'tmp' +result_path = current_path / 'result' +temp_path = result_path / 'tmp' +data_path = result_path / 'data' +scan_result_path = result_path / 'scan_result' def load_ids(): - snapshot_file = "snapshot.yaml" - with open(project_path / snapshot_file, encoding="utf-8") as f: + with open(project_path / 'snapshot.yaml', encoding='utf-8') as f: snapshot_data = yaml.load(f, Loader=yaml.FullLoader) return [data['id'] for data in snapshot_data] @@ -31,10 +36,21 @@ def load_ids(): def save_to_yaml(repos_dict, yaml_file): pathlib.Path(yaml_file).parent.mkdir(parents=True, exist_ok=True) - with open(yaml_file, "w", encoding="utf-8") as f: + with open(yaml_file, 'w', encoding='utf-8') as f: yaml.dump(repos_dict, f) +def load_from_yaml(yaml_file): + with open(yaml_file, encoding='utf-8') as f: + return yaml.load(f, Loader=yaml.FullLoader) + + +def 
From ae7444a29e998072c7c7d9a2eea4cad1285606c5 Mon Sep 17 00:00:00 2001
From: "yuliia.t"
Date: Wed, 7 Feb 2024 09:49:31 +0200
Subject: [PATCH 3/3] Add process to run scanners. Refactoring

---
 dataset_extension/main.py                    | 128 ++++++++++++++----
 .../requirements_dataset_extension.txt       |   3 +
 2 files changed, 101 insertions(+), 30 deletions(-)
 create mode 100644 dataset_extension/requirements_dataset_extension.txt

diff --git a/dataset_extension/main.py b/dataset_extension/main.py
index 40869cdb3..3ec63e1b0 100644
--- a/dataset_extension/main.py
+++ b/dataset_extension/main.py
@@ -1,27 +1,32 @@
 import base64
+import hashlib
+import os
 import random
+import shutil
 import string
 import subprocess
-from argparse import ArgumentParser
 import logging
 import pathlib
-
 import yaml
+from argparse import ArgumentParser
+
+from download_data import get_file_type
 
 logging.basicConfig(
-    format="%(asctime)s | %(levelname)s | %(filename)s:%(lineno)s | %(message)s",
-    level="INFO")
+    format='%(asctime)s | %(levelname)s | %(filename)s:%(lineno)s | %(message)s',
+    level='INFO')
 logger = logging.getLogger(__file__)
 
 current_path = pathlib.Path(__file__).parent.absolute()
 project_path = current_path.parent.absolute()
-
-temp_dir = current_path / 'tmp'
+result_path = current_path / 'result'
+temp_path = result_path / 'tmp'
+data_path = result_path / 'data'
+scan_result_path = result_path / 'scan_result'
 
 
 def load_ids():
-    snapshot_file = "snapshot.yaml"
-    with open(project_path / snapshot_file, encoding="utf-8") as f:
+    with open(project_path / 'snapshot.yaml', encoding='utf-8') as f:
         snapshot_data = yaml.load(f, Loader=yaml.FullLoader)
     return [data['id'] for data in snapshot_data]
 
@@ -31,10 +36,21 @@ existing_ids = load_ids()
 
 def save_to_yaml(repos_dict, yaml_file):
     pathlib.Path(yaml_file).parent.mkdir(parents=True, exist_ok=True)
-    with open(yaml_file, "w", encoding="utf-8") as f:
+    with open(yaml_file, 'w', encoding='utf-8') as f:
         yaml.dump(repos_dict, f)
 
 
+def load_from_yaml(yaml_file):
+    with open(yaml_file, encoding='utf-8') as f:
+        return yaml.load(f, Loader=yaml.FullLoader)
+
+
+def get_owner_repo_name_from_url(url):
+    owner_name, repo_name = url.split('/')[-2:]
+    repo_name = repo_name.split('.')[0]
+    return owner_name, repo_name
+
+
 def generate_unique_id():
     while True:
         unique_id = ''.join(random.choice(string.printable) for _ in range(6))
@@ -44,44 +60,96 @@ def generate_unique_id():
             return encoded_id
 
 
-def download(repo_url):
-    logger.info(f"Download {repo_url}")
+def download_repo(repo_url, base_path):
+    logger.info(f'Download {repo_url}')
     repo_url = repo_url.strip()
-    ownername, reponame = repo_url.split("/")[-2:]
-    reponame = reponame.split(".")[0]
-    pathlib.Path(f"{temp_dir}/{ownername}").mkdir(parents=True, exist_ok=True)
+    owner_name, repo_name = get_owner_repo_name_from_url(repo_url)
+    pathlib.Path(f'{base_path}/{owner_name}').mkdir(parents=True, exist_ok=True)
+    repo_path = f'{base_path}/{owner_name}/{repo_name}'
     try:
-        subprocess.check_call(['git', 'clone', repo_url, f"{temp_dir}/{ownername}/{reponame}"])
-        commit_sha = (subprocess.check_output(['git', '-C', f'{temp_dir}/{ownername}/{reponame}', 'rev-parse', 'HEAD'])
-                      .decode('ascii').strip())
+        subprocess.check_call(['git', 'clone', repo_url, repo_path])
+        commit_sha = (subprocess.check_output(['git', '-C', repo_path, 'rev-parse', 'HEAD']).decode('ascii').strip())
         try:
-            tag = subprocess.check_output(['git', '-C', f'{temp_dir}/{ownername}/{reponame}', 'describe', '--long',
-                                           '--dirty', '--tags']).decode('ascii').strip()
+            tag = (subprocess.check_output(['git', '-C', repo_path, 'describe', '--long', '--dirty', '--tags'])
+                   .decode('ascii').strip())
         except subprocess.CalledProcessError:
             tag = 'None'
-        id = generate_unique_id()
-        logger.info(f"Downloaded {repo_url} {commit_sha}")
-        return {'id': id, 'url': repo_url, 'sha': commit_sha, 'tag': tag}
+        logger.info(f'Downloaded {repo_url} {commit_sha}')
+        return {'id': generate_unique_id(), 'url': repo_url, 'sha': commit_sha, 'tag': tag}
     except subprocess.CalledProcessError as e:
-        logger.error(f"Couldn't download repo {temp_dir}/{ownername}/{reponame}. {e}")
+        logger.error(f"Couldn't download repo {repo_path}. {e}")
{e}") -def download_repos(input_repo_file): +def download_repos(input_repo_file, dst_path): with open(input_repo_file, 'r', encoding='utf-8') as file: - lines = file.readlines() + urls = file.readlines() downloaded_repos = [] - for line in lines: - repo = download(line) + for url in urls: + repo = download_repo(url, dst_path) if repo: downloaded_repos.append(repo) return downloaded_repos +def hashing_file_names(src_path, dst_path, repos_info): + os.makedirs(dst_path, exist_ok=True) + for i, repo_data in enumerate(repos_info): + new_repo_id = hashlib.sha256(repo_data['id'].encode()).hexdigest()[:8] + logger.info(f'Hash of repo {repo_data["id"]} = {new_repo_id}') + owner_name, repo_name = get_owner_repo_name_from_url(repo_data['url']) + repo_path = f'{src_path}/{owner_name}/{repo_name}' + # Select all files in the repo + repo_files = [os.path.join(root, file) for root, dirs, files in os.walk(repo_path) for file in files] + # Copy files to new dataset location + for j, full_path in enumerate(sorted(list(repo_files))): + short_path = os.path.relpath(full_path, repo_path).replace('\\', '/') + _, file_extension = os.path.splitext(full_path) + file_type = get_file_type(short_path, file_extension) + file_id = hashlib.sha256(short_path.encode()).hexdigest()[:8] + + file_dst_dir = f'{dst_path}/{new_repo_id}/{file_type}' + os.makedirs(file_dst_dir, exist_ok=True) + file_dst_full_path = f'{file_dst_dir}/{file_id}{file_extension}' + shutil.copy(full_path, file_dst_full_path) + logger.info('COPIED FILE: %s -> %s', full_path, file_dst_full_path) + + +def run_credsweeper(data_dir, result_dir): + pathlib.Path(result_dir).mkdir(parents=True, exist_ok=True) + for repo in os.listdir(data_dir): + logger.info(f'Running CredSweeper on {repo}') + repo_path = data_dir / repo + try: + subprocess.check_call(['credsweeper', '--path', repo_path, '--save-json', f'{repo}.json']) + except subprocess.CalledProcessError as e: + logger.error(f"Couldn't run credsweeper for repo {repo}. {e}") + + +def run_detect_secrets(data_dir, result_dir): + pathlib.Path(result_dir).mkdir(parents=True, exist_ok=True) + for repo in os.listdir(data_dir): + logger.info(f'Running DetectSecrets on {repo}') + try: + out = (subprocess.check_output(['detect-secrets', '-C', f'{data_dir}/{repo}/', 'scan', '--all-files']) + .decode()) + with open(f'{result_dir}/{repo}.baseline', 'w') as f: + f.write(out) + except subprocess.CalledProcessError as e: + logger.error(f"Couldn't run detect-secrets for repo {data_dir}/{repo}. 
{e}") + + if __name__ == "__main__": parser = ArgumentParser() parser.add_argument("--input_repo_file", dest="input_repo_file", required=True, help="File with list of GitHub repos to be analyzed") args = parser.parse_args() - logger.info("Start download") - repos = download_repos(args.input_repo_file) - save_to_yaml(repos, current_path / 'result' / 'repos.yaml') + logger.info('Start download') + repos = download_repos(args.input_repo_file, temp_path) + + save_to_yaml(repos, result_path / 'repos.yaml') + # repos = load_from_yaml(result_path / 'repos.yaml') + + hashing_file_names(temp_path, data_path, repos) + + run_detect_secrets(data_path, scan_result_path / 'detect_secrets') + run_credsweeper(data_path, scan_result_path / 'credsweeper') diff --git a/dataset_extension/reqiurements_dataset_extension.txt b/dataset_extension/reqiurements_dataset_extension.txt new file mode 100644 index 000000000..f57dad4f2 --- /dev/null +++ b/dataset_extension/reqiurements_dataset_extension.txt @@ -0,0 +1,3 @@ +detect-secrets +credentialdigger +credsweeper \ No newline at end of file