From 22f04ed8de376640909bf9b9a11762ff4aea82c4 Mon Sep 17 00:00:00 2001
From: "yuliia.t"
Date: Mon, 5 Feb 2024 20:53:27 +0200
Subject: [PATCH 1/3] Create script for adding and downloading new repos for labeling.

---
 dataset_extension/__init__.py |  0
 dataset_extension/main.py     | 87 +++++++++++++++++++++++++++++++++++
 2 files changed, 87 insertions(+)
 create mode 100644 dataset_extension/__init__.py
 create mode 100644 dataset_extension/main.py

diff --git a/dataset_extension/__init__.py b/dataset_extension/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/dataset_extension/main.py b/dataset_extension/main.py
new file mode 100644
index 000000000..4e544002d
--- /dev/null
+++ b/dataset_extension/main.py
@@ -0,0 +1,87 @@
+import base64
+import random
+import string
+import subprocess
+from argparse import ArgumentParser
+import logging
+import pathlib
+
+import yaml
+
+logging.basicConfig(
+    format="%(asctime)s | %(levelname)s | %(filename)s:%(lineno)s | %(message)s",
+    level="INFO")
+logger = logging.getLogger(__file__)
+
+current_path = pathlib.Path(__file__).parent.absolute()
+project_path = current_path.parent.absolute()
+
+temp_dir = current_path / 'tmp'
+
+
+def load_ids():
+    snapshot_file = "snapshot.yaml"
+    with open(project_path / snapshot_file, encoding="utf-8") as f:
+        snapshot_data = yaml.load(f, Loader=yaml.FullLoader)
+    return [data['id'] for data in snapshot_data]
+
+
+existing_ids = load_ids()
+
+
+def save_to_yaml(repos_dict, yaml_file):
+    pathlib.Path(yaml_file).parent.mkdir(parents=True, exist_ok=True)
+    with open(yaml_file, "w", encoding="utf-8") as f:
+        yaml.dump(repos_dict, f)
+
+
+def generate_unique_id():
+    while True:
+        unique_id = ''.join(random.choice(string.printable) for _ in range(6))
+        encoded_id = base64.b64encode(unique_id.encode()).decode('utf-8')
+        if encoded_id not in existing_ids:
+            existing_ids.append(encoded_id)
+            return encoded_id
+
+
+def download(repo_url):
+    logger.info(f"Download {repo_url}")
+    repo_url = repo_url.strip()
+    ownername, reponame = repo_url.split("/")[-2:]
+    reponame = reponame.split(".")[0]
+    pathlib.Path(f"{temp_dir}/{ownername}").mkdir(parents=True, exist_ok=True)
+    try:
+        subprocess.check_call(['git', 'clone', repo_url, f"{temp_dir}/{ownername}/{reponame}"])
+        commit_sha = (subprocess.check_output(['git', '-C', f'{temp_dir}/{ownername}/{reponame}', 'rev-parse', 'HEAD'])
+                      .decode('ascii').strip())
+        try:
+            tag = subprocess.check_output(['git', '-C', f'{temp_dir}/{ownername}/{reponame}', 'describe', '--long',
+                                           '--dirty', '--tags']).decode('ascii').strip()
+        except subprocess.CalledProcessError:
+            tag = None
+        id = generate_unique_id()
+        logger.info(f"Downloaded {repo_url} {commit_sha}")
+        return {'id': id, 'url': repo_url, 'sha': commit_sha, 'tag': tag}
+    except subprocess.CalledProcessError as e:
+        logger.error(f"Couldn't download repo {temp_dir}/{ownername}/{reponame}. {e}")
+
+
+def download_repos(input_repo_file):
+    with open(input_repo_file, 'r', encoding='utf-8') as file:
+        lines = file.readlines()
+    downloaded_repos = []
+    for line in lines:
+        repo = download(line)
+        if repo:
+            downloaded_repos.append(repo)
+    return downloaded_repos
+
+
+if __name__ == "__main__":
+    parser = ArgumentParser()
+    parser.add_argument("--input_repo_file", dest="input_repo_file", required=True,
+                        help="File with list of GitHub repos to be analyzed")
+    args = parser.parse_args()
+    logger.info("Start download")
+    repos = download_repos(args.input_repo_file)
+    save_to_yaml(repos, current_path / 'result' / 'repos.yaml')
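
Usage sketch for the script added above, assuming a plain-text input file with
one git clone URL per line (the file name repos.txt is illustrative):

    python dataset_extension/main.py --input_repo_file repos.txt

Each repo that clones successfully is recorded in result/repos.yaml as a dict
of the form {'id': <base64 id>, 'url': <clone URL>, 'sha': <HEAD commit>,
'tag': <git describe output, or None if the repo has no tags>}.
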
{e}") + + +def download_repos(input_repo_file): + with open(input_repo_file, 'r', encoding='utf-8') as file: + lines = file.readlines() + downloaded_repos = [] + for line in lines: + repo = download(line) + if repo: + downloaded_repos.append(repo) + return downloaded_repos + + +if __name__ == "__main__": + parser = ArgumentParser() + parser.add_argument("--input_repo_file", dest="input_repo_file", required=True, + help="File with list of GitHub repos to be analyzed") + args = parser.parse_args() + logger.info("Start download") + repos = download_repos(args.input_repo_file) + save_to_yaml(repos, current_path / 'result' / 'repos.yaml') From 6ecf52847d6ba45f3e86cf703ffb33ebf493d179 Mon Sep 17 00:00:00 2001 From: "yuliia.t" Date: Mon, 5 Feb 2024 21:38:45 +0200 Subject: [PATCH 2/3] Change tag value --- dataset_extension/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dataset_extension/main.py b/dataset_extension/main.py index 4e544002d..40869cdb3 100644 --- a/dataset_extension/main.py +++ b/dataset_extension/main.py @@ -58,7 +58,7 @@ def download(repo_url): tag = subprocess.check_output(['git', '-C', f'{temp_dir}/{ownername}/{reponame}', 'describe', '--long', '--dirty', '--tags']).decode('ascii').strip() except subprocess.CalledProcessError: - tag = None + tag = 'None' id = generate_unique_id() logger.info(f"Downloaded {repo_url} {commit_sha}") return {'id': id, 'url': repo_url, 'sha': commit_sha, 'tag': tag} From ae7444a29e998072c7c7d9a2eea4cad1285606c5 Mon Sep 17 00:00:00 2001 From: "yuliia.t" Date: Wed, 7 Feb 2024 09:49:31 +0200 Subject: [PATCH 3/3] Add run scanners process. Refactoring --- dataset_extension/main.py | 128 ++++++++++++++---- .../reqiurements_dataset_extension.txt | 3 + 2 files changed, 101 insertions(+), 30 deletions(-) create mode 100644 dataset_extension/reqiurements_dataset_extension.txt diff --git a/dataset_extension/main.py b/dataset_extension/main.py index 40869cdb3..3ec63e1b0 100644 --- a/dataset_extension/main.py +++ b/dataset_extension/main.py @@ -1,27 +1,32 @@ import base64 +import hashlib +import os import random +import shutil import string import subprocess -from argparse import ArgumentParser import logging import pathlib - import yaml +from argparse import ArgumentParser +from download_data import get_file_type + logging.basicConfig( - format="%(asctime)s | %(levelname)s | %(filename)s:%(lineno)s | %(message)s", - level="INFO") + format='%(asctime)s | %(levelname)s | %(filename)s:%(lineno)s | %(message)s', + level='INFO') logger = logging.getLogger(__file__) current_path = pathlib.Path(__file__).parent.absolute() project_path = current_path.parent.absolute() - -temp_dir = current_path / 'tmp' +result_path = current_path / 'result' +temp_path = result_path / 'tmp' +data_path = result_path / 'data' +scan_result_path = result_path / 'scan_result' def load_ids(): - snapshot_file = "snapshot.yaml" - with open(project_path / snapshot_file, encoding="utf-8") as f: + with open(project_path / 'snapshot.yaml', encoding='utf-8') as f: snapshot_data = yaml.load(f, Loader=yaml.FullLoader) return [data['id'] for data in snapshot_data] @@ -31,10 +36,21 @@ def load_ids(): def save_to_yaml(repos_dict, yaml_file): pathlib.Path(yaml_file).parent.mkdir(parents=True, exist_ok=True) - with open(yaml_file, "w", encoding="utf-8") as f: + with open(yaml_file, 'w', encoding='utf-8') as f: yaml.dump(repos_dict, f) +def load_from_yaml(yaml_file): + with open(yaml_file, encoding='utf-8') as f: + return yaml.load(f, Loader=yaml.FullLoader) + + +def 
From ae7444a29e998072c7c7d9a2eea4cad1285606c5 Mon Sep 17 00:00:00 2001
From: "yuliia.t"
Date: Wed, 7 Feb 2024 09:49:31 +0200
Subject: [PATCH 3/3] Add process to run scanners. Refactoring

---
 dataset_extension/main.py                    | 128 ++++++++++++++----
 .../requirements_dataset_extension.txt       |   3 +
 2 files changed, 101 insertions(+), 30 deletions(-)
 create mode 100644 dataset_extension/requirements_dataset_extension.txt

diff --git a/dataset_extension/main.py b/dataset_extension/main.py
index 40869cdb3..3ec63e1b0 100644
--- a/dataset_extension/main.py
+++ b/dataset_extension/main.py
@@ -1,27 +1,32 @@
 import base64
+import hashlib
+import os
 import random
+import shutil
 import string
 import subprocess
-from argparse import ArgumentParser
 import logging
 import pathlib
-
 import yaml
+from argparse import ArgumentParser
+
+from download_data import get_file_type
 
 logging.basicConfig(
-    format="%(asctime)s | %(levelname)s | %(filename)s:%(lineno)s | %(message)s",
-    level="INFO")
+    format='%(asctime)s | %(levelname)s | %(filename)s:%(lineno)s | %(message)s',
+    level='INFO')
 logger = logging.getLogger(__file__)
 
 current_path = pathlib.Path(__file__).parent.absolute()
 project_path = current_path.parent.absolute()
-
-temp_dir = current_path / 'tmp'
+result_path = current_path / 'result'
+temp_path = result_path / 'tmp'
+data_path = result_path / 'data'
+scan_result_path = result_path / 'scan_result'
 
 
 def load_ids():
-    snapshot_file = "snapshot.yaml"
-    with open(project_path / snapshot_file, encoding="utf-8") as f:
+    with open(project_path / 'snapshot.yaml', encoding='utf-8') as f:
         snapshot_data = yaml.load(f, Loader=yaml.FullLoader)
     return [data['id'] for data in snapshot_data]
 
@@ -31,10 +36,21 @@ existing_ids = load_ids()
 
 def save_to_yaml(repos_dict, yaml_file):
     pathlib.Path(yaml_file).parent.mkdir(parents=True, exist_ok=True)
-    with open(yaml_file, "w", encoding="utf-8") as f:
+    with open(yaml_file, 'w', encoding='utf-8') as f:
         yaml.dump(repos_dict, f)
 
 
+def load_from_yaml(yaml_file):
+    with open(yaml_file, encoding='utf-8') as f:
+        return yaml.load(f, Loader=yaml.FullLoader)
+
+
+def get_owner_repo_name_from_url(url):
+    owner_name, repo_name = url.split('/')[-2:]
+    repo_name = repo_name.split('.')[0]
+    return owner_name, repo_name
+
+
 def generate_unique_id():
     while True:
         unique_id = ''.join(random.choice(string.printable) for _ in range(6))
@@ -44,44 +60,96 @@ def generate_unique_id():
             return encoded_id
 
 
-def download(repo_url):
-    logger.info(f"Download {repo_url}")
+def download_repo(repo_url, base_path):
+    logger.info(f'Download {repo_url}')
     repo_url = repo_url.strip()
-    ownername, reponame = repo_url.split("/")[-2:]
-    reponame = reponame.split(".")[0]
-    pathlib.Path(f"{temp_dir}/{ownername}").mkdir(parents=True, exist_ok=True)
+    owner_name, repo_name = get_owner_repo_name_from_url(repo_url)
+    pathlib.Path(f'{base_path}/{owner_name}').mkdir(parents=True, exist_ok=True)
+    repo_path = f'{base_path}/{owner_name}/{repo_name}'
     try:
-        subprocess.check_call(['git', 'clone', repo_url, f"{temp_dir}/{ownername}/{reponame}"])
-        commit_sha = (subprocess.check_output(['git', '-C', f'{temp_dir}/{ownername}/{reponame}', 'rev-parse', 'HEAD'])
-                      .decode('ascii').strip())
+        subprocess.check_call(['git', 'clone', repo_url, repo_path])
+        commit_sha = (subprocess.check_output(['git', '-C', repo_path, 'rev-parse', 'HEAD']).decode('ascii').strip())
         try:
-            tag = subprocess.check_output(['git', '-C', f'{temp_dir}/{ownername}/{reponame}', 'describe', '--long',
-                                           '--dirty', '--tags']).decode('ascii').strip()
+            tag = (subprocess.check_output(['git', '-C', repo_path, 'describe', '--long', '--dirty', '--tags'])
+                   .decode('ascii').strip())
         except subprocess.CalledProcessError:
             tag = 'None'
-        id = generate_unique_id()
-        logger.info(f"Downloaded {repo_url} {commit_sha}")
-        return {'id': id, 'url': repo_url, 'sha': commit_sha, 'tag': tag}
+        logger.info(f'Downloaded {repo_url} {commit_sha}')
+        return {'id': generate_unique_id(), 'url': repo_url, 'sha': commit_sha, 'tag': tag}
     except subprocess.CalledProcessError as e:
-        logger.error(f"Couldn't download repo {temp_dir}/{ownername}/{reponame}. {e}")
+        logger.error(f"Couldn't download repo {repo_path}. {e}")
{e}") -def download_repos(input_repo_file): +def download_repos(input_repo_file, dst_path): with open(input_repo_file, 'r', encoding='utf-8') as file: - lines = file.readlines() + urls = file.readlines() downloaded_repos = [] - for line in lines: - repo = download(line) + for url in urls: + repo = download_repo(url, dst_path) if repo: downloaded_repos.append(repo) return downloaded_repos +def hashing_file_names(src_path, dst_path, repos_info): + os.makedirs(dst_path, exist_ok=True) + for i, repo_data in enumerate(repos_info): + new_repo_id = hashlib.sha256(repo_data['id'].encode()).hexdigest()[:8] + logger.info(f'Hash of repo {repo_data["id"]} = {new_repo_id}') + owner_name, repo_name = get_owner_repo_name_from_url(repo_data['url']) + repo_path = f'{src_path}/{owner_name}/{repo_name}' + # Select all files in the repo + repo_files = [os.path.join(root, file) for root, dirs, files in os.walk(repo_path) for file in files] + # Copy files to new dataset location + for j, full_path in enumerate(sorted(list(repo_files))): + short_path = os.path.relpath(full_path, repo_path).replace('\\', '/') + _, file_extension = os.path.splitext(full_path) + file_type = get_file_type(short_path, file_extension) + file_id = hashlib.sha256(short_path.encode()).hexdigest()[:8] + + file_dst_dir = f'{dst_path}/{new_repo_id}/{file_type}' + os.makedirs(file_dst_dir, exist_ok=True) + file_dst_full_path = f'{file_dst_dir}/{file_id}{file_extension}' + shutil.copy(full_path, file_dst_full_path) + logger.info('COPIED FILE: %s -> %s', full_path, file_dst_full_path) + + +def run_credsweeper(data_dir, result_dir): + pathlib.Path(result_dir).mkdir(parents=True, exist_ok=True) + for repo in os.listdir(data_dir): + logger.info(f'Running CredSweeper on {repo}') + repo_path = data_dir / repo + try: + subprocess.check_call(['credsweeper', '--path', repo_path, '--save-json', f'{repo}.json']) + except subprocess.CalledProcessError as e: + logger.error(f"Couldn't run credsweeper for repo {repo}. {e}") + + +def run_detect_secrets(data_dir, result_dir): + pathlib.Path(result_dir).mkdir(parents=True, exist_ok=True) + for repo in os.listdir(data_dir): + logger.info(f'Running DetectSecrets on {repo}') + try: + out = (subprocess.check_output(['detect-secrets', '-C', f'{data_dir}/{repo}/', 'scan', '--all-files']) + .decode()) + with open(f'{result_dir}/{repo}.baseline', 'w') as f: + f.write(out) + except subprocess.CalledProcessError as e: + logger.error(f"Couldn't run detect-secrets for repo {data_dir}/{repo}. 
{e}") + + if __name__ == "__main__": parser = ArgumentParser() parser.add_argument("--input_repo_file", dest="input_repo_file", required=True, help="File with list of GitHub repos to be analyzed") args = parser.parse_args() - logger.info("Start download") - repos = download_repos(args.input_repo_file) - save_to_yaml(repos, current_path / 'result' / 'repos.yaml') + logger.info('Start download') + repos = download_repos(args.input_repo_file, temp_path) + + save_to_yaml(repos, result_path / 'repos.yaml') + # repos = load_from_yaml(result_path / 'repos.yaml') + + hashing_file_names(temp_path, data_path, repos) + + run_detect_secrets(data_path, scan_result_path / 'detect_secrets') + run_credsweeper(data_path, scan_result_path / 'credsweeper') diff --git a/dataset_extension/reqiurements_dataset_extension.txt b/dataset_extension/reqiurements_dataset_extension.txt new file mode 100644 index 000000000..f57dad4f2 --- /dev/null +++ b/dataset_extension/reqiurements_dataset_extension.txt @@ -0,0 +1,3 @@ +detect-secrets +credentialdigger +credsweeper \ No newline at end of file