Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add various small bug fixes #406

Merged
merged 10 commits into from
Sep 2, 2024
46 changes: 19 additions & 27 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,9 +1,23 @@
# General
site
**/merge.log
.kaybee
*/cpu.prof
*.pyc
*git-cache*
*.log
*.log.*
**/cov_html
.coverage
similarities.csv

# Virtual environment
**/.venv/

# VSCode Settings
**/.vscode/

# Regarding KB
.kaybee
kaybee/internal/repository/profile001.pdf
kaybee/internal/repository/repository.test
kaybee/internal/tasks/.kaybee
Expand All @@ -12,10 +26,9 @@ kaybee/internal/tasks/profile001.pdf
kaybee/internal/tasks/tasks.test
kaybee/internal/repository/cpu.prof
kaybee/kaybee.code-workspace
.vscode/launch.json
.vscode/task.code-snippets
kaybee/coverage.out
kaybee/kaybee
kaybee/internal/reconcile/debug.test
kaybee/internal/.kaybee/**/*
kaybee/dist/**
kaybee/kaybeeconf.yaml
Expand All @@ -25,50 +38,29 @@ kaybee/steady.sh
kaybee/kaybeeconf-custom.yaml
kaybee/kaybee-new-statements
kaybee/pkged.go
*.log
*.log.*
kaybeeconf.yaml

# Regarding Prospector
prospector/.env
prospector/workspace.code-workspace
prospector/disabled_tests/skip_test-commits.db
prospector/disabled_tests/skip_test-vulnerabilities.db
prospector/tracer_dataset_final_2
prospector/results
prospector/*.py
prospector/.vscode/launch.json
prospector/.vscode/settings.json
prospector/install_fastext.sh
prospector/nvd.ipynb
prospector/data/nvd.pkl
prospector/data/nvd.csv
prospector/data_sources/reports
.vscode/settings.json
prospector/cov_html/*
prospector/client/cli/cov_html/*
prospector/config.yaml
prospector/client/web/node-app/node_modules
prospector/.coverage.*
prospector/.coverage
**/cov_html
prospector/cov_html
.coverage
prospector/.venv
prospector/prospector.code-workspace
prospector/requests-cache.sqlite
prospector/prospector-report.html
prospector/test_report.html
prospector/test_report.json
prospector/.idea/*
similarities.csv
prospector/*.html
prospector/*.json
requests-cache.sqlite
prospector/output.png
prospector/output.pstats
prospector/kaybee-new-statements
prospector/run.sh
prospector/cve_data
prospector/evaluation
.DS_Store
kaybee/internal/reconcile/debug.test
prospector/client/web/node-app/build
.DS_Store
8 changes: 7 additions & 1 deletion prospector/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,13 @@ def main(argv): # noqa: C901
"enabled_rules": config.enabled_rules,
}

results, advisory_record = prospector(**params)
try:
results, advisory_record = prospector(**params)
except Exception as e:
ConsoleWriter.print(
f"Prospector function couldn't return successfully with {config.vuln_id}: {e}\n"
)
return

if config.preprocess_only:
return
Expand Down
47 changes: 34 additions & 13 deletions prospector/core/prospector.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,9 @@ def prospector( # noqa: C901
)
sys.exit(1)

fixing_commit = advisory_record.get_fixing_commit()
commits_in_advisory_references = (
advisory_record.get_commits_in_advisory_references()
)
# print(advisory_record.references)
# obtain a repository object
repository = Git(repository_url, git_cache)
Expand All @@ -131,10 +133,12 @@ def prospector( # noqa: C901

candidates: Dict[str, RawCommit] = dict()

if len(fixing_commit) > 0:
candidates = get_commits_no_tags(repository, fixing_commit)
if len(commits_in_advisory_references) > 0:
candidates = get_commits_no_tags(
repository, commits_in_advisory_references
)
if len(candidates) > 0 and any(
[c for c in candidates if c in fixing_commit]
[c for c in candidates if c in commits_in_advisory_references]
):
console.print("Fixing commit found in the advisory references\n")
advisory_record.has_fixing_commit = True
Expand Down Expand Up @@ -170,10 +174,8 @@ def prospector( # noqa: C901
f"Number of candidates exceeds {limit_candidates}, aborting."
)

ConsoleWriter.print(
f"Candidates limitlimit exceeded: {len(candidates)}."
)
return None, len(candidates)
ConsoleWriter.print(f"Candidates limit exceeded: {len(candidates)}.")
raise Exception(f"Candidate limit exceeded: {len(candidates)}.")

with ExecutionTimer(
core_statistics.sub_collection("commit preprocessing")
Expand Down Expand Up @@ -228,7 +230,7 @@ def prospector( # noqa: C901
elapsed_time = time.time() - start_time
if elapsed_time > 1800:
logger.error("Processing timeout")
return None, len(candidates)
raise Exception("Processing timeout")

else:
writer.print("\nAll commits found in the backend")
Expand All @@ -244,15 +246,26 @@ def prospector( # noqa: C901
):
save_or_update_processed_commits(backend_address, payload)
else:
logger.warning("Preprocessed commits are not being sent to backend")
logger.warning(
"Preprocessed commits are not being sent to backend (after phase 1)"
)

ranked_candidates = evaluate_commits(
preprocessed_commits, advisory_record, backend_address, enabled_rules
preprocessed_commits,
advisory_record,
use_backend,
backend_address,
enabled_rules,
)

# Save outcome of security relevance to DB (Phase 2 Rule)
payload = [c.to_dict() for c in ranked_candidates[:NUM_COMMITS_PHASE_2]]
save_or_update_processed_commits(backend_address, payload)
if len(payload) > 0 and use_backend != USE_BACKEND_NEVER:
save_or_update_processed_commits(backend_address, payload)
else:
logger.warning(
"Preprocessed commits are not being sent to backend (after phase 2)"
)

# ConsoleWriter.print("Commit ranking and aggregation...")
ranked_candidates = remove_twins(ranked_candidates)
Expand Down Expand Up @@ -296,6 +309,7 @@ def filter(commits: Dict[str, RawCommit]) -> Dict[str, RawCommit]:
def evaluate_commits(
commits: List[Commit],
advisory: AdvisoryRecord,
use_backend: str,
backend_address: str,
enabled_rules: List[str],
) -> List[Commit]:
Expand All @@ -316,8 +330,15 @@ def evaluate_commits(
"""
with ExecutionTimer(core_statistics.sub_collection("candidates analysis")):
with ConsoleWriter("Candidate analysis") as _:
# Pass True to the rules module if the backend is being used, False
# otherwise (needed to decide whether to update the database)
use_backend = use_backend != USE_BACKEND_NEVER
ranked_commits = apply_rules(
commits, advisory, backend_address, enabled_rules=enabled_rules
commits,
advisory,
use_backend,
backend_address,
enabled_rules=enabled_rules,
)

return ranked_commits
Expand Down
55 changes: 43 additions & 12 deletions prospector/datamodel/advisory.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,9 @@

def parse_references_from_third_party(self):
"""Parse the references from third party sites"""
for ref in self.search_references_debian() + self.search_references_redhat():
for ref in (
self.search_references_debian() + self.search_references_redhat()
):
# self.references[ref] += 2
self.references[self.extract_hashes(ref)] += 2

Expand Down Expand Up @@ -167,7 +169,9 @@
# )
self.versions = {
"affected": [
item.get("versionEndIncluding", item.get("versionStartIncluding"))
item.get(
"versionEndIncluding", item.get("versionStartIncluding")
)
for item in data["configurations"][0]["nodes"][0]["cpeMatch"]
], # TODO: can return to tuples
"fixed": [
Expand All @@ -178,22 +182,40 @@
self.versions["affected"] = [
v for v in self.versions["affected"] if v is not None
]
self.versions["fixed"] = [v for v in self.versions["fixed"] if v is not None]
self.versions["fixed"] = [
v for v in self.versions["fixed"] if v is not None
]

def get_commits_in_advisory_references(self) -> List[str]:
"""Processes the advisory's references to extract commit IDs if
present. Only keeps the five most important ones.

def get_fixing_commit(self) -> List[str]:
Returns:
A list of references to a commit.
"""
self.references = dict(
sorted(self.references.items(), key=lambda item: item[1], reverse=True)
sorted(
self.references.items(), key=lambda item: item[1], reverse=True
)
)
limit = 0
while len([r for r in self.references.keys() if r.startswith("commit::")]) > 5:
while (
len([r for r in self.references.keys() if r.startswith("commit::")])
> 5
):
self.references = {
k: v
for k, v in self.references.items()
if ("commit" in k and v > limit) or ("commit" not in k)
}
limit += 1

return [ref.split("::")[1] for ref in self.references if "commit::" in ref]
return [
ref.split("::")[1]
for ref in self.references
if "commit::" in ref
and ref.split("::")[1] not in ["master", "main"]

Check warning on line 217 in prospector/datamodel/advisory.py

View check run for this annotation

In Solidarity / Inclusive Language

Match Found

Please consider an alternative to `master`. Possibilities include: `primary`, `main`, `leader`, `active`, `writer`
Raw output
/\b(?!masterdata|masterdata\w+\b)master/gi
]

def search_references_debian(self) -> List[str]:
url = "https://security-tracker.debian.org/tracker/"
Expand Down Expand Up @@ -221,7 +243,9 @@

return []

def extract_hashes(self, reference: str, filter: bool = False) -> str | None:
def extract_hashes(
self, reference: str, filter: bool = False
) -> str | None:
if bool(re.search(r"a=commit;", reference)):
return "commit::" + re.search(r";h=(\w{6,40})", reference).group(1)

Expand Down Expand Up @@ -258,12 +282,15 @@
for field, key in timestamp_fields.items():
timestamp = metadata.get(key)
setattr(
self, field, int(isoparse(timestamp).timestamp()) if timestamp else None
self,
field,
int(isoparse(timestamp).timestamp()) if timestamp else None,
)
if not self.description:
self.description = details["descriptions"][0]["value"]
self.references = defaultdict(
int, {self.extract_hashes(r["url"]): 2 for r in details["references"]}
int,
{self.extract_hashes(r["url"]): 2 for r in details["references"]},
)


Expand All @@ -290,7 +317,9 @@
headers = {"apiKey": NVD_API_KEY} if NVD_API_KEY else None
params = {"cveId": cve_id}

response = requests.get(NVD_REST_ENDPOINT, headers=headers, params=params)
response = requests.get(
NVD_REST_ENDPOINT, headers=headers, params=params
)

if response.status_code != 200:
return None
Expand All @@ -314,7 +343,9 @@
return False


def get_from_local(vuln_id: str, nvd_rest_endpoint: str = LOCAL_NVD_REST_ENDPOINT):
def get_from_local(
vuln_id: str, nvd_rest_endpoint: str = LOCAL_NVD_REST_ENDPOINT
):
try:
response = requests.get(nvd_rest_endpoint + vuln_id)
if response.status_code != 200:
Expand Down
7 changes: 7 additions & 0 deletions prospector/docker/worker/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -73,4 +73,11 @@ VOLUME [ "/data_sources/reports" ]

RUN chmod +x /usr/local/bin/start_rq_worker.sh
#CMD tail -f /dev/null

# Create directory for gitcache and run git config command to avoid 'dubious ownership' error
RUN mkdir -p /tmp/gitcache && \
cd /tmp/gitcache && \
git config --global --add safe.directory '*'


ENTRYPOINT ["/usr/local/bin/start_rq_worker.sh"]
6 changes: 4 additions & 2 deletions prospector/git/git.py
Original file line number Diff line number Diff line change
Expand Up @@ -311,6 +311,8 @@ def parse_git_output(self, raw: List[str]) -> Dict[str, RawCommit]:
return commits

def find_commits_for_twin_lookups(self, commit_id):
"""Finds all relevant commits around the given commit within a time
window of 10 days. Search is narrowed if too many commits are found."""
# Using both author date and commit date we should cover all cases.
try:
commit_timestamp_a = self.get_timestamp(commit_id, "a")
Expand All @@ -329,8 +331,8 @@ def find_commits_for_twin_lookups(self, commit_id):

return dict()

except Exception:
logger.error("Git command failed, cannot get commits", exc_info=True)
except Exception as e:
logger.error(f"Git command failed, cannot get commits: {e}")

return dict()

Expand Down
12 changes: 2 additions & 10 deletions prospector/llm/llm_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,17 +116,9 @@ def classify_commit(
except Exception as e:
raise RuntimeError(f"Prompt-model chain could not be invoked: {e}")

if is_relevant in [
"True",
"ANSWER:True",
"```ANSWER:True```",
]:
if "True" in is_relevant:
return True
elif is_relevant in [
"False",
"ANSWER:False",
"```ANSWER:False```",
]:
elif "False" in is_relevant:
return False
else:
raise RuntimeError(
Expand Down
Loading
Loading