From 291a041e9f48f996db4d53efc450ef12a6c80da6 Mon Sep 17 00:00:00 2001 From: kvothe Date: Thu, 19 Dec 2024 14:04:24 +0100 Subject: [PATCH] Add the information about match context to the database (#439) Add the information about match context to the database --- ...0_added_context_column_into_match_table.py | 26 +++++++++ src/models/match.py | 3 + src/tasks.py | 56 +++++++++++++++++-- 3 files changed, 81 insertions(+), 4 deletions(-) create mode 100644 src/migrations/versions/f623e1057b00_added_context_column_into_match_table.py diff --git a/src/migrations/versions/f623e1057b00_added_context_column_into_match_table.py b/src/migrations/versions/f623e1057b00_added_context_column_into_match_table.py new file mode 100644 index 00000000..32310fb0 --- /dev/null +++ b/src/migrations/versions/f623e1057b00_added_context_column_into_match_table.py @@ -0,0 +1,26 @@ +"""Added context column into match table +Revision ID: f623e1057b00 +Revises: 702d19cfa063 +Create Date: 2024-11-13 15:14:14.618258 +""" +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision = "f623e1057b00" +down_revision = "702d19cfa063" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + op.add_column("match", sa.Column("context", sa.JSON(), nullable=False)) + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + op.drop_column("match", "context") + # ### end Alembic commands ### diff --git a/src/models/match.py b/src/models/match.py index fde9af51..bfca3b3f 100644 --- a/src/models/match.py +++ b/src/models/match.py @@ -22,3 +22,6 @@ class Match(SQLModel, table=True): ) ) job: Job = Relationship(back_populates="matches") + context: Dict[str, Dict[str, Dict[str, str]]] = Field( + sa_column=Column(JSON, nullable=False) + ) diff --git a/src/tasks.py b/src/tasks.py index 4a0a88bf..90d94c0b 100644 --- a/src/tasks.py +++ b/src/tasks.py @@ -1,4 +1,5 @@ -from typing import List, Optional, cast +import base64 +from typing import List, Optional, cast, Dict import logging from rq import get_current_job, Queue # type: ignore from redis import Redis @@ -68,7 +69,12 @@ def get_datasets(self) -> List[str]: return list(result["result"]["datasets"].keys()) def update_metadata( - self, job: JobId, orig_name: str, path: str, matches: List[str] + self, + job: JobId, + orig_name: str, + path: str, + matches: List[str], + context: Dict[str, Dict[str, Dict[str, str]]], ) -> None: """Saves matches to the database, and runs appropriate metadata plugins. @@ -93,7 +99,9 @@ def update_metadata( del metadata["path"] # Update the database. 
- match = Match(file=orig_name, meta=metadata, matches=matches) + match = Match( + file=orig_name, meta=metadata, matches=matches, context=context + ) self.db.add_match(job, match) def execute_yara(self, job: Job, files: List[str]) -> None: @@ -108,10 +116,18 @@ def execute_yara(self, job: Job, files: List[str]) -> None: path = self.plugins.filter(orig_name) if not path: continue + matches = rule.match(path) if matches: + with open(path, "rb") as file: + data = file.read() + self.update_metadata( - job.id, orig_name, path, [r.rule for r in matches] + job.id, + orig_name, + path, + [r.rule for r in matches], + get_match_contexts(data, matches), ) num_matches += 1 except yara.Error: @@ -290,3 +306,35 @@ def run_yara_batch(job_id: JobId, iterator: str, batch_size: int) -> None: agent.execute_yara(job, pop_result.files) agent.add_tasks_in_progress(job, -1) + + +def get_match_contexts( + data: bytes, matches: List[yara.Match] +) -> Dict[str, Dict[str, Dict[str, str]]]: + context = {} + for yara_match in matches: + match_context = {} + for string_match in yara_match.strings: + first = string_match.instances[0] + + (before, matching, after) = read_bytes_with_context( + data, first.offset, first.matched_length + ) + match_context[string_match.identifier] = { + "before": base64.b64encode(before).decode("utf-8"), + "matching": base64.b64encode(matching).decode("utf-8"), + "after": base64.b64encode(after).decode("utf-8"), + } + + context[yara_match.rule] = match_context + return context + + +def read_bytes_with_context( + data: bytes, offset: int, length: int, context: int = 32 +) -> tuple[bytes, bytes, bytes]: + """Return `length` bytes from `offset`, along with `context` bytes before and after the match.""" + before = data[max(0, offset - context) : offset] + matching = data[offset : offset + length] + after = data[offset + length : offset + length + context] + return before, matching, after