From 291a041e9f48f996db4d53efc450ef12a6c80da6 Mon Sep 17 00:00:00 2001 From: kvothe Date: Thu, 19 Dec 2024 14:04:24 +0100 Subject: [PATCH] Add the information about match context to the database (#439) Add the information about match context to the database --- ...0_added_context_column_into_match_table.py | 26 +++++++++ src/models/match.py | 3 + src/tasks.py | 56 +++++++++++++++++-- 3 files changed, 81 insertions(+), 4 deletions(-) create mode 100644 src/migrations/versions/f623e1057b00_added_context_column_into_match_table.py diff --git a/src/migrations/versions/f623e1057b00_added_context_column_into_match_table.py b/src/migrations/versions/f623e1057b00_added_context_column_into_match_table.py new file mode 100644 index 00000000..32310fb0 --- /dev/null +++ b/src/migrations/versions/f623e1057b00_added_context_column_into_match_table.py @@ -0,0 +1,26 @@ +"""Added context column into match table +Revision ID: f623e1057b00 +Revises: 702d19cfa063 +Create Date: 2024-11-13 15:14:14.618258 +""" +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision = "f623e1057b00" +down_revision = "702d19cfa063" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + op.add_column("match", sa.Column("context", sa.JSON(), nullable=False)) + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + op.drop_column("match", "context") + # ### end Alembic commands ### diff --git a/src/models/match.py b/src/models/match.py index fde9af51..bfca3b3f 100644 --- a/src/models/match.py +++ b/src/models/match.py @@ -22,3 +22,6 @@ class Match(SQLModel, table=True): ) ) job: Job = Relationship(back_populates="matches") + context: Dict[str, Dict[str, Dict[str, str]]] = Field( + sa_column=Column(JSON, nullable=False) + ) diff --git a/src/tasks.py b/src/tasks.py index 4a0a88bf..90d94c0b 100644 --- a/src/tasks.py +++ b/src/tasks.py @@ -1,4 +1,5 @@ -from typing import List, Optional, cast +import base64 +from typing import List, Optional, cast, Dict import logging from rq import get_current_job, Queue # type: ignore from redis import Redis @@ -68,7 +69,12 @@ def get_datasets(self) -> List[str]: return list(result["result"]["datasets"].keys()) def update_metadata( - self, job: JobId, orig_name: str, path: str, matches: List[str] + self, + job: JobId, + orig_name: str, + path: str, + matches: List[str], + context: Dict[str, Dict[str, Dict[str, str]]], ) -> None: """Saves matches to the database, and runs appropriate metadata plugins. @@ -93,7 +99,9 @@ def update_metadata( del metadata["path"] # Update the database. 
- match = Match(file=orig_name, meta=metadata, matches=matches) + match = Match( + file=orig_name, meta=metadata, matches=matches, context=context + ) self.db.add_match(job, match) def execute_yara(self, job: Job, files: List[str]) -> None: @@ -108,10 +116,18 @@ def execute_yara(self, job: Job, files: List[str]) -> None: path = self.plugins.filter(orig_name) if not path: continue + matches = rule.match(path) if matches: + with open(path, "rb") as file: + data = file.read() + self.update_metadata( - job.id, orig_name, path, [r.rule for r in matches] + job.id, + orig_name, + path, + [r.rule for r in matches], + get_match_contexts(data, matches), ) num_matches += 1 except yara.Error: @@ -290,3 +306,35 @@ def run_yara_batch(job_id: JobId, iterator: str, batch_size: int) -> None: agent.execute_yara(job, pop_result.files) agent.add_tasks_in_progress(job, -1) + + +def get_match_contexts( + data: bytes, matches: List[yara.Match] +) -> Dict[str, Dict[str, Dict[str, str]]]: + context = {} + for yara_match in matches: + match_context = {} + for string_match in yara_match.strings: + first = string_match.instances[0] + + (before, matching, after) = read_bytes_with_context( + data, first.offset, first.matched_length + ) + match_context[string_match.identifier] = { + "before": base64.b64encode(before).decode("utf-8"), + "matching": base64.b64encode(matching).decode("utf-8"), + "after": base64.b64encode(after).decode("utf-8"), + } + + context[yara_match.rule] = match_context + return context + + +def read_bytes_with_context( + data: bytes, offset: int, length: int, context: int = 32 +) -> tuple[bytes, bytes, bytes]: + """Return `length` bytes from `offset`, along with `context` bytes before and after the match.""" + before = data[max(0, offset - context) : offset] + matching = data[offset : offset + length] + after = data[offset + length : offset + length + context] + return before, matching, after