Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve XLSX adapter #137

Merged
merged 2 commits into from
Sep 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
115 changes: 97 additions & 18 deletions flow/record/adapter/xlsx.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,14 @@
import openpyxl
from base64 import b64decode, b64encode
from datetime import datetime, timezone
from typing import Any, Iterator

from openpyxl import Workbook, load_workbook
from openpyxl.cell.cell import ILLEGAL_CHARACTERS_RE

from flow import record
from flow.record import fieldtypes
from flow.record.adapter import AbstractReader, AbstractWriter
from flow.record.fieldtypes.net import ipaddress
from flow.record.selector import make_selector
from flow.record.utils import is_stdout

Expand All @@ -14,23 +21,72 @@
"""


def sanitize_fieldvalues(values: Iterator[Any]) -> Iterator[Any]:
    """Yield every field value in a form that openpyxl will accept.

    Conversions applied, in order of precedence:

    * timezone-aware ``datetime``: converted to UTC, then made naive;
    * values whose exact type is ``ipaddress``, ``list``,
      ``fieldtypes.posix_path`` or ``fieldtypes.windows_path``: ``str(value)``;
    * ``bytes``: rendered as ``b"<text>"`` when the payload decodes and
      contains no characters matched by ``ILLEGAL_CHARACTERS_RE``, otherwise
      as ``base64:<payload>``.

    Any other value is yielded untouched.
    """
    for item in values:
        if isinstance(item, datetime) and item.tzinfo is not None:
            # openpyxl cannot store timezone-aware datetimes: normalise to UTC
            # first, then drop the tzinfo.
            item = item.astimezone(timezone.utc).replace(tzinfo=None)

        elif type(item) in (ipaddress, list, fieldtypes.posix_path, fieldtypes.windows_path):
            item = str(item)

        elif isinstance(item, bytes):
            try:
                printable = f'b"{item.decode()}"'
            except UnicodeDecodeError:
                printable = None

            if printable is None or ILLEGAL_CHARACTERS_RE.search(printable):
                # Not representable as plain text in a cell: fall back to base64.
                item = "base64:" + b64encode(item).decode()
            else:
                item = printable

        yield item


class XlsxWriter(AbstractWriter):
fp = None
wb = None

def __init__(self, path, **kwargs):
    """Open ``path`` for binary writing and prepare an empty workbook.

    Args:
        path: Destination handed to ``record.open_path_or_stream`` in
            ``"wb"`` mode.
        **kwargs: Accepted but not used by this constructor.
    """
    self.fp = record.open_path_or_stream(path, "wb")
    # Build the workbook exactly once (the superseded ``openpyxl.Workbook()``
    # call created a second workbook that was immediately discarded).
    self.wb = Workbook()
    self.ws = self.wb.active
    # Retained so code that still reads ``desc`` keeps working.
    self.desc = None

    # Remove the active work sheet, every Record Descriptor will have its own sheet.
    self.wb.remove(self.ws)
    # Descriptors that already own a worksheet.
    self.descs = []
    # Descriptor of the most recently written record.
    self._last_dec = None

Check warning on line 64 in flow/record/adapter/xlsx.py

View check run for this annotation

Codecov / codecov/patch

flow/record/adapter/xlsx.py#L62-L64

Added lines #L62 - L64 were not covered by tests

def write(self, r):
    """Append record ``r`` as one row on the worksheet of its descriptor.

    The first time a descriptor is seen, a worksheet named after it is
    created (``/`` replaced by ``-``) and two header rows are written: the
    field type names, then the field names. Field values are passed through
    ``sanitize_fieldvalues`` before being handed to openpyxl.

    Args:
        r: The record to write; its ``_desc`` selects the worksheet.

    Raises:
        ValueError: If openpyxl rejects the sanitized row. The original
            exception is chained as ``__cause__``.
    """
    desc = r._desc
    # Computed once so creating and looking up the sheet always use the same title.
    sheet_title = desc.name.strip().replace("/", "-")

    if desc not in self.descs:
        self.descs.append(desc)
        sheet = self.wb.create_sheet(sheet_title)
        all_fields = desc.get_all_fields()
        sheet.append([field.typename for field in all_fields.values()])
        sheet.append(list(all_fields))

    if desc != self._last_dec:
        # Only switch worksheets when the descriptor actually changes.
        self._last_dec = desc
        self.ws = self.wb[sheet_title]

    # Append the sanitized row only. The raw values must not be written as
    # well: that would duplicate the row and may hand openpyxl unsupported types.
    values = list(sanitize_fieldvalues(r._asdict().values()))
    try:
        self.ws.append(values)
    except ValueError as e:
        raise ValueError(f"Unable to write values to workbook: {e}") from e

Check warning on line 89 in flow/record/adapter/xlsx.py

View check run for this annotation

Codecov / codecov/patch

flow/record/adapter/xlsx.py#L86-L89

Added lines #L86 - L89 were not covered by tests

def flush(self):
if self.wb:
Expand All @@ -53,7 +109,7 @@
self.selector = make_selector(selector)
self.fp = record.open_path_or_stream(path, "rb")
self.desc = None
self.wb = openpyxl.load_workbook(self.fp)
self.wb = load_workbook(self.fp)

Check warning on line 112 in flow/record/adapter/xlsx.py

View check run for this annotation

Codecov / codecov/patch

flow/record/adapter/xlsx.py#L112

Added line #L112 was not covered by tests
self.ws = self.wb.active

def close(self):
Expand All @@ -62,12 +118,35 @@
self.fp = None

def __iter__(self):
    """Yield the records stored on every worksheet of the workbook.

    For each worksheet, the first row is read as the field type names and
    the second row as the field names. Names are lower-cased with spaces
    replaced by ``_``, and names starting with ``_`` are left out of the
    descriptor. The descriptor name is the sheet title with ``-`` replaced
    by ``/``. Every following row becomes one record. Cells in ``bytes``
    columns are decoded from either the ``b"..."`` or the ``base64:...``
    text form.

    Yields:
        Records that match ``self.selector``, or every record when no
        selector is set.
    """
    for worksheet in self.wb.worksheets:
        desc = None
        desc_name = worksheet.title.replace("-", "/")
        field_names = None
        field_types = None

        for row in worksheet:
            if field_types is None:
                field_types = [col.value for col in row if col.value]
                continue

            if field_names is None:
                field_names = [
                    col.value.replace(" ", "_").lower()
                    for col in row
                    if col.value and not col.value.startswith("_")
                ]
                desc = record.RecordDescriptor(desc_name, list(zip(field_types, field_names)))
                continue

            record_values = []
            for idx, col in enumerate(row):
                value = col.value
                # Only text cells can carry an encoded bytes value. An empty
                # cell (None) or any other type is passed through unchanged
                # instead of failing on the index below.
                if field_types[idx] == "bytes" and isinstance(value, str):
                    # Slice rather than index so a one-character string does
                    # not raise IndexError.
                    if value[1:2] == '"':  # If so, we know this is b""
                        # Cut off the b" at the start and the trailing "
                        value = value[2:-1].encode()
                    else:
                        # If not, we know it is base64 encoded (so we cut off the starting 'base64:')
                        value = b64decode(value[7:])
                record_values.append(value)

            obj = desc(*record_values)
            if not self.selector or self.selector.match(obj):
                yield obj

Check warning on line 152 in flow/record/adapter/xlsx.py

View check run for this annotation

Codecov / codecov/patch

flow/record/adapter/xlsx.py#L148-L152

Added lines #L148 - L152 were not covered by tests
55 changes: 55 additions & 0 deletions tests/test_xlsx_adapter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
import re
import sys
from datetime import datetime, timedelta, timezone
from typing import Iterator
from unittest.mock import MagicMock

import pytest

from flow.record import fieldtypes


@pytest.fixture
def mock_openpyxl_package(monkeypatch: pytest.MonkeyPatch) -> Iterator[MagicMock]:
    """Install stand-ins for ``openpyxl`` and ``openpyxl.cell.cell`` in ``sys.modules``.

    The fake ``openpyxl.cell.cell`` module carries a compiled
    ``ILLEGAL_CHARACTERS_RE`` pattern. The fake ``openpyxl`` package is the
    fixture value; both entries are set inside a ``monkeypatch.context()``.
    """
    fake_cell_module = MagicMock(
        ILLEGAL_CHARACTERS_RE=re.compile(r"[\000-\010]|[\013-\014]|[\016-\037]"),
    )
    fake_package = MagicMock()

    with monkeypatch.context() as patcher:
        patcher.setitem(sys.modules, "openpyxl", fake_package)
        patcher.setitem(sys.modules, "openpyxl.cell.cell", fake_cell_module)

        yield fake_package


def test_sanitize_field_values(mock_openpyxl_package):
    """Check that ``sanitize_fieldvalues`` converts each supported value type as expected."""
    from flow.record.adapter.xlsx import sanitize_fieldvalues

    raw_values = [
        7,
        datetime(1920, 11, 11, 13, 37, 0, tzinfo=timezone(timedelta(hours=2))),
        "James",
        b"Bond",
        b"\x00\x07",
        fieldtypes.net.ipaddress("13.37.13.37"),
        ["Shaken", "Not", "Stirred"],
        fieldtypes.posix_path("/home/user"),
        fieldtypes.posix_command("/bin/bash -c 'echo hello world'"),
        fieldtypes.windows_path("C:\\Users\\user\\Desktop"),
        fieldtypes.windows_command("C:\\Some.exe /?"),
    ]

    expected = [
        7,
        datetime(1920, 11, 11, 11, 37, 0),  # UTC normalization
        "James",
        'b"Bond"',  # When possible, encode bytes in a printable way
        "base64:AAc=",  # If not, base64 encode
        "13.37.13.37",  # Stringify an ip address
        "['Shaken', 'Not', 'Stirred']",  # Stringify a list
        "/home/user",  # Stringify a posix path
        "/bin/bash -c 'echo hello world'",  # Stringify a posix command
        "C:\\Users\\user\\Desktop",  # Stringify a windows path
        "C:\\Some.exe /?",  # Stringify a windows command
    ]

    sanitized = list(sanitize_fieldvalues(raw_values))
    assert sanitized == expected
Loading