Skip to content

Commit

Permalink
Improve support searching indexes
Browse files Browse the repository at this point in the history
  • Loading branch information
Schamper committed May 15, 2024
1 parent b28a604 commit f1dfe62
Show file tree
Hide file tree
Showing 18 changed files with 566 additions and 195 deletions.
156 changes: 156 additions & 0 deletions dissect/esedb/btree.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
from __future__ import annotations

from typing import TYPE_CHECKING

from dissect.esedb.exceptions import KeyNotFoundError, NoNeighbourPageError
from dissect.esedb.page import Node, Page

if TYPE_CHECKING:
from dissect.esedb.esedb import EseDB


class BTree:
    """A simple implementation for searching the ESE B+Trees.

    This is a stateful interactive class that moves an internal cursor to a position within the BTree.
    The cursor consists of the current page, that page's number and the index of the current node
    within the page.

    Args:
        esedb: An instance of :class:`~dissect.esedb.esedb.EseDB`.
        root: The root page to open a BTree on, either as a page number or as an already loaded page.
    """

    def __init__(self, esedb: EseDB, root: int | Page):
        self.esedb = esedb

        # Accept either a page number (which is loaded through the database) or a page object
        if isinstance(root, int):
            page_num = root
            root = esedb.page(page_num)
        else:
            page_num = root.num

        self.root = root

        # Cursor state: current page, its page number and the node index within that page
        self._page = root
        self._page_num = page_num
        self._node_num = 0

    def reset(self) -> None:
        """Reset the internal state to the root of the BTree."""
        self._page = self.root
        self._page_num = self._page.num
        self._node_num = 0

    def node(self) -> Node:
        """Return the node the BTree is currently on."""
        return self._page.node(self._node_num)

    def next(self) -> Node:
        """Move the BTree to the next node and return it.

        Can move the BTree to the next page as a side effect.

        Raises:
            NoNeighbourPageError: If the cursor is on the last node of a page that has no next page.
        """
        if self._node_num + 1 > self._page.node_count - 1:
            # Already on the last node of this page, continue on the first node of the next page
            self.next_page()
        else:
            self._node_num += 1

        return self.node()

    def next_page(self) -> None:
        """Move the BTree to the first node of the next page in the tree.

        Raises:
            NoNeighbourPageError: If the current page has no next page.
        """
        if self._page.next_page:
            self._page = self.esedb.page(self._page.next_page)
            # Keep the page number in sync with the page, like reset() and search() do
            self._page_num = self._page.num
            self._node_num = 0
        else:
            raise NoNeighbourPageError(f"{self._page} has no next page")

    def prev(self) -> Node:
        """Move the BTree to the previous node and return it.

        Can move the BTree to the previous page as a side effect.

        Raises:
            NoNeighbourPageError: If the cursor is on the first node of a page that has no previous page.
        """
        if self._node_num - 1 < 0:
            # Already on the first node of this page, continue on the last node of the previous page
            self.prev_page()
        else:
            self._node_num -= 1

        return self.node()

    def prev_page(self) -> None:
        """Move the BTree to the last node of the previous page in the tree.

        Raises:
            NoNeighbourPageError: If the current page has no previous page.
        """
        if self._page.previous_page:
            self._page = self.esedb.page(self._page.previous_page)
            # Keep the page number in sync with the page, like reset() and search() do
            self._page_num = self._page.num
            self._node_num = self._page.node_count - 1
        else:
            raise NoNeighbourPageError(f"{self._page} has no previous page")

    def search(self, key: bytes, exact: bool = True) -> Node:
        """Search the tree for the given key.

        The search starts at the page the BTree is currently on (not necessarily the root) and
        descends through branch pages until a non-branch page is reached. The BTree is moved to the
        matching node or, if there is no exact match, to the node selected by :func:`find_node`:
        the first node whose key is not less than ``key``, or the last node of the page if all keys
        are smaller.

        Args:
            key: The key to search for.
            exact: Whether to only return successfully on an exact match.

        Raises:
            KeyNotFoundError: If an ``exact`` match was requested but not found. The BTree has
                already been moved to the closest node when this is raised.
        """
        page = self._page
        while True:
            node = find_node(page, key)

            if page.is_branch:
                # Follow the child pointer of the selected branch node one level down
                page = self.esedb.page(node.child)
            else:
                self._page = page
                self._page_num = page.num
                self._node_num = node.num
                break

        if exact and key != node.key:
            raise KeyNotFoundError(f"Can't find key: {key}")

        return self.node()


def find_node(page: Page, key: bytes) -> Node:
    """Binary search a page for the node matching ``key``.

    On an exact match the matching node is returned, except on branch pages where the node after the
    match is returned instead. Without an exact match, the first node whose key is not less than
    ``key`` is returned, or the last node of the page if every compared key is smaller.

    Args:
        page: The page to search.
        key: The key to search.
    """
    low = 0
    high = page.node_count - 1

    while low < high:
        middle = (low + high) // 2
        candidate = page.node(middle)
        candidate_key = candidate.key

        # BTree keys are compared the same way Python compares bytes: first on data, then on length
        if key == candidate_key:
            if not page.is_branch:
                return candidate

            # Branch page keys appear to be non-inclusive upper bounds, so on an exact match the
            # actual leaf nodes live under the next branch node
            successor = min(middle + 1, page.node_count - 1)
            return page.node(successor)

        if key < candidate_key:
            high = middle
        else:
            low = middle + 1

    # The search range narrowed down to a single node
    return page.node(low)
38 changes: 33 additions & 5 deletions dissect/esedb/c_esedb.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,8 @@
import struct
import uuid
from collections import namedtuple
from typing import Union

from dissect import cstruct
from dissect.cstruct import cstruct

# https://github.com/microsoft/Extensible-Storage-Engine
c_esedb_def = """
Expand Down Expand Up @@ -425,10 +424,37 @@
DotNetGuid = 0x00040000, // index over GUID column according to .Net GUID sort order
ImmutableStructure = 0x00080000, // Do not write to the input structures during a JetCreateIndexN call.
};
flag IDBFLAG : uint16 {
Unique = 0x0001, // Duplicate keys not allowed
AllowAllNulls = 0x0002, // Make entries for NULL keys (all segments are null)
AllowFirstNull = 0x0004, // First index column NULL allowed in index
AllowSomeNulls = 0x0008, // Make entries for keys with some null segments
NoNullSeg = 0x0010, // Don't allow a NULL key segment
Primary = 0x0020, // Index is the primary index
LocaleSet = 0x0040, // Index locale information (locale name) is set (JET_bitIndexUnicode was specified).
Multivalued = 0x0080, // Has a multivalued segment
TemplateIndex = 0x0100, // Index of a template table
DerivedIndex = 0x0200, // Index derived from template table
// Note that this flag is persisted, but
// never used in an in-memory IDB, because
// we use the template index IDB instead.
LocalizedText = 0x0400, // Has a unicode text column? (code page is 1200)
SortNullsHigh = 0x0800, // NULL sorts after data
// Jan 2012: MSU is being removed. fidbUnicodeFixupOn should no longer be referenced.
UnicodeFixupOn_Deprecated = 0x1000, // Track entries with undefined Unicode codepoints
CrossProduct = 0x2000, // all combinations of multi-valued columns are indexed
DisallowTruncation = 0x4000, // fail update rather than allow key truncation
NestedTable = 0x8000, // combinations of multi-valued columns of same itagSequence are indexed
};
flag IDXFLAG : uint16 {
ExtendedColumns = 0x0001, // IDXSEGs are comprised of JET_COLUMNIDs, not FIDs
DotNetGuid = 0x0002, // GUIDs sort according to .Net rules
};
""" # noqa E501

c_esedb = cstruct.cstruct()
c_esedb.load(c_esedb_def)
c_esedb = cstruct().load(c_esedb_def)

ulDAEMagic = 0x89ABCDEF
pgnoFDPMSO = 4
Expand All @@ -444,14 +470,16 @@
TAGFLD_HEADER = c_esedb.TAGFLD_HEADER
CODEPAGE = c_esedb.CODEPAGE
COMPRESSION_SCHEME = c_esedb.COMPRESSION_SCHEME
IDBFLAG = c_esedb.IDBFLAG
IDXFLAG = c_esedb.IDXFLAG

# Maps CODEPAGE members to the name of the Python codec used for that code page
CODEPAGE_MAP = {
CODEPAGE.UNICODE: "utf-16-le",
CODEPAGE.WESTERN: "cp1252",
CODEPAGE.ASCII: "ascii",
}

RecordValue = Union[int, float, str, bytes, datetime.datetime, None]
RecordValue = int | float | str | bytes | datetime.datetime | None


def decode_bit(buf: bytes) -> bool:
Expand Down
3 changes: 1 addition & 2 deletions dissect/esedb/compression.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import struct
from typing import Optional

from dissect.util.compression import lzxpress, sevenbit

Expand Down Expand Up @@ -29,7 +28,7 @@ def decompress(buf: bytes) -> bytes:
return buf


def decompress_size(buf: bytes) -> Optional[int]:
def decompress_size(buf: bytes) -> int | None:
"""Return the decompressed size of the given bytes according to the encoded compression scheme.
Args:
Expand Down
Loading

0 comments on commit f1dfe62

Please sign in to comment.