From d7aa0dccb271525b7a59ea096dc2d0e13e09ee0b Mon Sep 17 00:00:00 2001 From: Joachim Metz Date: Tue, 2 Jan 2024 09:08:49 +0100 Subject: [PATCH] Worked on LevelDB database format support --- .../LevelDB database format.asciidoc | 199 ++++++++++------ dtformats/leveldb.debug.yaml | 9 + dtformats/leveldb.py | 215 ++++++++++++------ dtformats/leveldb.yaml | 17 ++ scripts/leveldb.py | 7 +- 5 files changed, 303 insertions(+), 144 deletions(-) diff --git a/documentation/LevelDB database format.asciidoc b/documentation/LevelDB database format.asciidoc index 1438236..302e5c3 100644 --- a/documentation/LevelDB database format.asciidoc +++ b/documentation/LevelDB database format.asciidoc @@ -76,160 +76,127 @@ For example: 00000000 4d 41 4e 49 46 45 53 54 2d 30 30 30 30 30 31 0a |MANIFEST-000001.| .... -== Write ahead log file (.ldb) +== Descriptor file -A write ahead log file consists of: +The descriptor file is a <> that consists of: -* one or more 32k pages -** one or more data blocks +* one or more descriptor records -[cols="1,5",options="header"] -|=== -| Characteristics | Description -| Byte order | little-endian -| Date and time values | -| Character strings | -|=== +=== Descriptor record -=== Log block +A descriptor (VersionEdit) record consists of: -A log block is of variable size and consists of: +* One or more descriptor values -[cols="1,1,1,5",options="header"] -|=== -| Offset | Size | Value | Description -| 0 | 4 | | Checksum + -Contains a CRC-32 -| 4 | 2 | | Record data size -| 5 | 1 | | Record type + -See: <> -| 6 | record data size | | Record data -|=== +=== Descriptor value -==== [[log_record_types]]Log record types +A descriptor value consists of: -[cols="1,1,5",options="header"] -|=== -| Value | Identifier | Description -| 1 | FULL | Full record -| 2 | FIRST | First segment of record data -| 3 | MIDDLE | Intermediate segment of record data -| 4 | LAST | Last segment of record data -|=== - -=== Log record - -A log record consists of: - -* One or more tagged 
values - -Where each tagged values consists of: - -* A <> +* A <> * Value data -==== [[log_value_tags]]Log value tags +==== [[descriptor_value_tags]]Descriptor value tags [cols="1,1,5",options="header"] |=== | Value | Identifier | Description | 1 | kComparator | Comparator + -See: <> +See: <> | 2 | kLogNumber | Log number + -See: <> +See: <> | 3 | kNextFileNumber | Next file number + -See: <> +See: <> | 4 | kLastSequence | Last sequence number + -See: <> +See: <> | 5 | kCompactPointer | Compact pointer + -See: <> +See: <> | 6 | kDeletedFile | Deleted file + -See: <> +See: <> | 7 | kNewFile | New file + -See: <> +See: <> | 8 | | [yellow-background]*Unknown (was used for large value references)* | 9 | kPrevLogNumber | Previous log number + -See: <> +See: <> |=== -==== [[log_comparator_value]]Comparator value +==== [[descriptor_comparator_value]]Comparator value [cols="1,1,1,5",options="header"] |=== | 0 | 1 | 1 | Value tag + Contains a <> + -See: <> +See: <> | 1 | ... | | Name string size | ... | ... | | Name string + Contains an UTF-8 encoded string without end-of-string character |=== -==== [[log_log_number_value]]Log number value +==== [[descriptor_log_number_value]]Log number value [cols="1,1,1,5",options="header"] |=== | 0 | 1 | 2 | Value tag + Contains a <> + -See: <> +See: <> | 1 | ... | | Log number + Contains a <> |=== -==== [[log_next_file_number_value]]Next file number value +==== [[descriptor_next_file_number_value]]Next file number value [cols="1,1,1,5",options="header"] |=== | 0 | 1 | 3 | Value tag + Contains a <> + -See: <> +See: <> | 1 | ... | | Next file number + Contains a <> |=== -==== [[log_last_sequence_number_value]]Last sequence number value +==== [[descriptor_last_sequence_number_value]]Last sequence number value [cols="1,1,1,5",options="header"] |=== | 0 | 1 | 4 | Value tag + Contains a <> + -See: <> +See: <> | 1 | ... 
| | Last sequence number + Contains a <> |=== -==== [[log_compact_pointer_value]]Compact pointer value +==== [[descriptor_compact_pointer_value]]Compact pointer value [cols="1,1,1,5",options="header"] |=== | 0 | 1 | 5 | Value tag + Contains a <> + -See: <> +See: <> | 1 | ... | | Level + Contains a <> | ... | ... | | Key + -Contains a <> +Contains a <> |=== -==== [[log_deleted_file_value]]Deleted file value +==== [[descriptor_deleted_file_value]]Deleted file value [cols="1,1,1,5",options="header"] |=== | 0 | 1 | 6 | Value tag + Contains a <> + -See: <> +See: <> | 1 | ... | | Level + Contains a <> | ... | ... | | Number of files + Contains a <> |=== -==== [[log_new_file_value]]New file value +==== [[descriptor_new_file_value]]New file value [cols="1,1,1,5",options="header"] |=== | 0 | 1 | 7 | Value tag + Contains a <> + -See: <> +See: <> | 1 | ... | | Level + Contains a <> | ... | ... | | Number of files + @@ -237,23 +204,91 @@ Contains a <> | ... | ... | | File size + Contains a <> | ... | ... | | Smallest record key + -Contains a <> +Contains a <> | ... | ... | | Largest record key + -Contains a <> +Contains a <> |=== -==== [[log_previous_log_number_value]]Previous log number value +==== [[descriptor_previous_log_number_value]]Previous log number value [cols="1,1,1,5",options="header"] |=== | 0 | 1 | 9 | Value tag + Contains a <> + -See: <> +See: <> | 1 | ... 
| | Previous log number + Contains a <> |=== -==== [[log_key_value]]Key value +== [[log_file]]Write ahead log file (.log) + +A write ahead log file consists of: + +* one or more 32k pages +** one or more log blocks + +[cols="1,5",options="header"] +|=== +| Characteristics | Description +| Byte order | little-endian +| Date and time values | +| Character strings | +|=== + +=== Log block + +A log block is of variable size and consists of: + +[cols="1,1,1,5",options="header"] +|=== +| Offset | Size | Value | Description +| 0 | 4 | | Checksum + +Contains a CRC-32 +| 4 | 2 | | Record data size +| 6 | 1 | | Record type + +See: <> +| 7 | record data size | | Record data +|=== + +==== [[log_record_types]]Log record types + +[cols="1,1,5",options="header"] +|=== +| Value | Identifier | Description +| 1 | FULL | Full record +| 2 | FIRST | First segment of record data +| 3 | MIDDLE | Intermediate segment of record data +| 4 | LAST | Last segment of record data +|=== + +=== Log record + +A log (WriteBatch) record consists of: + +* value header + +==== Log value header + +A log value header is 12 bytes in size and consists of: + +[cols="1,1,1,5",options="header"] +|=== +| 0 | 8 | | Sequence number +| 8 | 4 | | [yellow-background]*Unknown (Count?)* +|=== + +==== [[log_value_types]]Log value types + +[cols="1,1,5",options="header"] +|=== +| Value | Identifier | Description +| 0 | kTypeDeletion | Deletion + +See: <> +| 1 | kTypeValue | Put + +See: <> +|=== + +==== [[log_slice_value]]Slice value [cols="1,1,1,5",options="header"] |=== @@ -261,9 +296,29 @@ Contains a <> | ... | ... | | Data |=== +==== [[log_deletion_value]]Deletion value + +[cols="1,1,1,5",options="header"] +|=== +| 0 | 1 | 0 | Value type +| 1 | ... | | Key + +Contains a <> +|=== + +==== [[log_put_value]]Put value + +[cols="1,1,1,5",options="header"] +|=== +| 0 | 1 | 1 | Value type +| 1 | ... | | Key + +Contains a <> +| ... | ... 
| | Value + +Contains a <> +|=== + == Sorted tables file (.ldb) -A sorted tables file file consists of: +A sorted tables file consists of: * one or more data blocks * one or more metadata blocks diff --git a/dtformats/leveldb.debug.yaml b/dtformats/leveldb.debug.yaml index 9ca9156..cabfaaf 100644 --- a/dtformats/leveldb.debug.yaml +++ b/dtformats/leveldb.debug.yaml @@ -15,6 +15,15 @@ attributes: description: "Record data" format: binary_data --- +data_type_map: leveldb_log_value_header +attributes: +- name: sequence_number + description: "Sequence number" + format: decimal +- name: count + description: "Count" + format: decimal +--- data_type_map: leveldb_table_footer attributes: - name: metaindex_block_offset diff --git a/dtformats/leveldb.py b/dtformats/leveldb.py index 63b26fd..88e220f 100644 --- a/dtformats/leveldb.py +++ b/dtformats/leveldb.py @@ -52,15 +52,9 @@ class LevelDBDatabaseLogFile(data_format.BinaryDataFile): 3: 'MIDDLE', 4: 'LAST'} - _VALUE_TAGS = { - 1: 'kComparator', - 2: 'kLogNumber', - 3: 'kNextFileNumber', - 4: 'kLastSequence', - 5: 'kCompactPointer', - 6: 'kDeletedFile', - 7: 'kNewFile', - 9: 'kPrevLogNumber'} + _VALUE_TYPES = { + 0: 'kTypeDeletion', + 1: 'kTypeValue'} def __init__(self, debug=False, file_system_helper=None, output_writer=None): """Initializes a LevelDB write ahead log file. @@ -98,6 +92,144 @@ def _ReadBlock(self, file_object, file_offset): return block, bytes_read + def _ReadRecord(self, file_offset, data, data_size): + """Reads a record. + + Args: + file_offset (int): offset of the record relative to the start of the file. + data (bytes): record data. + data_size (int): record data size. + + Raises: + ParseError: if the record cannot be read. 
+ """ + # pylint: disable=unused-variable + + if self._debug: + value_string, _ = self._FormatIntegerAsDecimal(file_offset) + self._DebugPrintValue('Offset', value_string) + + value_header, data_offset = self._ReadRecordValueHeader(file_offset, data) + + while data_offset < data_size: + value_type = int(data[data_offset]) + data_offset += 1 + + if self._debug: + value_type_string = self._VALUE_TYPES.get(value_type, 'UNKNOWN') + value_string, _ = self._FormatIntegerAsDecimal(value_type) + self._DebugPrintValue( + 'Value type', f'{value_string:s} ({value_type_string:s})') + + if value_type not in (0, 1): + raise errors.ParseError(f'Unsupported value type: {value_type:d}') + + key, bytes_read = self._ReadRecordValueSlice(data[data_offset:], 'Key') + data_offset += bytes_read + + if value_type == 1: + value, bytes_read = self._ReadRecordValueSlice( + data[data_offset:], 'Value') + data_offset += bytes_read + + def _ReadRecordValueHeader(self, file_offset, data): + """Reads a value header. + + Args: + file_offset (int): offset of the record relative to the start of the file. + data (bytes): record data. + + Returns: + tuple[leveldb_log_value_header, int]: value header and number of bytes + read. + + Raises: + ParseError: if the value header cannot be read. + """ + data_type_map = self._GetDataTypeMap('leveldb_log_value_header') + + value_header = self._ReadStructureFromByteStream( + data, file_offset, data_type_map, 'Value header') + + if self._debug: + debug_info = self._DEBUG_INFORMATION.get('leveldb_log_value_header', None) + self._DebugPrintStructureObject(value_header, debug_info) + + return value_header, 12 + + def _ReadRecordValueSlice(self, data, description): + """Reads a slice record value. + + Args: + data (bytes): value data. + description (str): description of the value. + + Returns: + tuple[bytes, int]: slice value and number of bytes read. + + Raises: + ParseError: if the value cannot be read. 
+ """ + data_size, bytes_read = _ReadVariableSizeInteger(data) + + value_data = data[bytes_read:bytes_read + data_size] + + if self._debug: + value_string, _ = self._FormatIntegerAsDecimal(data_size) + self._DebugPrintValue(f'{description:s} size', value_string) + + self._DebugPrintData(description, value_data) + + return value_data, bytes_read + data_size + + def ReadFileObject(self, file_object): + """Reads a LevelDB write ahead log file-like object. + + Args: + file_object (file): file-like object. + + Raises: + ParseError: if the file cannot be read. + """ + file_offset = 0 + page_size = 32 * 1024 + record_data = b'' + record_offset = 0 + + while file_offset < self._file_size: + log_block, bytes_read = self._ReadBlock(file_object, file_offset) + + if log_block.record_type in (1, 2): + record_data = log_block.record_data + record_offset = file_offset + + elif log_block.record_type in (3, 4): + record_data = b''.join([record_data, log_block.record_data]) + + if log_block.record_type in (1, 4): + self._ReadRecord(record_offset, record_data, len(record_data)) + + file_offset += bytes_read + page_size -= bytes_read + + if page_size <= 6: + file_offset += page_size + page_size = 32 * 1024 + + +class LevelDBDatabaseDescriptorFile(LevelDBDatabaseLogFile): + """LevelDB descriptor file.""" + + _VALUE_TAGS = { + 1: 'kComparator', + 2: 'kLogNumber', + 3: 'kNextFileNumber', + 4: 'kLastSequence', + 5: 'kCompactPointer', + 6: 'kDeletedFile', + 7: 'kNewFile', + 9: 'kPrevLogNumber'} + def _ReadRecord(self, file_offset, data, data_size): """Reads a record. 
@@ -151,7 +283,7 @@ def _ReadRecord(self, file_offset, data, data_size): data[data_offset:], 'Level') data_offset += bytes_read - key, bytes_read = self._ReadRecordValueKey(data[data_offset:], 'Key') + key, bytes_read = self._ReadRecordValueSlice(data[data_offset:], 'Key') elif value_tag == 6: level, bytes_read = self._ReadRecordValueInteger( @@ -174,11 +306,11 @@ def _ReadRecord(self, file_offset, data, data_size): data[data_offset:], 'File size') data_offset += bytes_read - smallest_record_key, bytes_read = self._ReadRecordValueKey( + smallest_record_key, bytes_read = self._ReadRecordValueSlice( data[data_offset:], 'Smallest record key') data_offset += bytes_read - largest_record_key, bytes_read = self._ReadRecordValueKey( + largest_record_key, bytes_read = self._ReadRecordValueSlice( data[data_offset:], 'Largest record key') elif value_tag == 9: @@ -208,31 +340,6 @@ def _ReadRecordValueInteger(self, data, description): return integer_value, bytes_read - def _ReadRecordValueKey(self, data, description): - """Reads a key record value. - - Args: - data (bytes): value data. - description (str): description of the value. - - Returns: - tuple[bytes, int]: key value and number of bytes read. - - Raises: - ParseError: if the value cannot be read. - """ - data_size, bytes_read = _ReadVariableSizeInteger(data) - - key_data = data[bytes_read:bytes_read + data_size] - - if self._debug: - value_string, _ = self._FormatIntegerAsDecimal(data_size) - self._DebugPrintValue(f'{description:s} key size', value_string) - - self._DebugPrintData(f'{description:s} key', key_data) - - return key_data, bytes_read + data_size - def _ReadRecordValueString(self, data, description): """Reads a string record value. @@ -260,40 +367,6 @@ def _ReadRecordValueString(self, data, description): return string_value, bytes_read + data_size - def ReadFileObject(self, file_object): - """Reads a LevelDB write ahead log file-like object. - - Args: - file_object (file): file-like object. 
- - Raises: - ParseError: if the file cannot be read. - """ - file_offset = 0 - page_size = 32 * 1024 - record_data = b'' - record_offset = 0 - - while file_offset < self._file_size: - log_block, bytes_read = self._ReadBlock(file_object, file_offset) - - if log_block.record_type in (1, 2): - record_data = log_block.record_data - record_offset = file_offset - - elif log_block.record_type in (3, 4): - record_data = b''.join([record_data, log_block.record_data]) - - if log_block.record_type in (1, 4): - self._ReadRecord(record_offset, record_data, len(record_data)) - - file_offset += bytes_read - page_size -= bytes_read - - if page_size <= 6: - file_offset += page_size - page_size = 32 * 1024 - class LevelDBDatabaseTableFile(data_format.BinaryDataFile): """LevelDB database sorted tables (.ldb) file.""" diff --git a/dtformats/leveldb.yaml b/dtformats/leveldb.yaml index 0821981..dd7f386 100644 --- a/dtformats/leveldb.yaml +++ b/dtformats/leveldb.yaml @@ -33,6 +33,13 @@ attributes: size: 4 units: bytes --- +name: uint64 +type: integer +attributes: + format: unsigned + size: 8 + units: bytes +--- name: leveldb_log_block type: structure attributes: @@ -49,6 +56,16 @@ members: element_data_type: byte elements_data_size: leveldb_log_block.record_data_size --- +name: leveldb_log_value_header +type: structure +attributes: + byte_order: little-endian +members: +- name: sequence_number + data_type: uint64 +- name: count + data_type: uint32 +--- name: leveldb_table_footer type: structure attributes: diff --git a/scripts/leveldb.py b/scripts/leveldb.py index 85bd14b..c9e7a65 100755 --- a/scripts/leveldb.py +++ b/scripts/leveldb.py @@ -80,11 +80,16 @@ def Main(): finally: file_object.close() - # TODO: add support for descriptor file + path_segments = file_system_helper.SplitPath(options.source) if file_signature == b'\x57\xfb\x80\x8b\x24\x75\x47\xdb': leveldb_file = leveldb.LevelDBDatabaseTableFile( debug=options.debug, output_writer=output_writer) + + elif 
path_segments[-1].startswith('MANIFEST'): + leveldb_file = leveldb.LevelDBDatabaseDescriptorFile( + debug=options.debug, output_writer=output_writer) + else: leveldb_file = leveldb.LevelDBDatabaseLogFile( debug=options.debug, output_writer=output_writer)