Add LZO decompression (#14)

(DIS-1698)
fox-it · Dec 15, 2022 · 7870af8 · 7870af8
1 parent 630f01f
commit 7870af8
Show file tree

Hide file tree

Showing 2 changed files with 166 additions and 1 deletion.
diff --git a/dissect/util/compression/lzo.py b/dissect/util/compression/lzo.py
@@ -0,0 +1,115 @@
+import io
+import struct
+from typing import BinaryIO, Union
+
+
+def _count_zeroes(src: BinaryIO):
+    """Read a zero-run extended length from ``src``.
+
+    Every ``0x00`` byte read contributes 255 to the total; the first non-zero byte ends the run and
+    its value is added on top.
+
+    Args:
+        src: Binary stream positioned at the first byte of the run.
+
+    Returns:
+        The accumulated length.
+
+    Raises:
+        ValueError: If the zero bytes alone already account for more than ``2**20``.
+    """
+    run = 0
+    while True:
+        current = src.read(1)[0]
+        # The limit is evaluated after the byte following a zero has been read, whatever its value.
+        if run > 2**20:
+            raise ValueError("Too many zeroes")
+        if current != 0:
+            return run + current
+        run += 255
+
+
+def _copy_block(src: BinaryIO, dst: bytearray, length: int, distance: int, trailing: int) -> None:
+    """Append a back-reference copy to ``dst``, followed by literal bytes read from ``src``.
+
+    Args:
+        src: Binary stream the trailing literal bytes are read from.
+        dst: Output buffer; it is both the source of the copy and extended in place.
+        length: Number of bytes to copy from the already produced output.
+        distance: How far back from the end of ``dst`` the copy starts.
+        trailing: Number of literal bytes to read from ``src`` and append after the copy.
+
+    Raises:
+        ValueError: If ``distance`` does not point inside the data produced so far.
+    """
+    # A distance beyond the start of the output would be clamped silently by slicing (copying the
+    # wrong bytes), and an empty output could never satisfy the requested length.
+    if not 0 < distance <= len(dst):
+        raise ValueError("Invalid match distance")
+
+    start = len(dst) - distance
+    if length <= distance:
+        chunk = dst[start : start + length]
+    else:
+        # The copy overlaps its own output: the last ``distance`` bytes repeat until ``length`` is reached.
+        pattern = dst[start:]
+        repeats = -(-length // distance)  # ceiling division
+        chunk = (pattern * repeats)[:length]
+
+    dst.extend(chunk)
+    dst.extend(src.read(trailing))
+
+
+def decompress(src: Union[bytes, BinaryIO], header: bool = True, buflen: int = -1) -> bytes:
+    """LZO decompress from a file-like object or bytes.
+
+    Arguments are largely compatible with python-lzo API.
+
+    Decoding stops at the end-of-stream instruction, or as soon as the output is exactly as long as
+    the expected length (taken from the header, or from ``buflen`` when there is no header).
+
+    Args:
+        src: File-like object or bytes to decompress.
+        header: Whether the metadata header is included in the input. The header is one marker byte
+            (``0xF0`` or ``0xF1``) followed by a 32-bit output length.
+        buflen: If ``header`` is ``False``, a buffer length in bytes must be given that will fit the output.
+            It is only used to stop decoding once exactly that many bytes have been produced; with the
+            default of ``-1`` decoding stops at the end-of-stream instruction only.
+
+    Returns:
+        The decompressed data.
+
+    Raises:
+        ValueError: If the header marker byte is invalid, the stream starts with ``0x10`` (LZOv1),
+            or a zero-run length is too large.
+    """
+    if not hasattr(src, "read"):
+        src = io.BytesIO(src)
+
+    dst = bytearray()
+
+    if header:
+        byte = src.read(1)[0]
+        if byte not in [0xF0, 0xF1]:
+            raise ValueError("Invalid header value")
+        # struct.unpack always returns a tuple; take the integer so the length check in the loop can match.
+        # NOTE(review): python-lzo appears to store this length big-endian -- confirm the byte order
+        # against real python-lzo output before relying on the header length.
+        out_len = struct.unpack("<I", src.read(4))[0]
+    else:
+        out_len = buflen
+
+    val = src.read(1)[0]
+    if val == 0x10:
+        raise ValueError("LZOv1")
+    elif val >= 0x12:
+        # A first byte of 0x12 or more is an initial literal run of (val - 0x11) bytes.
+        dst += src.read(val - 0x11)
+        val = src.read(1)[0]
+
+    # Number of literal bytes copied by the previous match instruction; it changes the meaning of
+    # instructions 0x00..0x0F.
+    # NOTE(review): the initial literal run above leaves this at 0 -- check against the LZO1X format
+    # description whether the instruction following it needs different handling.
+    trailing = 0
+    while True:
+        if val <= 0xF:
+            if not trailing:
+                # Literal run: the length is val + 3, or a zero-run extended length when val is 0.
+                if val == 0:
+                    dst += src.read(_count_zeroes(src) + 18)
+                else:
+                    dst += src.read(val + 3)
+            else:
+                # Two byte match at a short distance.
+                h = src.read(1)[0]
+                dist = (h << 2) + (val >> 2) + 1
+                length = 2
+                trailing = val & 3
+                _copy_block(src, dst, length, dist, trailing)
+        elif val <= 0x1F:
+            # Match at a distance of 16384 or more.
+            if val & 7 == 0:
+                length = 9 + _count_zeroes(src)
+            else:
+                length = (val & 7) + 2
+            ds = struct.unpack("<H", src.read(2))[0]
+            dist = 16384 + ((val & 8) >> 3) + (ds >> 2)
+            if dist == 16384:
+                # A distance of exactly 16384 ends the stream.
+                break
+            trailing = ds & 3
+            _copy_block(src, dst, length, dist, trailing)
+        elif val <= 0x3F:
+            # Match with the distance and trailing literal count packed in the next 16 bits.
+            length = val & 31
+            if length == 0:
+                length = _count_zeroes(src) + 31
+            length += 2
+            ds = struct.unpack("<H", src.read(2))[0]
+            dist = 1 + (ds >> 2)
+            trailing = ds & 3
+            _copy_block(src, dst, length, dist, trailing)
+        else:
+            # Short match: length, low distance bits and trailing count all come from the instruction byte.
+            if val <= 0x7F:
+                length = 3 + ((val >> 5) & 1)
+            else:
+                length = 5 + ((val >> 5) & 3)
+            h = src.read(1)[0]
+            d = (val >> 2) & 7
+            dist = (h << 3) + d + 1
+            trailing = val & 3
+            _copy_block(src, dst, length, dist, trailing)
+
+        # Stop early once the expected amount of output has been produced.
+        if len(dst) == out_len:
+            break
+
+        val = src.read(1)[0]
+
+    return bytes(dst)
diff --git a/tests/test_compression.py b/tests/test_compression.py
@@ -1,4 +1,13 @@
-from dissect.util.compression import lz4, lznt1, lzxpress_huffman, lzxpress, sevenbit
+import hashlib
+
+from dissect.util.compression import (
+    lz4,
+    lznt1,
+    lzo,
+    lzxpress,
+    lzxpress_huffman,
+    sevenbit,
+)
 
 
 def test_lz4_decompress():
@@ -20,6 +29,47 @@ def test_lznt1_decompress():
     )
 
 
+def test_lzo_decompress():
+    """Check ``lzo.decompress`` on two streams passed with ``header`` set to ``False``."""
+    # Short stream: the output is compared directly.
+    short_stream = bytes.fromhex("0361626361626320f314000f616263616263616263616263616263616263110000")
+    assert lzo.decompress(short_stream, False) == b"abc" * 100
+
+    # Longer stream, decoded with an explicit buffer length of 8192: the output is compared by SHA-256 digest.
+    long_stream = bytes.fromhex(
+        "160900a40100400003a83e8e6302003800007104ff4000fc012add00032016dd"
+        "00042016dd00052016dd00062016dd00072016dd00082016dd00092016dd000a"
+        "2016dd000b2016dd000c2016dd000d2016dd000e2016dd000f2016dd00102016"
+        "dd00112016dd00122016dd00132016dd00142016dd00152016dd00162016dd00"
+        "172016dd00182016dd00192016dd001a2016dd001b2016dd001c2016dd001d20"
+        "16dd001e2016dd001f2016dd00202016dd00212016dd00222016dd00232016dd"
+        "00242016dd00252016dd00262016dd00272016dd00282016dd00292016dd002a"
+        "2016dd002b2016dd002c2016dd002d2016dd002e2016dd002f2016dd00302016"
+        "dd00312016dd00322016dd00332016dd00342016dd00352016dd00362016dd00"
+        "372016dd00382016dd00392016dd003a2016dd003b2016dd003c2016dd003d20"
+        "16dd003e2016dd003f2016dd00402016dd00412016dd00422016dd00432016dd"
+        "00442016dd00452016dd00462016dd00472016dd00482016dd00492016dd004a"
+        "2016dd004b2016dd004c2016dd004d2016dd004e2016dd004f2016dd00502016"
+        "dd00512016dd00522016dd00532016dd00542016dd00552016dd00562016dd00"
+        "572016dd00582016dd00592016dd005a2016dd005b2016dd005c2016dd005d20"
+        "16dd005e2016dd005f2016dd00602016dd00612016dd00622016dd00632016dd"
+        "00642016dd0065200adf000800ed27dc006001228d57e32501556c29dc00fd0b"
+        "f55d04662b5c00307d010031dd004f5d06675c0027ce06c03f3b5e02e4022059"
+        "0e00880228dd02115d16682002bc03ff020a00ff8902c75d0669dc0322dc5507"
+        "736d616c6c2d66696c652a9500d455046ad404229455016469722f6f045f3639"
+        "2a9a00eb4209096bd80422b0526804082d776974682d78617474722a1e077543"
+        "080a7c3622cd5d91cd126d9500e0943a110000"
+    )
+    digest = hashlib.sha256(lzo.decompress(long_stream, False, 8192)).hexdigest()
+    assert digest == "a4d6951085717a9698cd814899d11c931db1d4c0f7ddc3b1cba0f582142d4cf4"
+
+
 def test_lzxpress_huffman_decompress():
     assert (
         lzxpress_huffman.decompress(