Commit 1684200

Merge pull request #22 from mkrd/improve-partial

Improve partial

mkrd authored Nov 8, 2022
2 parents d24426f + 23f7dc0 commit 1684200
Showing 8 changed files with 171 additions and 120 deletions.
25 changes: 25 additions & 0 deletions dictdatabase/indexing.py
@@ -0,0 +1,25 @@
import orjson
import os
from . import config


class Indexer:

def __init__(self, db_name: str):
db_name = db_name.replace("/", "___")
path = os.path.join(config.storage_directory, ".ddb", f"{db_name}.index")
self.path = path
os.makedirs(os.path.dirname(path), exist_ok=True)
if not os.path.exists(self.path):
self.data = {}
else:
with open(self.path, "rb") as f:
self.data = orjson.loads(f.read())

def get(self, key):
return self.data.get(key, None)

def write(self, key, start_index, end_index, indent_level, indent_with, value_hash):
self.data[key] = [start_index, end_index, indent_level, indent_with, value_hash]
with open(self.path, "wb") as f:
f.write(orjson.dumps(self.data))
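
A minimal usage sketch of the new Indexer (not part of the commit). It assumes config.storage_directory points at a writable local directory; the key, byte offsets, and hash below are illustrative values only:

import hashlib
from dictdatabase import config, indexing

config.storage_directory = "./ddb_storage"  # assumed writable directory

# "/" in the db name is mapped to "___" in the .ddb/<name>.index file name
indexer = indexing.Indexer("users/active")

value_bytes = b'{"name": "Ann"}'
# Record where the value of "user_1" lives inside the db file, plus a hash of its bytes
indexer.write(
    key="user_1",
    start_index=7,
    end_index=7 + len(value_bytes),
    indent_level=1,
    indent_with="\t",
    value_hash=hashlib.sha256(value_bytes).hexdigest(),
)

print(indexer.get("user_1"))   # [7, 22, 1, "\t", "<sha256 hex digest>"]
print(indexer.get("missing"))  # None
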
57 changes: 57 additions & 0 deletions dictdatabase/io_bytes.py
@@ -0,0 +1,57 @@
import zlib
import os
from . import config, utils


def read(db_name: str, start=None, end=None) -> bytes:
"""
	Read the content of a db as bytes. Reading works even when the config
changes, so a compressed ddb file can also be read if compression is
disabled, and vice versa.
"""
json_path, json_exists, ddb_path, ddb_exists = utils.db_paths(db_name)

if json_exists:
if ddb_exists:
raise FileExistsError(f"DB Inconsistency: \"{db_name}\" exists as .json and .ddb")
with open(json_path, "rb") as f:
if start is not None and end is not None:
f.seek(start)
return f.read(end - start)
return f.read()
if not ddb_exists:
raise FileNotFoundError(f"DB does not exist: \"{db_name}\"")
with open(ddb_path, "rb") as f:
json_bytes = zlib.decompress(f.read())
if start is not None and end is not None:
return json_bytes[start:end]
return json_bytes


def write(db_name: str, dump: bytes):
"""
Write the bytes to the file of the db_path.
If the db was compressed but now config.use_compression is False,
remove the compressed file, and vice versa.
"""
json_path, json_exists, ddb_path, ddb_exists = utils.db_paths(db_name)
# Write bytes or string to file
remove_this = None
if config.use_compression:
write_path = ddb_path
if json_exists:
remove_this = json_path
dump = zlib.compress(dump, 1)
else:
write_path = json_path
if ddb_exists:
remove_this = ddb_path

# Write bytes or string to file
with open(write_path, "wb") as f:
f.write(dump)

# Remove the other file if it exists
# This is done after writing to avoid data loss
if remove_this is not None:
os.remove(remove_this)
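
A short round-trip sketch for the new io_bytes module (not part of the commit), assuming an uncompressed configuration and a local ./ddb_storage directory; the db name "users" and its content are made up for illustration:

import os
from dictdatabase import config, io_bytes

config.storage_directory = "./ddb_storage"  # assumed location
config.use_compression = False              # store a plain .json file
os.makedirs(config.storage_directory, exist_ok=True)

io_bytes.write("users", b'{"u1": {"name": "Ann"}}')

# Full read returns the raw bytes, decompressing transparently if the file is a .ddb
print(io_bytes.read("users"))

# Partial read: only the byte range [start, end) of the stored file
print(io_bytes.read("users", start=7, end=22))  # b'{"name": "Ann"}'
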
2 changes: 1 addition & 1 deletion dictdatabase/io_safe.py
@@ -21,7 +21,7 @@ def partial_read(db_name: str, key: str):
if not json_exists and not ddb_exists:
return None
with locking.ReadLock(db_name):
return io_unsafe.partial_read(db_name, key)
return io_unsafe.partial_read_only(db_name, key)


def write(db_name: str, data: dict):
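
For context, a hedged sketch of how the renamed entry point is reached through the safe wrapper (not part of the commit); it assumes the "users" db written in the previous sketch and default configuration:

from dictdatabase import config, io_safe

config.storage_directory = "./ddb_storage"  # assumed to contain users.json

# Acquires a ReadLock, then delegates to io_unsafe.partial_read_only
user = io_safe.partial_read("users", key="u1")
print(user)  # {"name": "Ann"}, or None if the db file does not exist
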
178 changes: 69 additions & 109 deletions dictdatabase/io_unsafe.py
@@ -2,11 +2,8 @@
from dataclasses import dataclass
import orjson
import json
import zlib
import os
import hashlib
from pathlib import Path
from . import config, utils, byte_codes
from . import config, utils, byte_codes, indexing, io_bytes



@@ -24,61 +24,74 @@ class PartialFileHandle:
partial_dict: PartialDict
indent_level: int
indent_with: str
index_data: dict
indexer: indexing.Indexer


################################################################################
#### Reading
################################################################################


def read_bytes(db_name: str) -> bytes:
"""
	Read the content of a db as bytes.
Reading works even when the config changes, so a compressed ddb file can
also be read if compression is disabled, and vice versa.
"""
json_path, json_exists, ddb_path, ddb_exists = utils.db_paths(db_name)

if json_exists:
if ddb_exists:
raise FileExistsError(f"DB Inconsistency: \"{db_name}\" exists as .json and .ddb")
with open(json_path, "rb") as f:
return f.read()
if not ddb_exists:
raise FileNotFoundError(f"DB does not exist: \"{db_name}\"")
with open(ddb_path, "rb") as f:
return zlib.decompress(f.read())


def read(db_name: str) -> dict:
"""
Read the file at db_path from the configured storage directory.
	Make sure the file exists. If it does not, a FileNotFoundError is
raised.
"""
# Always use orjson to read the file, because it is faster
return orjson.loads(read_bytes(db_name))
return orjson.loads(io_bytes.read(db_name))


def try_read_bytes_by_index(indexer: indexing.Indexer, db_name, key):
if (index := indexer.get(key)) is None:
return None
start_index, end_index, _, _, value_hash = index
partial_bytes = io_bytes.read(db_name, start_index, end_index)
if value_hash != hashlib.sha256(partial_bytes).hexdigest():
return None
return orjson.loads(partial_bytes)


def partial_read_only(db_name: str, key: str) -> dict:
"""
Partially read a key from a db.
The key MUST be unique in the entire db, otherwise the behavior is undefined.
This is a lot faster than reading the entire db, because it does not parse
	the entire file, but only the <value> part of the <key>: <value> pair.
If the key is not found, a `KeyError` is raised.
"""


# Search for key in the index file
indexer = indexing.Indexer(db_name)

if (value_data := try_read_bytes_by_index(indexer, db_name, key)) is not None:
return value_data

# Not found in index file, search for key in the entire file
file_bytes = io_bytes.read(db_name)
key_start, key_end = utils.find_outermost_key_in_json_bytes(file_bytes, key)

def read_index_file(db_name: str) -> dict:
path = f"{config.storage_directory}/.ddb/{db_name.replace('/', '___')}.index"
Path(path).parent.mkdir(parents=True, exist_ok=True)
if not os.path.exists(path):
return {}
with open(path, "rb") as f:
return orjson.loads(f.read())
if key_end == -1:
raise KeyError(f"Key \"{key}\" not found in db \"{db_name}\"")

# Key found, now determine the bounds of the value
space_after_semicolon = 1 if file_bytes[key_end] == byte_codes.SPACE else 0
value_start = key_end + space_after_semicolon
value_end = utils.seek_index_through_value_bytes(file_bytes, value_start)

indent_level, indent_with = utils.detect_indentation_in_json_bytes(file_bytes, key_start)
value_bytes = file_bytes[value_start:value_end]

def write_index_file(index_data: dict, db_name: str, key, start_index, end_index, indent_level, indent_with, value_hash):
path = f"{config.storage_directory}/.ddb/{db_name.replace('/', '___')}.index"
index_data[key] = [start_index, end_index, indent_level, indent_with, value_hash]
with open(path, "wb") as f:
f.write(orjson.dumps(index_data))
# Write key info to index file
indexer.write(key, value_start, value_end, indent_level, indent_with,
hashlib.sha256(value_bytes).hexdigest()
)
return orjson.loads(value_bytes)


def partial_read(db_name: str, key: str, as_handle=False) -> PartialFileHandle | dict:
def get_partial_file_handle(db_name: str, key: str) -> PartialFileHandle:
"""
Partially read a key from a db.
The key MUST be unique in the entire db, otherwise the behavior is undefined.
@@ -88,89 +98,45 @@ def partial_read(db_name: str, key: str, as_handle=False) -> PartialFileHandle |
If the key is not found, a `KeyError` is raised.
"""

data = read_bytes(db_name)
data = io_bytes.read(db_name)

# Search for key in the index file
index_data = read_index_file(db_name)
index = index_data.get(key, None)
indexer = indexing.Indexer(db_name)
index = indexer.get(key)
if index is not None:
start_index, end_index, indent_level, indent_with, value_hash = index
partial_bytes = data[start_index:end_index]
partial_bytes_hash = hashlib.sha256(partial_bytes).hexdigest()
if value_hash == partial_bytes_hash:
if value_hash == hashlib.sha256(partial_bytes).hexdigest():
partial_value = orjson.loads(partial_bytes)
if not as_handle:
return partial_value
partial_dict = PartialDict(data[:start_index], key, partial_value, data[end_index:])
return PartialFileHandle(db_name, partial_dict, indent_level, indent_with, index_data)
return PartialFileHandle(db_name, partial_dict, indent_level, indent_with, indexer)

# Not found in index file, search for key in the entire file
json_key = f"\"{key}\":".encode()
json_key_start_index = utils.find_outermost_key_index_in_json_bytes(data, json_key)
json_key_end_index = json_key_start_index + len(json_key)
key_start, key_end = utils.find_outermost_key_in_json_bytes(data, key)

if json_key_start_index == -1:
if key_end == -1:
raise KeyError(f"Key \"{key}\" not found in db \"{db_name}\"")

# Key found, now determine the bounds of the value
space_after_semicolon = 1 if data[json_key_end_index] == byte_codes.SPACE else 0
value_start_index = json_key_end_index + space_after_semicolon
value_end_index = utils.seek_index_through_value_bytes(data, value_start_index)
space_after_semicolon = 1 if data[key_end] == byte_codes.SPACE else 0
value_start = key_end + space_after_semicolon
value_end = utils.seek_index_through_value_bytes(data, value_start)

indent_level, indent_with = utils.detect_indentation_in_json_bytes(data, json_key_start_index)
partial_bytes = data[value_start_index:value_end_index]
indent_level, indent_with = utils.detect_indentation_in_json_bytes(data, key_start)
partial_bytes = data[value_start:value_end]

# Write key info to index file
write_index_file(
index_data,
db_name,
key,
value_start_index,
value_end_index,
indent_level,
indent_with,
hashlib.sha256(partial_bytes).hexdigest()
)

partial_value = orjson.loads(partial_bytes)
if not as_handle:
return partial_value

partial_dict = PartialDict(data[:value_start_index], key, partial_value, data[value_end_index:])
return PartialFileHandle(db_name, partial_dict, indent_level, indent_with, index_data)
partial_dict = PartialDict(data[:value_start], key, partial_value, data[value_end:])
return PartialFileHandle(db_name, partial_dict, indent_level, indent_with, indexer)


################################################################################
#### Writing
################################################################################


def write_bytes(db_name: str, dump: bytes):
"""
Write the bytes to the file of the db_path.
If the db was compressed but now config.use_compression is False,
remove the compressed file, and vice versa.
"""
json_path, json_exists, ddb_path, ddb_exists = utils.db_paths(db_name)
# Write bytes or string to file
if config.use_compression:
write_path = ddb_path
if json_exists:
os.remove(json_path)
else:
write_path = json_path
if ddb_exists:
os.remove(ddb_path)

# Compress if required
if config.use_compression:
dump = zlib.compress(dump, 1)

# Write bytes or string to file
with open(write_path, "wb") as f:
f.write(dump)


def write(db_name: str, data: dict):
"""
Write the dict db dumped as a json string
@@ -183,8 +149,7 @@ def write(db_name: str, data: dict):
else:
db_dump = json.dumps(data, indent=config.indent, sort_keys=config.sort_keys)
db_dump = db_dump.encode()

write_bytes(db_name, db_dump)
io_bytes.write(db_name, db_dump)


def partial_write(pf: PartialFileHandle):
@@ -199,20 +164,15 @@ def partial_write(pf: PartialFileHandle):
else:
partial_dump = json.dumps(pf.partial_dict.value, indent=config.indent, sort_keys=config.sort_keys)
partial_dump = partial_dump.encode()

# Add indentation
if pf.indent_level > 0 and pf.indent_with:
replace_this = "\n".encode()
replace_with = ("\n" + (pf.indent_level * pf.indent_with)).encode()
partial_dump = partial_dump.replace(replace_this, replace_with)

write_index_file(
pf.index_data,
pf.db_name,
pf.partial_dict.key,
len(pf.partial_dict.prefix),
len(pf.partial_dict.prefix) + len(partial_dump),
pf.indent_level,
pf.indent_with,
hashlib.sha256(partial_dump).hexdigest()
pf.indexer.write(pf.partial_dict.key, len(pf.partial_dict.prefix),
len(pf.partial_dict.prefix) + len(partial_dump), pf.indent_level,
pf.indent_with, hashlib.sha256(partial_dump).hexdigest()
)

write_bytes(pf.db_name, pf.partial_dict.prefix + partial_dump + pf.partial_dict.suffix)
io_bytes.write(pf.db_name, pf.partial_dict.prefix + partial_dump + pf.partial_dict.suffix)
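
A hedged sketch of the full partial read-modify-write cycle these refactored functions expose (not part of the commit). Locking is omitted because io_safe and the session layer normally provide it; the db and key are the illustrative ones from the earlier sketches:

from dictdatabase import config, io_unsafe

config.storage_directory = "./ddb_storage"  # assumed to contain users.json

# Fast path: served from the .index file when the stored hash matches, full scan otherwise
print(io_unsafe.partial_read_only("users", "u1"))  # {"name": "Ann"}

# Read-modify-write of a single key through a partial file handle
handle = io_unsafe.get_partial_file_handle("users", "u1")
handle.partial_dict.value["age"] = 30
io_unsafe.partial_write(handle)  # splices the new value back and refreshes the index entry
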
2 changes: 1 addition & 1 deletion dictdatabase/session.py
@@ -53,7 +53,7 @@ def __enter__(self) -> Tuple["DDBSession", JSONSerializable | T]:
elif self.op_type.file_key:
self.write_lock = locking.WriteLock(self.db_name)
self.write_lock._lock()
self.partial_handle = io_unsafe.partial_read(self.db_name, self.key, as_handle=True)
self.partial_handle = io_unsafe.get_partial_file_handle(self.db_name, self.key)
self.data_handle = self.partial_handle.partial_dict.value

elif self.op_type.file_where: