Commit 1684200

Merge pull request #22 from mkrd/improve-partial

Improve partial

mkrd authored Nov 8, 2022
2 parents d24426f + 23f7dc0 commit 1684200
Showing 8 changed files with 171 additions and 120 deletions.
25 changes: 25 additions & 0 deletions dictdatabase/indexing.py
@@ -0,0 +1,25 @@
import orjson
import os
from . import config


class Indexer:

def __init__(self, db_name: str):
db_name = db_name.replace("/", "___")
path = os.path.join(config.storage_directory, ".ddb", f"{db_name}.index")
self.path = path
os.makedirs(os.path.dirname(path), exist_ok=True)
if not os.path.exists(self.path):
self.data = {}
else:
with open(self.path, "rb") as f:
self.data = orjson.loads(f.read())

def get(self, key):
return self.data.get(key, None)

def write(self, key, start_index, end_index, indent_level, indent_with, value_hash):
self.data[key] = [start_index, end_index, indent_level, indent_with, value_hash]
with open(self.path, "wb") as f:
f.write(orjson.dumps(self.data))
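
A minimal usage sketch of the new Indexer (not part of the commit). It assumes config.storage_directory points at a writable local directory; the key, byte offsets, and hash below are illustrative values only:

import hashlib
from dictdatabase import config, indexing

config.storage_directory = "./ddb_storage"  # assumed writable directory

# "/" in the db name is mapped to "___" in the .ddb/<name>.index file name
indexer = indexing.Indexer("users/active")

value_bytes = b'{"name": "Ann"}'
# Record where the value of "user_1" lives inside the db file, plus a hash of its bytes
indexer.write(
    key="user_1",
    start_index=7,
    end_index=7 + len(value_bytes),
    indent_level=1,
    indent_with="\t",
    value_hash=hashlib.sha256(value_bytes).hexdigest(),
)

print(indexer.get("user_1"))   # [7, 22, 1, "\t", "<sha256 hex digest>"]
print(indexer.get("missing"))  # None
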
57 changes: 57 additions & 0 deletions dictdatabase/io_bytes.py
@@ -0,0 +1,57 @@
import zlib
import os
from . import config, utils


def read(db_name: str, start=None, end=None) -> bytes:
"""
	Read the content of a db as bytes. Reading works even when the config
changes, so a compressed ddb file can also be read if compression is
disabled, and vice versa.
"""
json_path, json_exists, ddb_path, ddb_exists = utils.db_paths(db_name)

if json_exists:
if ddb_exists:
raise FileExistsError(f"DB Inconsistency: \"{db_name}\" exists as .json and .ddb")
with open(json_path, "rb") as f:
if start is not None and end is not None:
f.seek(start)
return f.read(end - start)
return f.read()
if not ddb_exists:
raise FileNotFoundError(f"DB does not exist: \"{db_name}\"")
with open(ddb_path, "rb") as f:
json_bytes = zlib.decompress(f.read())
if start is not None and end is not None:
return json_bytes[start:end]
return json_bytes


def write(db_name: str, dump: bytes):
"""
Write the bytes to the file of the db_path.
If the db was compressed but now config.use_compression is False,
remove the compressed file, and vice versa.
"""
json_path, json_exists, ddb_path, ddb_exists = utils.db_paths(db_name)
# Write bytes or string to file
remove_this = None
if config.use_compression:
write_path = ddb_path
if json_exists:
remove_this = json_path
dump = zlib.compress(dump, 1)
else:
write_path = json_path
if ddb_exists:
remove_this = ddb_path

# Write bytes or string to file
with open(write_path, "wb") as f:
f.write(dump)

# Remove the other file if it exists
# This is done after writing to avoid data loss
if remove_this is not None:
os.remove(remove_this)
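
A short round-trip sketch for the new io_bytes module (not part of the commit), assuming an uncompressed configuration and a local ./ddb_storage directory; the db name "users" and its content are made up for illustration:

import os
from dictdatabase import config, io_bytes

config.storage_directory = "./ddb_storage"  # assumed location
config.use_compression = False              # store a plain .json file
os.makedirs(config.storage_directory, exist_ok=True)

io_bytes.write("users", b'{"u1": {"name": "Ann"}}')

# Full read returns the raw bytes, decompressing transparently if the file is a .ddb
print(io_bytes.read("users"))

# Partial read: only the byte range [start, end) of the stored file
print(io_bytes.read("users", start=7, end=22))  # b'{"name": "Ann"}'
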
2 changes: 1 addition & 1 deletion dictdatabase/io_safe.py
@@ -21,7 +21,7 @@ def partial_read(db_name: str, key: str):
if not json_exists and not ddb_exists:
return None
with locking.ReadLock(db_name):
return io_unsafe.partial_read(db_name, key)
return io_unsafe.partial_read_only(db_name, key)


def write(db_name: str, data: dict):
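
For context, a hedged sketch of how the renamed entry point is reached through the safe wrapper (not part of the commit); it assumes the "users" db written in the previous sketch and default configuration:

from dictdatabase import config, io_safe

config.storage_directory = "./ddb_storage"  # assumed to contain users.json

# Acquires a ReadLock, then delegates to io_unsafe.partial_read_only
user = io_safe.partial_read("users", key="u1")
print(user)  # {"name": "Ann"}, or None if the db file does not exist
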
178 changes: 69 additions & 109 deletions dictdatabase/io_unsafe.py
@@ -2,11 +2,8 @@
from dataclasses import dataclass
import orjson
import json
import zlib
import os
import hashlib
from pathlib import Path
from . import config, utils, byte_codes
from . import config, utils, byte_codes, indexing, io_bytes



@@ -24,61 +24,74 @@ class PartialFileHandle:
partial_dict: PartialDict
indent_level: int
indent_with: str
index_data: dict
indexer: indexing.Indexer


################################################################################
#### Reading
################################################################################


def read_bytes(db_name: str) -> bytes:
"""
	Read the content of a db as bytes.
Reading works even when the config changes, so a compressed ddb file can
also be read if compression is disabled, and vice versa.
"""
json_path, json_exists, ddb_path, ddb_exists = utils.db_paths(db_name)

if json_exists:
if ddb_exists:
raise FileExistsError(f"DB Inconsistency: \"{db_name}\" exists as .json and .ddb")
with open(json_path, "rb") as f:
return f.read()
if not ddb_exists:
raise FileNotFoundError(f"DB does not exist: \"{db_name}\"")
with open(ddb_path, "rb") as f:
return zlib.decompress(f.read())


def read(db_name: str) -> dict:
"""
Read the file at db_path from the configured storage directory.
	Make sure the file exists. If it does not, a FileNotFoundError is
raised.
"""
# Always use orjson to read the file, because it is faster
return orjson.loads(read_bytes(db_name))
return orjson.loads(io_bytes.read(db_name))


def try_read_bytes_by_index(indexer: indexing.Indexer, db_name, key):
if (index := indexer.get(key)) is None:
return None
start_index, end_index, _, _, value_hash = index
partial_bytes = io_bytes.read(db_name, start_index, end_index)
if value_hash != hashlib.sha256(partial_bytes).hexdigest():
return None
return orjson.loads(partial_bytes)


def partial_read_only(db_name: str, key: str) -> dict:
"""
Partially read a key from a db.
The key MUST be unique in the entire db, otherwise the behavior is undefined.
This is a lot faster than reading the entire db, because it does not parse
	the entire file, but only the <value> part of the <key>: <value> pair.
If the key is not found, a `KeyError` is raised.
"""


# Search for key in the index file
indexer = indexing.Indexer(db_name)

if (value_data := try_read_bytes_by_index(indexer, db_name, key)) is not None:
return value_data

# Not found in index file, search for key in the entire file
file_bytes = io_bytes.read(db_name)
key_start, key_end = utils.find_outermost_key_in_json_bytes(file_bytes, key)

def read_index_file(db_name: str) -> dict:
path = f"{config.storage_directory}/.ddb/{db_name.replace('/', '___')}.index"
Path(path).parent.mkdir(parents=True, exist_ok=True)
if not os.path.exists(path):
return {}
with open(path, "rb") as f:
return orjson.loads(f.read())
if key_end == -1:
raise KeyError(f"Key \"{key}\" not found in db \"{db_name}\"")

# Key found, now determine the bounds of the value
space_after_semicolon = 1 if file_bytes[key_end] == byte_codes.SPACE else 0
value_start = key_end + space_after_semicolon
value_end = utils.seek_index_through_value_bytes(file_bytes, value_start)

indent_level, indent_with = utils.detect_indentation_in_json_bytes(file_bytes, key_start)
value_bytes = file_bytes[value_start:value_end]

def write_index_file(index_data: dict, db_name: str, key, start_index, end_index, indent_level, indent_with, value_hash):
path = f"{config.storage_directory}/.ddb/{db_name.replace('/', '___')}.index"
index_data[key] = [start_index, end_index, indent_level, indent_with, value_hash]
with open(path, "wb") as f:
f.write(orjson.dumps(index_data))
# Write key info to index file
indexer.write(key, value_start, value_end, indent_level, indent_with,
hashlib.sha256(value_bytes).hexdigest()
)
return orjson.loads(value_bytes)


def partial_read(db_name: str, key: str, as_handle=False) -> PartialFileHandle | dict:
def get_partial_file_handle(db_name: str, key: str) -> PartialFileHandle:
"""
Partially read a key from a db.
The key MUST be unique in the entire db, otherwise the behavior is undefined.
@@ -88,89 +98,45 @@ def partial_read(db_name: str, key: str, as_handle=False) -> PartialFileHandle |
If the key is not found, a `KeyError` is raised.
"""

data = read_bytes(db_name)
data = io_bytes.read(db_name)

# Search for key in the index file
index_data = read_index_file(db_name)
index = index_data.get(key, None)
indexer = indexing.Indexer(db_name)
index = indexer.get(key)
if index is not None:
start_index, end_index, indent_level, indent_with, value_hash = index
partial_bytes = data[start_index:end_index]
partial_bytes_hash = hashlib.sha256(partial_bytes).hexdigest()
if value_hash == partial_bytes_hash:
if value_hash == hashlib.sha256(partial_bytes).hexdigest():
partial_value = orjson.loads(partial_bytes)
if not as_handle:
return partial_value
partial_dict = PartialDict(data[:start_index], key, partial_value, data[end_index:])
return PartialFileHandle(db_name, partial_dict, indent_level, indent_with, index_data)
return PartialFileHandle(db_name, partial_dict, indent_level, indent_with, indexer)

# Not found in index file, search for key in the entire file
json_key = f"\"{key}\":".encode()
json_key_start_index = utils.find_outermost_key_index_in_json_bytes(data, json_key)
json_key_end_index = json_key_start_index + len(json_key)
key_start, key_end = utils.find_outermost_key_in_json_bytes(data, key)

if json_key_start_index == -1:
if key_end == -1:
raise KeyError(f"Key \"{key}\" not found in db \"{db_name}\"")

# Key found, now determine the bounds of the value
space_after_semicolon = 1 if data[json_key_end_index] == byte_codes.SPACE else 0
value_start_index = json_key_end_index + space_after_semicolon
value_end_index = utils.seek_index_through_value_bytes(data, value_start_index)
space_after_semicolon = 1 if data[key_end] == byte_codes.SPACE else 0
value_start = key_end + space_after_semicolon
value_end = utils.seek_index_through_value_bytes(data, value_start)

indent_level, indent_with = utils.detect_indentation_in_json_bytes(data, json_key_start_index)
partial_bytes = data[value_start_index:value_end_index]
indent_level, indent_with = utils.detect_indentation_in_json_bytes(data, key_start)
partial_bytes = data[value_start:value_end]

# Write key info to index file
write_index_file(
index_data,
db_name,
key,
value_start_index,
value_end_index,
indent_level,
indent_with,
hashlib.sha256(partial_bytes).hexdigest()
)

partial_value = orjson.loads(partial_bytes)
if not as_handle:
return partial_value

partial_dict = PartialDict(data[:value_start_index], key, partial_value, data[value_end_index:])
return PartialFileHandle(db_name, partial_dict, indent_level, indent_with, index_data)
partial_dict = PartialDict(data[:value_start], key, partial_value, data[value_end:])
return PartialFileHandle(db_name, partial_dict, indent_level, indent_with, indexer)


################################################################################
#### Writing
################################################################################


def write_bytes(db_name: str, dump: bytes):
"""
Write the bytes to the file of the db_path.
If the db was compressed but now config.use_compression is False,
remove the compressed file, and vice versa.
"""
json_path, json_exists, ddb_path, ddb_exists = utils.db_paths(db_name)
# Write bytes or string to file
if config.use_compression:
write_path = ddb_path
if json_exists:
os.remove(json_path)
else:
write_path = json_path
if ddb_exists:
os.remove(ddb_path)

# Compress if required
if config.use_compression:
dump = zlib.compress(dump, 1)

# Write bytes or string to file
with open(write_path, "wb") as f:
f.write(dump)


def write(db_name: str, data: dict):
"""
Write the dict db dumped as a json string
@@ -183,8 +149,7 @@ def write(db_name: str, data: dict):
else:
db_dump = json.dumps(data, indent=config.indent, sort_keys=config.sort_keys)
db_dump = db_dump.encode()

write_bytes(db_name, db_dump)
io_bytes.write(db_name, db_dump)


def partial_write(pf: PartialFileHandle):
@@ -199,20 +164,15 @@ def partial_write(pf: PartialFileHandle):
else:
partial_dump = json.dumps(pf.partial_dict.value, indent=config.indent, sort_keys=config.sort_keys)
partial_dump = partial_dump.encode()

# Add indentation
if pf.indent_level > 0 and pf.indent_with:
replace_this = "\n".encode()
replace_with = ("\n" + (pf.indent_level * pf.indent_with)).encode()
partial_dump = partial_dump.replace(replace_this, replace_with)

write_index_file(
pf.index_data,
pf.db_name,
pf.partial_dict.key,
len(pf.partial_dict.prefix),
len(pf.partial_dict.prefix) + len(partial_dump),
pf.indent_level,
pf.indent_with,
hashlib.sha256(partial_dump).hexdigest()
pf.indexer.write(pf.partial_dict.key, len(pf.partial_dict.prefix),
len(pf.partial_dict.prefix) + len(partial_dump), pf.indent_level,
pf.indent_with, hashlib.sha256(partial_dump).hexdigest()
)

write_bytes(pf.db_name, pf.partial_dict.prefix + partial_dump + pf.partial_dict.suffix)
io_bytes.write(pf.db_name, pf.partial_dict.prefix + partial_dump + pf.partial_dict.suffix)
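
A hedged sketch of the full partial read-modify-write cycle these refactored functions expose (not part of the commit). Locking is omitted because io_safe and the session layer normally provide it; the db and key are the illustrative ones from the earlier sketches:

from dictdatabase import config, io_unsafe

config.storage_directory = "./ddb_storage"  # assumed to contain users.json

# Fast path: served from the .index file when the stored hash matches, full scan otherwise
print(io_unsafe.partial_read_only("users", "u1"))  # {"name": "Ann"}

# Read-modify-write of a single key through a partial file handle
handle = io_unsafe.get_partial_file_handle("users", "u1")
handle.partial_dict.value["age"] = 30
io_unsafe.partial_write(handle)  # splices the new value back and refreshes the index entry
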
2 changes: 1 addition & 1 deletion dictdatabase/session.py
@@ -53,7 +53,7 @@ def __enter__(self) -> Tuple["DDBSession", JSONSerializable | T]:
elif self.op_type.file_key:
self.write_lock = locking.WriteLock(self.db_name)
self.write_lock._lock()
self.partial_handle = io_unsafe.partial_read(self.db_name, self.key, as_handle=True)
self.partial_handle = io_unsafe.get_partial_file_handle(self.db_name, self.key)
self.data_handle = self.partial_handle.partial_dict.value

elif self.op_type.file_where: