Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: always write with utf8 encoding #111

Merged
merged 1 commit into from
Dec 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docling_core/cli/view.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def view(
doc = DoclingDocument.load_from_json(filename=path)
target_path = Path(tempfile.mkdtemp()) / "out.html"
html_output = doc.export_to_html(image_mode=ImageRefMode.EMBEDDED)
- with open(target_path, "w") as f:
+ with open(target_path, "w", encoding="utf-8") as f:
f.write(html_output)
webbrowser.open(url=f"file://{target_path.absolute().resolve()}")

Expand Down
12 changes: 6 additions & 6 deletions docling_core/types/doc/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -1884,7 +1884,7 @@ def save_as_json(
)

out = new_doc.export_to_dict()
- with open(filename, "w") as fw:
+ with open(filename, "w", encoding="utf-8") as fw:
json.dump(out, fw, indent=indent)

@classmethod
Expand All @@ -1898,7 +1898,7 @@ def load_from_json(cls, filename: Path) -> "DoclingDocument":
:rtype: DoclingDocument

"""
- with open(filename, "r") as f:
+ with open(filename, "r", encoding="utf-8") as f:
return cls.model_validate_json(f.read())

def save_as_yaml(
Expand All @@ -1919,7 +1919,7 @@ def save_as_yaml(
)

out = new_doc.export_to_dict()
- with open(filename, "w") as fw:
+ with open(filename, "w", encoding="utf-8") as fw:
yaml.dump(out, fw, default_flow_style=default_flow_style)

def export_to_dict(
Expand Down Expand Up @@ -1971,7 +1971,7 @@ def save_as_markdown(
page_no=page_no,
)

- with open(filename, "w") as fw:
+ with open(filename, "w", encoding="utf-8") as fw:
fw.write(md_out)

def export_to_markdown( # noqa: C901
Expand Down Expand Up @@ -2224,7 +2224,7 @@ def save_as_html(
html_head=html_head,
)

- with open(filename, "w") as fw:
+ with open(filename, "w", encoding="utf-8") as fw:
fw.write(html_out)

def _get_output_paths(
Expand Down Expand Up @@ -2462,7 +2462,7 @@ def save_as_document_tokens(
with_groups=with_groups,
)

- with open(filename, "w") as fw:
+ with open(filename, "w", encoding="utf-8") as fw:
fw.write(out)

def export_to_document_tokens(
Expand Down
2 changes: 1 addition & 1 deletion docling_core/utils/validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def run():
"""Run the validation of a file containing a Document."""
file_format, input_file = parse_arguments()

- with open(input_file, "r") as fd:
+ with open(input_file, "r", encoding="utf-8") as fd:
file_ = json.load(fd)

result = (False, "Empty result")
Expand Down
4 changes: 2 additions & 2 deletions test/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ def test_identifier():
)

# schema_json(): no need to set by_alias since it is True by the default
- tf = open("test/data/json_schemas/base_identifier.json")
+ tf = open("test/data/json_schemas/base_identifier.json", encoding="utf-8")
gold_json = json.load(tf)

assert Identifier.model_json_schema() == gold_json
Expand Down Expand Up @@ -104,7 +104,7 @@ def test_log():
== gold_dict
)

- with open("test/data/json_schemas/base_log.json") as tf:
+ with open("test/data/json_schemas/base_log.json", encoding="utf-8") as tf:
gold_json_schema = json.load(tf)
assert Log.model_json_schema() == gold_json_schema

Expand Down
32 changes: 19 additions & 13 deletions test/test_collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def test_generic():
def test_document():
"""Test the Document model."""
for filename in glob.glob("test/data/legacy_doc/doc-*.json"):
- with open(filename) as file_obj:
+ with open(filename, encoding="utf-8") as file_obj:
file_json = file_obj.read()
Document.model_validate_json(file_json)

Expand All @@ -54,7 +54,7 @@ def test_table_export_to_tokens():
"""Test the Table Tokens export."""

for filename in glob.glob("test/data/legacy_doc/doc-*.json"):
- with open(filename) as file_obj:
+ with open(filename, encoding="utf-8") as file_obj:
file_json = file_obj.read()

doc = Document.model_validate_json(file_json)
Expand All @@ -73,10 +73,10 @@ def test_table_export_to_tokens():
fname = f"{filename}_table_{i}.doctags.txt"
if GENERATE:
print(f"writing {fname}")
- with open(fname, "w") as gold_obj:
+ with open(fname, "w", encoding="utf-8") as gold_obj:
gold_obj.write(out)

- with open(fname, "r") as gold_obj:
+ with open(fname, "r", encoding="utf-8") as gold_obj:
gold_data = gold_obj.read()

assert out == gold_data
Expand All @@ -96,10 +96,10 @@ def test_table_export_to_tokens():
fname = f"{filename}_table_{i}.doctags.txt"
if GENERATE:
print(f"writing {fname}")
- with open(fname, "w") as gold_obj:
+ with open(fname, "w", encoding="utf-8") as gold_obj:
gold_obj.write(out)

- with open(fname, "r") as gold_obj:
+ with open(fname, "r", encoding="utf-8") as gold_obj:
gold_data = gold_obj.read()

assert out == gold_data
Expand All @@ -110,35 +110,41 @@ def test_table_export_to_tokens():

def test_document_export_to_md():
"""Test the Document Markdown export."""
- with open("test/data/legacy_doc/doc-export.json") as src_obj:
+ with open("test/data/legacy_doc/doc-export.json", encoding="utf-8") as src_obj:
src_data = src_obj.read()
doc = Document.model_validate_json(src_data)

md = doc.export_to_markdown()

if GENERATE:
- with open("test/data/legacy_doc/doc-export.md", "w") as gold_obj:
+ with open(
+ "test/data/legacy_doc/doc-export.md", "w", encoding="utf-8"
+ ) as gold_obj:
gold_obj.write(md)

- with open("test/data/legacy_doc/doc-export.md") as gold_obj:
+ with open("test/data/legacy_doc/doc-export.md", encoding="utf-8") as gold_obj:
gold_data = gold_obj.read().strip()

assert md == gold_data


def test_document_export_to_tokens():
"""Test the Document Tokens export."""
- with open("test/data/legacy_doc/doc-export.json") as src_obj:
+ with open("test/data/legacy_doc/doc-export.json", encoding="utf-8") as src_obj:
src_data = src_obj.read()

doc = Document.model_validate_json(src_data)
xml = doc.export_to_document_tokens(delim=True)

if GENERATE:
- with open("test/data/legacy_doc/doc-export.doctags.txt", "w") as gold_obj:
+ with open(
+ "test/data/legacy_doc/doc-export.doctags.txt", "w", encoding="utf-8"
+ ) as gold_obj:
gold_obj.write(xml)

- with open("test/data/legacy_doc/doc-export.doctags.txt", "r") as gold_obj:
+ with open(
+ "test/data/legacy_doc/doc-export.doctags.txt", "r", encoding="utf-8"
+ ) as gold_obj:
gold_data = gold_obj.read().strip()

assert xml == gold_data
Expand All @@ -147,6 +153,6 @@ def test_document_export_to_tokens():
def test_record():
"""Test the Document model."""
for filename in glob.glob("test/data/rec/record-*.json"):
- with open(filename) as file_obj:
+ with open(filename, encoding="utf-8") as file_obj:
file_json = file_obj.read()
Record.model_validate_json(file_json)
2 changes: 1 addition & 1 deletion test/test_doc_legacy_convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
def test_new_to_old():
filename = "test/data/doc/2206.01062.yaml"

- with open(filename, "r") as fp:
+ with open(filename, "r", encoding="utf-8") as fp:
dict_from_yaml = yaml.safe_load(fp)

doc = DoclingDocument.model_validate(dict_from_yaml)
Expand Down
12 changes: 6 additions & 6 deletions test/test_doc_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
def test_ccs_document():
"""Validate data with CCSDocument schema."""
for filename in glob.glob("test/data/legacy_doc/doc-*.json"):
- with open(filename) as file_obj:
+ with open(filename, encoding="utf-8") as file_obj:
file_json = file_obj.read()
try:
# do not pass strict=True, since date input values are not an instance of datetime.
Expand All @@ -41,7 +41,7 @@ def test_ccs_document():

# check doc-error-1 is invalid in logs
try:
- with open("test/data/legacy_doc/error-1.json") as file_obj:
+ with open("test/data/legacy_doc/error-1.json", encoding="utf-8") as file_obj:
file_json = file_obj.read()
CCSDocument.model_validate_json(file_json)
assert False, f"Data in file {filename} should be invalid for CCSDocument model"
Expand All @@ -55,15 +55,15 @@ def test_ccs_document():
# check doc-error-2 is invalid for missing page-hashes
with (
pytest.raises(ValidationError, match="page-hashes"),
- open("test/data/legacy_doc/error-2.json") as file_obj,
+ open("test/data/legacy_doc/error-2.json", encoding="utf-8") as file_obj,
):
file_json = file_obj.read()
CCSDocument.model_validate_json(file_json)

# check doc-error-3 is invalid for wrong types in citation_count and reference_count
with (
pytest.raises(ValidationError, match="count"),
- open("test/data/legacy_doc/error-3.json") as file_obj,
+ open("test/data/legacy_doc/error-3.json", encoding="utf-8") as file_obj,
):
file_json = file_obj.read()
CCSDocument.model_validate_json(file_json)
Expand All @@ -72,7 +72,7 @@ def test_ccs_document():
def test_publication_journal():
""" "Validate data with Publication model."""
for filename in glob.glob("test/data/legacy_doc/intermediates/publication_*.json"):
- with open(filename) as file_obj:
+ with open(filename, encoding="utf-8") as file_obj:
file_json = file_obj.read()
file_dict = json.loads(file_json)
try:
Expand All @@ -85,7 +85,7 @@ def test_publication_journal():
def test_description_advanced_t():
"""Validate data with different DescriptionAdvancedT instances."""
# without description.advanced
- with open("test/data/legacy_doc/doc-5.json") as file_obj:
+ with open("test/data/legacy_doc/doc-5.json", encoding="utf-8") as file_obj:
desc = json.load(file_obj)["description"]

# without advanced
Expand Down
2 changes: 1 addition & 1 deletion test/test_doc_schema_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ def test_ccs_document_update():
"""Validate data with CCSDocument extract."""
filename = "test/data/legacy_doc/ext-1.json"
try:
- with open(filename) as f:
+ with open(filename, encoding="utf-8") as f:
raw_doc = json.load(f)
for item in raw_doc["main-text"]:
if "$ref" in item:
Expand Down
24 changes: 14 additions & 10 deletions test/test_docling_doc.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,11 +64,15 @@ def serialise(obj):
return yaml.safe_dump(obj.model_dump(mode="json", by_alias=True))

def write(name: str, serialisation: str):
- with open(f"./test/data/docling_document/unit/{name}.yaml", "w") as fw:
+ with open(
+ f"./test/data/docling_document/unit/{name}.yaml", "w", encoding="utf-8"
+ ) as fw:
fw.write(serialisation)

def read(name: str):
- with open(f"./test/data/docling_document/unit/{name}.yaml", "r") as fr:
+ with open(
+ f"./test/data/docling_document/unit/{name}.yaml", "r", encoding="utf-8"
+ ) as fr:
gold = fr.read()
return gold

Expand Down Expand Up @@ -146,7 +150,7 @@ def test_reference_doc():
filename = "test/data/doc/dummy_doc.yaml"

# Read YAML file of manual reference doc
- with open(filename, "r") as fp:
+ with open(filename, "r", encoding="utf-8") as fp:
dict_from_yaml = yaml.safe_load(fp)

doc = DoclingDocument.model_validate(dict_from_yaml)
Expand Down Expand Up @@ -186,7 +190,7 @@ def test_parse_doc():

filename = "test/data/doc/2206.01062.yaml"

- with open(filename, "r") as fp:
+ with open(filename, "r", encoding="utf-8") as fp:
dict_from_yaml = yaml.safe_load(fp)

doc = DoclingDocument.model_validate(dict_from_yaml)
Expand Down Expand Up @@ -244,12 +248,12 @@ def _test_serialize_and_reload(doc):
def _verify_regression_test(pred: str, filename: str, ext: str):

if os.path.exists(filename + f".{ext}") and not GENERATE:
- with open(filename + f".{ext}", "r") as fr:
+ with open(filename + f".{ext}", "r", encoding="utf-8") as fr:
gt_true = fr.read()

assert gt_true == pred, f"Does not pass regression-test for {filename}.{ext}"
else:
- with open(filename + f".{ext}", "w") as fw:
+ with open(filename + f".{ext}", "w", encoding="utf-8") as fw:
fw.write(pred)


Expand Down Expand Up @@ -499,7 +503,7 @@ def test_version_doc():
doc = DoclingDocument(name="Untitled 1")
assert doc.version == CURRENT_VERSION

- with open("test/data/doc/dummy_doc.yaml") as fp:
+ with open("test/data/doc/dummy_doc.yaml", encoding="utf-8") as fp:
dict_from_yaml = yaml.safe_load(fp)
doc = DoclingDocument.model_validate(dict_from_yaml)
assert doc.version == CURRENT_VERSION
Expand Down Expand Up @@ -674,17 +678,17 @@ def _normalise_string_wrt_filepaths(instr: str, paths: List[Path]):
def _verify_saved_output(filename: str, paths: List[Path]):

pred = ""
- with open(filename, "r") as fr:
+ with open(filename, "r", encoding="utf-8") as fr:
pred = fr.read()

pred = _normalise_string_wrt_filepaths(pred, paths=paths)

if GENERATE:
- with open(str(filename) + ".gt", "w") as fw:
+ with open(str(filename) + ".gt", "w", encoding="utf-8") as fw:
fw.write(pred)
else:
gt = ""
- with open(str(filename) + ".gt", "r") as fr:
+ with open(str(filename) + ".gt", "r", encoding="utf-8") as fr:
gt = fr.read()

assert pred == gt, f"pred!=gt for {filename}"
Expand Down
8 changes: 4 additions & 4 deletions test/test_hierarchical_chunker.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@


def test_chunk_merge_list_items():
- with open("test/data/chunker/0_inp_dl_doc.json") as f:
+ with open("test/data/chunker/0_inp_dl_doc.json", encoding="utf-8") as f:
data_json = f.read()
dl_doc = DLDocument.model_validate_json(data_json)
chunker = HierarchicalChunker(
Expand All @@ -21,13 +21,13 @@ def test_chunk_merge_list_items():
act_data = dict(
root=[DocChunk.model_validate(n).export_json_dict() for n in chunks]
)
- with open("test/data/chunker/0_out_chunks.json") as f:
+ with open("test/data/chunker/0_out_chunks.json", encoding="utf-8") as f:
exp_data = json.load(fp=f)
assert exp_data == act_data


def test_chunk_no_merge_list_items():
- with open("test/data/chunker/0_inp_dl_doc.json") as f:
+ with open("test/data/chunker/0_inp_dl_doc.json", encoding="utf-8") as f:
data_json = f.read()
dl_doc = DLDocument.model_validate_json(data_json)
chunker = HierarchicalChunker(
Expand All @@ -37,6 +37,6 @@ def test_chunk_no_merge_list_items():
act_data = dict(
root=[DocChunk.model_validate(n).export_json_dict() for n in chunks]
)
- with open("test/data/chunker/1_out_chunks.json") as f:
+ with open("test/data/chunker/1_out_chunks.json", encoding="utf-8") as f:
exp_data = json.load(fp=f)
assert exp_data == act_data
Loading
Loading