diff --git a/pyproject.toml b/pyproject.toml index 503fa81..5605891 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "digiflow" -version = "5.5.8" +version = "5.5.9" description = "Father's Little Digitization Workflow Helper" readme = "README.md" requires-python = ">=3.8" diff --git a/src/digiflow/record/record_handler.py b/src/digiflow/record/record_handler.py index 7f50081..6030c67 100644 --- a/src/digiflow/record/record_handler.py +++ b/src/digiflow/record/record_handler.py @@ -11,6 +11,7 @@ RECORD_STATE_MASK_FRAME = 'other_load' SETSPEC_SPLITTER = '##' +STRING_QUOTES = "\"'" class RecordHandlerException(Exception): @@ -375,13 +376,23 @@ def _merge(self_record, other_record): self_record[df_r.FIELD_STATE] = other_record[df_r.FIELD_STATE] self_record[df_r.FIELD_STATETIME] = other_record[df_r.FIELD_STATETIME] try: - self_info = ast.literal_eval(self_record[df_r.FIELD_INFO]) - other_info = ast.literal_eval(other_record[df_r.FIELD_INFO]) + self_info = ast.literal_eval(_clear_trailing_quotes(self_record[df_r.FIELD_INFO])) + other_info = ast.literal_eval(_clear_trailing_quotes(other_record[df_r.FIELD_INFO])) self_info.update(other_info) self_record[df_r.FIELD_INFO] = str(self_info) except (SyntaxError, ValueError): self_record[df_r.FIELD_INFO] = other_record[df_r.FIELD_INFO] +def _clear_trailing_quotes(raw_string:str): + """Remove evil trailing chars like double/single + quotation marks""" + + if raw_string[0] in STRING_QUOTES: + raw_string = raw_string[1:] + if raw_string[-1] in STRING_QUOTES: + raw_string = raw_string[:-1] + return raw_string + def _is_unset(self_record): if self_record[df_r.FIELD_STATE] == df_r.UNSET_LABEL: diff --git a/tests/test_digiflow_record_handler.py b/tests/test_digiflow_record_handler.py index 9cee852..8c57a5a 100644 --- a/tests/test_digiflow_record_handler.py +++ b/tests/test_digiflow_record_handler.py @@ -806,7 +806,7 @@ def 
def test_record_handler_merge_write_read(tmp_path):
    """Merge a list whose INFO column is wrapped in quotation
    marks into an existing list, re-read the written result and
    expect the INFO dictionaries of both lists to be combined.
    """

    # arrange
    path_oai_list_a = tmp_path / 'oai_list_a'
    rows_existing = [
        "123\tn.a.\t2015-08-25T20:00:35Z\t{'pages':23, 'ods_created':'1984-10-03'}\tu.a.\tn.a.\n"
    ]
    write_datalist(path_oai_list_a, rows_existing, LEGACY_HEADER_STR)
    handler_args = {
        'data_fields': df_r.LEGACY_HEADER,
        'transform_func': df_r.row_to_record,
    }
    dst_hndlr = df_r.RecordHandler(path_oai_list_a, **handler_args)

    path_oai_list_b = tmp_path / 'oai_list_b'
    rows_incoming = [
        "123\tn.a.\t2015-08-25T20:00:35Z\t\"{'xml_invalid': \"Element 'mods:subtitle': This element is not expected.\"}\"\tocr_done\t2024-10-18_11:12:00\n",
    ]
    write_datalist(path_oai_list_b, rows_incoming, LEGACY_HEADER_STR)

    # act
    dst_hndlr.merges(path_oai_list_b, dry_run=False)
    # use a fresh handler so the assertion runs against what was written
    reread_hndlr = df_r.RecordHandler(path_oai_list_a, **handler_args)

    # assert
    expected_info = {
        'pages': 23,
        'ods_created': '1984-10-03',
        'xml_invalid': "Element 'mods:subtitle': This element is not expected.",
    }
    merged_record: df_r.Record = reread_hndlr.next_record(state='ocr_done')
    assert merged_record.info == expected_info