From 467690925851776b5ee2925163f804b873419ca1 Mon Sep 17 00:00:00 2001
From: David Linke
Date: Wed, 29 Jan 2025 23:51:44 +0100
Subject: [PATCH] Read & write multi-IRIs cells with qualifier

---
Review notes (text in this section is ignored by `git am`):
- rdf_to_excel: the "any mappings" check now inspects the concept that is
  being written instead of the stale `holder` left over from the parse loop.
- split_multi_iri: empty entries (e.g. after a trailing comma) are skipped
  instead of raising IndexError.
- make_iri_qualifier_listing: look-ups use .get() so that unknown IRIs
  neither raise KeyError nor add empty entries to the defaultdict.
- Line counts of all hunks are unchanged; the post-image blob ids in the
  "index" lines no longer match the modified content.

 src/voc4cat/convert.py     | 40 ++++++++++++----------
 src/voc4cat/convert_043.py | 27 +++++++++++----
 src/voc4cat/models.py      | 68 +++++++++++++-------------------------
 3 files changed, 66 insertions(+), 69 deletions(-)

diff --git a/src/voc4cat/convert.py b/src/voc4cat/convert.py
index d82266e..d55c9f7 100644
--- a/src/voc4cat/convert.py
+++ b/src/voc4cat/convert.py
@@ -1,3 +1,4 @@
 import logging
+from collections import defaultdict
 from itertools import chain
 from pathlib import Path
@@ -254,7 +255,7 @@ def rdf_to_excel(
     for s, o in g.subject_objects(SKOS.broader):
         g.add((o, SKOS.narrower, s))

-    row_no_concepts, row_no_features = 3, 3
+    concepts_by_iri = defaultdict(dict)
     for s in g.subjects(RDF.type, SKOS.Concept):
         holder = {
             "uri": str(s),
@@ -303,10 +304,11 @@ def rdf_to_excel(
             elif p == SKOS.broadMatch:
                 holder["broad_match"].append(str(o))

-        row_no_concepts = models.Concept(
+        lang_code = holder["pl_language_code"]
+        concepts_by_iri[holder["uri"]][lang_code[0]] = models.Concept(
             uri=holder["uri"],
             pref_label=holder["pref_label"],
-            pl_language_code=holder["pl_language_code"],
+            pl_language_code=lang_code,
             definition=holder["definition"],
             def_language_code=holder["def_language_code"],
             children=holder["children"],
@@ -319,19 +321,23 @@ def rdf_to_excel(
             narrow_match=holder["narrow_match"],
             broad_match=holder["broad_match"],
             vocab_name=vocab_name,
-        ).to_excel(wb, row_no_concepts, row_no_features)
-
-        # only go to next row in "Additional Concepts Features" if there are any mappings
-        if any(
-            [
-                holder["related_match"],
-                holder["close_match"],
-                holder["exact_match"],
-                holder["narrow_match"],
-                holder["broad_match"],
-            ]
-        ):
-            row_no_features += 1
+        )
+
+    row_no_concepts, row_no_features = 3, 3
+    for con in list(concepts_by_iri.values()):
+        for concept in con.values():
+            row_no_concepts = concept.to_excel(wb, row_no_concepts, row_no_features, concepts_by_iri)
+            # only go to next row in "Additional Concepts Features" if there are any mappings
+            if any(
+                [
+                    concept.related_match,
+                    concept.close_match,
+                    concept.exact_match,
+                    concept.narrow_match,
+                    concept.broad_match,
+                ]
+            ):
+                row_no_features += 1

     row_no = 3

@@ -364,7 +370,7 @@ def rdf_to_excel(
                 holder["provenance"] if holder.get("provenance") is not None else None
             ),
             vocab_name=vocab_name,
-        ).to_excel(wb, row_no)
+        ).to_excel(wb, row_no, concepts_by_iri)
         row_no += 1

     # Write the prefix_map used in the conversion to the prefix sheet.
diff --git a/src/voc4cat/convert_043.py b/src/voc4cat/convert_043.py
index ecad3aa..09e2788 100644
--- a/src/voc4cat/convert_043.py
+++ b/src/voc4cat/convert_043.py
@@ -54,6 +54,19 @@ def write_prefix_sheet(wb: Workbook, prefix_map):
         ws.append([prefix, iri])


+def split_multi_iri(cell_value: str | None, prefix_converter: Converter) -> list[str]:
+    """
+    Split a cell of comma-separated IRIs (each optionally followed by a
+    qualifier such as "(label)") into a list of expanded IRIs.
+    """
+    iris_normalised = []
+    for line in (cell_value or "").split(","):
+        parts = line.split()
+        if parts:  # skip empty entries, e.g. after a trailing comma
+            iris_normalised.append(prefix_converter.expand(parts[0]) or parts[0])
+    return iris_normalised
+
+
 def extract_concepts_and_collections(
     q: Worksheet,
     r: Worksheet,
@@ -80,7 +93,7 @@ def extract_concepts_and_collections(
                 "definition": q[f"D{row}"].value,
                 "def_language_code": split_and_tidy(q[f"E{row}"].value),
                 "alt_labels": split_and_tidy(q[f"F{row}"].value),
-                "children": q[f"G{row}"].value,
+                "children": split_multi_iri(q[f"G{row}"].value, prefix_converter),
                 "provenance": q[f"H{row}"].value,
                 # Note in the new template, source_vocab is synonymous with source vocab uri
                 "source_vocab": q[f"I{row}"].value,
@@ -103,11 +116,11 @@ def extract_concepts_and_collections(
                     raise ConversionError(msg)
                 data = {
                     # additional concept features sheets
-                    "related_match": r[f"B{row}"].value,
-                    "close_match": r[f"C{row}"].value,
-                    "exact_match": r[f"D{row}"].value,
-                    "narrow_match": r[f"E{row}"].value,
-                    "broad_match": r[f"F{row}"].value,
+                    "related_match": split_multi_iri(r[f"B{row}"].value, prefix_converter),
+                    "close_match": split_multi_iri(r[f"C{row}"].value, prefix_converter),
+                    "exact_match": split_multi_iri(r[f"D{row}"].value, prefix_converter),
+                    "narrow_match": split_multi_iri(r[f"E{row}"].value, prefix_converter),
+                    "broad_match": split_multi_iri(r[f"F{row}"].value, prefix_converter),
                     "vocab_name": vocab_name,
                 }
                 concept_data[uri].update(**data)
@@ -137,7 +150,7 @@ def extract_concepts_and_collections(
                     "uri": s[f"A{row}"].value.split()[0].strip(),
                     "pref_label": s[f"B{row}"].value,
                     "definition": s[f"C{row}"].value,
-                    "members": s[f"D{row}"].value,
+                    "members": split_multi_iri(s[f"D{row}"].value, prefix_converter),
                     "provenance": s[f"E{row}"].value,
                     "vocab_name": vocab_name,
                 }
diff --git a/src/voc4cat/models.py b/src/voc4cat/models.py
index d32edbf..e6a436a 100644
--- a/src/voc4cat/models.py
+++ b/src/voc4cat/models.py
@@ -61,6 +61,20 @@ def reset_curies(curies_map: dict) -> None:
     config.namespace_manager = namespace_manager


+def make_iri_qualifier_listing(item, concepts_by_iri):
+    """Return listing of item with one "uri (pref.label)" per row."""
+    child_lines = []
+    for uri in item:
+        uri_str = config.curies_converter.compress(uri, passthrough=True)
+        # Use .get(): indexing a defaultdict would insert an empty entry for
+        # every unknown IRI; also, not all concepts have an English label.
+        concept_en = concepts_by_iri.get(uri, {}).get("en")
+        if concept_en is not None:
+            uri_str += f" ({concept_en.pref_label[0]})"
+        child_lines.append(uri_str)
+    return ",\n".join(child_lines)
+
+
 # === Pydantic validators used by more than one model ===


@@ -375,7 +389,7 @@ def to_graph(self):

         return g

-    def to_excel(self, wb: Workbook, row_no_concepts: int, row_no_features: int) -> int:
+    def to_excel(self, wb: Workbook, row_no_concepts: int, row_no_features: int, concepts_by_iri: dict) -> int:
         """ "
         Export Concept to Excel using one row per language

@@ -426,13 +440,7 @@ def to_excel(self, wb: Workbook, row_no_concepts: int, row_no_features: int) ->
             first_row_exported = True

             ws[f"F{row_no_concepts}"] = ",\n".join(self.alt_labels)
-            ws[f"G{row_no_concepts}"] = ",\n".join(
-                [
-                    config.curies_converter.compress(uri, passthrough=True)
-                    for uri in self.children
-                    # TODO add pref_label of children but where to look up? Here we know just the current concept.
-                ]
-            )
+            ws[f"G{row_no_concepts}"] = make_iri_qualifier_listing(self.children, concepts_by_iri)
             ws[f"I{row_no_concepts}"] = (
                 config.curies_converter.compress(self.source_vocab, passthrough=True)
                 if self.source_vocab
@@ -456,36 +464,11 @@ def to_excel(self, wb: Workbook, row_no_concepts: int, row_no_features: int) ->
             ) + f" ({pref_labels.get('en', '')})"
             ws[f"A{row_no_features}"].hyperlink = self.uri
             ws[f"A{row_no_features}"].style = "Hyperlink"
-            ws[f"B{row_no_features}"] = ",\n".join(
-                [
-                    config.curies_converter.compress(uri, passthrough=True)
-                    for uri in self.related_match
-                ]
-            )
-            ws[f"C{row_no_features}"] = ",\n".join(
-                [
-                    config.curies_converter.compress(uri, passthrough=True)
-                    for uri in self.close_match
-                ]
-            )
-            ws[f"D{row_no_features}"] = ",\n".join(
-                [
-                    config.curies_converter.compress(uri, passthrough=True)
-                    for uri in self.exact_match
-                ]
-            )
-            ws[f"E{row_no_features}"] = ",\n".join(
-                [
-                    config.curies_converter.compress(uri, passthrough=True)
-                    for uri in self.narrow_match
-                ]
-            )
-            ws[f"F{row_no_features}"] = ",\n".join(
-                [
-                    config.curies_converter.compress(uri, passthrough=True)
-                    for uri in self.broad_match
-                ]
-            )
+            ws[f"B{row_no_features}"] = make_iri_qualifier_listing(self.related_match, concepts_by_iri)
+            ws[f"C{row_no_features}"] = make_iri_qualifier_listing(self.close_match, concepts_by_iri)
+            ws[f"D{row_no_features}"] = make_iri_qualifier_listing(self.exact_match, concepts_by_iri)
+            ws[f"E{row_no_features}"] = make_iri_qualifier_listing(self.narrow_match, concepts_by_iri)
+            ws[f"F{row_no_features}"] = make_iri_qualifier_listing(self.broad_match, concepts_by_iri)

         return row_no_concepts

@@ -529,7 +512,7 @@ def to_graph(self, cs):

         return g

-    def to_excel(self, wb: Workbook, row_no: int):
+    def to_excel(self, wb: Workbook, row_no: int, concepts_by_iri: dict) -> None:
         ws = wb["Collections"]
         ws[f"A{row_no}"].value = config.curies_converter.compress(
             self.uri, passthrough=True
@@ -538,12 +521,7 @@ def to_excel(self, wb: Workbook, row_no: int):
         ws[f"A{row_no}"].style = "Hyperlink"
         ws[f"B{row_no}"] = self.pref_label
         ws[f"C{row_no}"] = self.definition
-        ws[f"D{row_no}"] = ",\n".join(
-            [
-                config.curies_converter.compress(uri, passthrough=True)
-                for uri in self.members
-            ]
-        )
+        ws[f"D{row_no}"] = make_iri_qualifier_listing(self.members, concepts_by_iri)
         ws[f"E{row_no}"] = self.provenance

