From 467690925851776b5ee2925163f804b873419ca1 Mon Sep 17 00:00:00 2001
From: David Linke <dr.david.linke@gmail.com>
Date: Wed, 29 Jan 2025 23:51:44 +0100
Subject: [PATCH] Read & write multi-IRIs cells with qualifier

---
 src/voc4cat/convert.py     | 40 ++++++++++++----------
 src/voc4cat/convert_043.py | 27 +++++++++++----
 src/voc4cat/models.py      | 68 +++++++++++++-------------------------
 3 files changed, 66 insertions(+), 69 deletions(-)

diff --git a/src/voc4cat/convert.py b/src/voc4cat/convert.py
index d82266e..d55c9f7 100644
--- a/src/voc4cat/convert.py
+++ b/src/voc4cat/convert.py
@@ -1,3 +1,4 @@
+from collections import defaultdict
 import logging
 from itertools import chain
 from pathlib import Path
@@ -254,7 +255,7 @@ def rdf_to_excel(
     for s, o in g.subject_objects(SKOS.broader):
         g.add((o, SKOS.narrower, s))
 
-    row_no_concepts, row_no_features = 3, 3
+    concepts_by_iri = defaultdict(dict)
     for s in g.subjects(RDF.type, SKOS.Concept):
         holder = {
             "uri": str(s),
@@ -303,10 +304,11 @@ def rdf_to_excel(
             elif p == SKOS.broadMatch:
                 holder["broad_match"].append(str(o))
 
-        row_no_concepts = models.Concept(
+        lang_code = holder["pl_language_code"]
+        concepts_by_iri[holder["uri"]][lang_code[0]] = models.Concept(
             uri=holder["uri"],
             pref_label=holder["pref_label"],
-            pl_language_code=holder["pl_language_code"],
+            pl_language_code=lang_code,
             definition=holder["definition"],
             def_language_code=holder["def_language_code"],
             children=holder["children"],
@@ -319,19 +321,23 @@ def rdf_to_excel(
             narrow_match=holder["narrow_match"],
             broad_match=holder["broad_match"],
             vocab_name=vocab_name,
-        ).to_excel(wb, row_no_concepts, row_no_features)
-
-        # only go to next row in "Additional Concepts Features" if there are any mappings
-        if any(
-            [
-                holder["related_match"],
-                holder["close_match"],
-                holder["exact_match"],
-                holder["narrow_match"],
-                holder["broad_match"],
-            ]
-        ):
-            row_no_features += 1
+        )
+
+    row_no_concepts, row_no_features = 3, 3
+    for con in list(concepts_by_iri.values()):
+        for concept in con.values():
+            row_no_concepts = concept.to_excel(wb, row_no_concepts, row_no_features, concepts_by_iri)
+            # Advance the "Additional Concepts Features" row only if *this* concept has mappings.
+            if any(
+                [
+                    concept.related_match,
+                    concept.close_match,
+                    concept.exact_match,
+                    concept.narrow_match,
+                    concept.broad_match,
+                ]
+            ):
+                row_no_features += 1
 
     row_no = 3
 
@@ -364,7 +370,7 @@ def rdf_to_excel(
                 holder["provenance"] if holder.get("provenance") is not None else None
             ),
             vocab_name=vocab_name,
-        ).to_excel(wb, row_no)
+        ).to_excel(wb, row_no, concepts_by_iri)
         row_no += 1
 
     # Write the prefix_map used in the conversion to the prefix sheet.
diff --git a/src/voc4cat/convert_043.py b/src/voc4cat/convert_043.py
index ecad3aa..09e2788 100644
--- a/src/voc4cat/convert_043.py
+++ b/src/voc4cat/convert_043.py
@@ -54,6 +54,19 @@ def write_prefix_sheet(wb: Workbook, prefix_map):
         ws.append([prefix, iri])
 
 
+def split_multi_iri(cell_value: str|None, prefix_converter: Converter) -> list[str]:
+    """
+    Split a comma-separated cell into a list of expanded IRIs.
+
+    Only the first whitespace-delimited token of each part is kept, so a
+    trailing "(label)" qualifier is dropped; empty parts and None yield nothing.
+    """
+    # str.split() on an empty part returns [], so filter before taking token 0.
+    tokens = [part.split() for part in (cell_value or "").split(",")]
+    iris = [token[0] for token in tokens if token]
+    return [prefix_converter.expand(iri) or iri for iri in iris]
+
+
 def extract_concepts_and_collections(
     q: Worksheet,
     r: Worksheet,
@@ -80,7 +93,7 @@ def extract_concepts_and_collections(
                 "definition": q[f"D{row}"].value,
                 "def_language_code": split_and_tidy(q[f"E{row}"].value),
                 "alt_labels": split_and_tidy(q[f"F{row}"].value),
-                "children": q[f"G{row}"].value,
+                "children": split_multi_iri(q[f"G{row}"].value, prefix_converter),
                 "provenance": q[f"H{row}"].value,
                 # Note in the new template, source_vocab is synonymous with source vocab uri
                 "source_vocab": q[f"I{row}"].value,
@@ -103,11 +116,11 @@ def extract_concepts_and_collections(
                 raise ConversionError(msg)
             data = {
                 # additional concept features sheets
-                "related_match": r[f"B{row}"].value,
-                "close_match": r[f"C{row}"].value,
-                "exact_match": r[f"D{row}"].value,
-                "narrow_match": r[f"E{row}"].value,
-                "broad_match": r[f"F{row}"].value,
+                "related_match": split_multi_iri(r[f"B{row}"].value, prefix_converter),
+                "close_match": split_multi_iri(r[f"C{row}"].value, prefix_converter),
+                "exact_match": split_multi_iri(r[f"D{row}"].value, prefix_converter),
+                "narrow_match": split_multi_iri(r[f"E{row}"].value, prefix_converter),
+                "broad_match": split_multi_iri(r[f"F{row}"].value, prefix_converter),
                 "vocab_name": vocab_name,
             }
             concept_data[uri].update(**data)
@@ -137,7 +150,7 @@ def extract_concepts_and_collections(
                 "uri": s[f"A{row}"].value.split()[0].strip(),
                 "pref_label": s[f"B{row}"].value,
                 "definition": s[f"C{row}"].value,
-                "members": s[f"D{row}"].value,
+                "members": split_multi_iri(s[f"D{row}"].value, prefix_converter),
                 "provenance": s[f"E{row}"].value,
                 "vocab_name": vocab_name,
             }
diff --git a/src/voc4cat/models.py b/src/voc4cat/models.py
index d32edbf..e6a436a 100644
--- a/src/voc4cat/models.py
+++ b/src/voc4cat/models.py
@@ -61,6 +61,20 @@ def reset_curies(curies_map: dict) -> None:
     config.namespace_manager = namespace_manager
 
 
+def make_iri_qualifier_listing(item, concepts_by_iri):
+    """Return listing of item with one "uri (pref.label)" per row.
+
+    IRIs that have no "en" entry in concepts_by_iri are listed without qualifier.
+    """
+    lines = []
+    for uri in item:
+        uri_str = config.curies_converter.compress(uri, passthrough=True)
+        # .get() avoids inserting unknown IRIs into a defaultdict (or KeyError on a dict)
+        concept = concepts_by_iri.get(uri, {}).get("en")
+        lines.append(f"{uri_str}" if concept is None else f"{uri_str} ({concept.pref_label[0]})")
+    return ",\n".join(lines)
+
+
 # === Pydantic validators used by more than one model ===
 
 
@@ -375,7 +389,7 @@ def to_graph(self):
 
         return g
 
-    def to_excel(self, wb: Workbook, row_no_concepts: int, row_no_features: int) -> int:
+    def to_excel(self, wb: Workbook, row_no_concepts: int, row_no_features: int, concepts_by_iri:dict) -> int:
         """ "
         Export Concept to Excel using one row per language
 
@@ -426,13 +440,7 @@ def to_excel(self, wb: Workbook, row_no_concepts: int, row_no_features: int) ->
 
             first_row_exported = True
             ws[f"F{row_no_concepts}"] = ",\n".join(self.alt_labels)
-            ws[f"G{row_no_concepts}"] = ",\n".join(
-                [
-                    config.curies_converter.compress(uri, passthrough=True)
-                    for uri in self.children
-                    # TODO add pref_label of children but where to look up? Here we know just the current concept.
-                ]
-            )
+            ws[f"G{row_no_concepts}"] = make_iri_qualifier_listing(self.children, concepts_by_iri)
             ws[f"I{row_no_concepts}"] = (
                 config.curies_converter.compress(self.source_vocab, passthrough=True)
                 if self.source_vocab
@@ -456,36 +464,11 @@ def to_excel(self, wb: Workbook, row_no_concepts: int, row_no_features: int) ->
             ) + f" ({pref_labels.get('en', '')})"
             ws[f"A{row_no_features}"].hyperlink = self.uri
             ws[f"A{row_no_features}"].style = "Hyperlink"
-            ws[f"B{row_no_features}"] = ",\n".join(
-                [
-                    config.curies_converter.compress(uri, passthrough=True)
-                    for uri in self.related_match
-                ]
-            )
-            ws[f"C{row_no_features}"] = ",\n".join(
-                [
-                    config.curies_converter.compress(uri, passthrough=True)
-                    for uri in self.close_match
-                ]
-            )
-            ws[f"D{row_no_features}"] = ",\n".join(
-                [
-                    config.curies_converter.compress(uri, passthrough=True)
-                    for uri in self.exact_match
-                ]
-            )
-            ws[f"E{row_no_features}"] = ",\n".join(
-                [
-                    config.curies_converter.compress(uri, passthrough=True)
-                    for uri in self.narrow_match
-                ]
-            )
-            ws[f"F{row_no_features}"] = ",\n".join(
-                [
-                    config.curies_converter.compress(uri, passthrough=True)
-                    for uri in self.broad_match
-                ]
-            )
+            ws[f"B{row_no_features}"] = make_iri_qualifier_listing(self.related_match, concepts_by_iri)
+            ws[f"C{row_no_features}"] = make_iri_qualifier_listing(self.close_match, concepts_by_iri)
+            ws[f"D{row_no_features}"] = make_iri_qualifier_listing(self.exact_match, concepts_by_iri)
+            ws[f"E{row_no_features}"] = make_iri_qualifier_listing(self.narrow_match, concepts_by_iri)
+            ws[f"F{row_no_features}"] = make_iri_qualifier_listing(self.broad_match, concepts_by_iri)
 
         return row_no_concepts
 
@@ -529,7 +512,7 @@ def to_graph(self, cs):
 
         return g
 
-    def to_excel(self, wb: Workbook, row_no: int):
+    def to_excel(self, wb: Workbook, row_no: int, concepts_by_iri:dict) -> None:
         ws = wb["Collections"]
         ws[f"A{row_no}"].value = config.curies_converter.compress(
                 self.uri, passthrough=True
@@ -538,12 +521,7 @@ def to_excel(self, wb: Workbook, row_no: int):
         ws[f"A{row_no}"].style = "Hyperlink"
         ws[f"B{row_no}"] = self.pref_label
         ws[f"C{row_no}"] = self.definition
-        ws[f"D{row_no}"] = ",\n".join(
-            [
-                config.curies_converter.compress(uri, passthrough=True)
-                for uri in self.members
-            ]
-        )
+        ws[f"D{row_no}"] = make_iri_qualifier_listing(self.members, concepts_by_iri)
         ws[f"E{row_no}"] = self.provenance