Skip to content

Commit

Permalink
Read & write multi-IRIs cells with qualifier
Browse files Browse the repository at this point in the history
  • Loading branch information
dalito committed Jan 29, 2025
1 parent 3a349e9 commit 4676909
Show file tree
Hide file tree
Showing 3 changed files with 66 additions and 69 deletions.
40 changes: 23 additions & 17 deletions src/voc4cat/convert.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from collections import defaultdict
import logging
from itertools import chain
from pathlib import Path
Expand Down Expand Up @@ -254,7 +255,7 @@ def rdf_to_excel(
for s, o in g.subject_objects(SKOS.broader):
g.add((o, SKOS.narrower, s))

row_no_concepts, row_no_features = 3, 3
concepts_by_iri = defaultdict(dict)
for s in g.subjects(RDF.type, SKOS.Concept):
holder = {
"uri": str(s),
Expand Down Expand Up @@ -303,10 +304,11 @@ def rdf_to_excel(
elif p == SKOS.broadMatch:
holder["broad_match"].append(str(o))

row_no_concepts = models.Concept(
lang_code = holder["pl_language_code"]
concepts_by_iri[holder["uri"]][lang_code[0]] = models.Concept(
uri=holder["uri"],
pref_label=holder["pref_label"],
pl_language_code=holder["pl_language_code"],
pl_language_code=lang_code,
definition=holder["definition"],
def_language_code=holder["def_language_code"],
children=holder["children"],
Expand All @@ -319,19 +321,23 @@ def rdf_to_excel(
narrow_match=holder["narrow_match"],
broad_match=holder["broad_match"],
vocab_name=vocab_name,
).to_excel(wb, row_no_concepts, row_no_features)

# only go to next row in "Additional Concepts Features" if there are any mappings
if any(
[
holder["related_match"],
holder["close_match"],
holder["exact_match"],
holder["narrow_match"],
holder["broad_match"],
]
):
row_no_features += 1
)

row_no_concepts, row_no_features = 3, 3
for con in list(concepts_by_iri.values()):
for lang in con.keys():
row_no_concepts = con[lang].to_excel(wb, row_no_concepts, row_no_features, concepts_by_iri)
# only go to next row in "Additional Concepts Features" if there are any mappings
if any(
[
holder["related_match"],
holder["close_match"],
holder["exact_match"],
holder["narrow_match"],
holder["broad_match"],
]
):
row_no_features += 1

row_no = 3

Expand Down Expand Up @@ -364,7 +370,7 @@ def rdf_to_excel(
holder["provenance"] if holder.get("provenance") is not None else None
),
vocab_name=vocab_name,
).to_excel(wb, row_no)
).to_excel(wb, row_no, concepts_by_iri)
row_no += 1

# Write the prefix_map used in the conversion to the prefix sheet.
Expand Down
27 changes: 20 additions & 7 deletions src/voc4cat/convert_043.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,19 @@ def write_prefix_sheet(wb: Workbook, prefix_map):
ws.append([prefix, iri])


def split_multi_iri(cell_value: str|None, prefix_converter: Converter) -> list[str]:
    """
    Split a string of IRIs separated by a comma into a list of IRIs.

    Each comma-separated entry may carry extra text after the IRI, separated
    by whitespace (for example a qualifier such as ``ex:0001 (some label)``).
    Only the first whitespace-delimited token of each entry is used.

    Args:
        cell_value: Raw cell content, or None for an empty cell.
        prefix_converter: Object providing ``expand(iri)``. When that call
            returns a falsy value, the token is kept exactly as read.

    Returns:
        The IRIs in cell order. The list is empty when ``cell_value`` is
        None or contains no non-blank entries.
    """
    if cell_value is None:
        return []
    iris_normalised = []
    for entry in cell_value.split(","):
        tokens = entry.split()
        if not tokens:
            # Blank entry (empty cell text, trailing or doubled comma):
            # there is no IRI to read, and indexing tokens[0] would raise
            # IndexError.
            continue
        # str.split() without arguments already strips surrounding whitespace.
        iri = tokens[0]
        iris_normalised.append(prefix_converter.expand(iri) or iri)
    return iris_normalised


def extract_concepts_and_collections(
q: Worksheet,
r: Worksheet,
Expand All @@ -80,7 +93,7 @@ def extract_concepts_and_collections(
"definition": q[f"D{row}"].value,
"def_language_code": split_and_tidy(q[f"E{row}"].value),
"alt_labels": split_and_tidy(q[f"F{row}"].value),
"children": q[f"G{row}"].value,
"children": split_multi_iri(q[f"G{row}"].value, prefix_converter),
"provenance": q[f"H{row}"].value,
# Note in the new template, source_vocab is synonymous with source vocab uri
"source_vocab": q[f"I{row}"].value,
Expand All @@ -103,11 +116,11 @@ def extract_concepts_and_collections(
raise ConversionError(msg)
data = {
# additional concept features sheets
"related_match": r[f"B{row}"].value,
"close_match": r[f"C{row}"].value,
"exact_match": r[f"D{row}"].value,
"narrow_match": r[f"E{row}"].value,
"broad_match": r[f"F{row}"].value,
"related_match": split_multi_iri(r[f"B{row}"].value, prefix_converter),
"close_match": split_multi_iri(r[f"C{row}"].value, prefix_converter),
"exact_match": split_multi_iri(r[f"D{row}"].value, prefix_converter),
"narrow_match": split_multi_iri(r[f"E{row}"].value, prefix_converter),
"broad_match": split_multi_iri(r[f"F{row}"].value, prefix_converter),
"vocab_name": vocab_name,
}
concept_data[uri].update(**data)
Expand Down Expand Up @@ -137,7 +150,7 @@ def extract_concepts_and_collections(
"uri": s[f"A{row}"].value.split()[0].strip(),
"pref_label": s[f"B{row}"].value,
"definition": s[f"C{row}"].value,
"members": s[f"D{row}"].value,
"members": split_multi_iri(s[f"D{row}"].value, prefix_converter),
"provenance": s[f"E{row}"].value,
"vocab_name": vocab_name,
}
Expand Down
68 changes: 23 additions & 45 deletions src/voc4cat/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,20 @@ def reset_curies(curies_map: dict) -> None:
config.namespace_manager = namespace_manager


def make_iri_qualifier_listing(item, concepts_by_iri):
    """Return listing of item with one "uri (pref.label)" per row.

    Args:
        item: Iterable of IRIs to list.
        concepts_by_iri: Mapping of IRI to a mapping of language code to a
            concept object. The concept stored under "en" must expose
            ``pref_label``, whose first element is used as the qualifier.

    Returns:
        A string with one entry per IRI, entries separated by a comma
        followed by a newline. IRIs that have no "en" concept in
        ``concepts_by_iri`` are listed without a qualifier.
    """
    child_lines = []
    for uri in item:
        uri_str = config.curies_converter.compress(uri, passthrough=True)
        # Look up without indexing: if concepts_by_iri is a defaultdict,
        # concepts_by_iri[uri] would insert an empty entry for every unknown
        # IRI as a side effect, and a plain dict would raise KeyError.
        concepts_by_lang = concepts_by_iri.get(uri, {})
        if "en" not in concepts_by_lang:
            # Not all concepts have all languages; fall back to the bare IRI.
            child_lines.append(f"{uri_str}")
            continue
        pref_label_in_lang = concepts_by_lang["en"].pref_label[0]
        child_lines.append(f"{uri_str} ({pref_label_in_lang})")
    return ",\n".join(child_lines)


# === Pydantic validators used by more than one model ===


Expand Down Expand Up @@ -375,7 +389,7 @@ def to_graph(self):

return g

def to_excel(self, wb: Workbook, row_no_concepts: int, row_no_features: int) -> int:
def to_excel(self, wb: Workbook, row_no_concepts: int, row_no_features: int, concepts_by_iri:dict) -> int:
""" "
Export Concept to Excel using one row per language
Expand Down Expand Up @@ -426,13 +440,7 @@ def to_excel(self, wb: Workbook, row_no_concepts: int, row_no_features: int) ->

first_row_exported = True
ws[f"F{row_no_concepts}"] = ",\n".join(self.alt_labels)
ws[f"G{row_no_concepts}"] = ",\n".join(
[
config.curies_converter.compress(uri, passthrough=True)
for uri in self.children
# TODO add pref_label of children but where to look up? Here we know just the current concept.
]
)
ws[f"G{row_no_concepts}"] = make_iri_qualifier_listing(self.children, concepts_by_iri)
ws[f"I{row_no_concepts}"] = (
config.curies_converter.compress(self.source_vocab, passthrough=True)
if self.source_vocab
Expand All @@ -456,36 +464,11 @@ def to_excel(self, wb: Workbook, row_no_concepts: int, row_no_features: int) ->
) + f" ({pref_labels.get('en', '')})"
ws[f"A{row_no_features}"].hyperlink = self.uri
ws[f"A{row_no_features}"].style = "Hyperlink"
ws[f"B{row_no_features}"] = ",\n".join(
[
config.curies_converter.compress(uri, passthrough=True)
for uri in self.related_match
]
)
ws[f"C{row_no_features}"] = ",\n".join(
[
config.curies_converter.compress(uri, passthrough=True)
for uri in self.close_match
]
)
ws[f"D{row_no_features}"] = ",\n".join(
[
config.curies_converter.compress(uri, passthrough=True)
for uri in self.exact_match
]
)
ws[f"E{row_no_features}"] = ",\n".join(
[
config.curies_converter.compress(uri, passthrough=True)
for uri in self.narrow_match
]
)
ws[f"F{row_no_features}"] = ",\n".join(
[
config.curies_converter.compress(uri, passthrough=True)
for uri in self.broad_match
]
)
ws[f"B{row_no_features}"] = make_iri_qualifier_listing(self.related_match, concepts_by_iri)
ws[f"C{row_no_features}"] = make_iri_qualifier_listing(self.close_match, concepts_by_iri)
ws[f"D{row_no_features}"] = make_iri_qualifier_listing(self.exact_match, concepts_by_iri)
ws[f"E{row_no_features}"] = make_iri_qualifier_listing(self.narrow_match, concepts_by_iri)
ws[f"F{row_no_features}"] = make_iri_qualifier_listing(self.broad_match, concepts_by_iri)

return row_no_concepts

Expand Down Expand Up @@ -529,7 +512,7 @@ def to_graph(self, cs):

return g

def to_excel(self, wb: Workbook, row_no: int):
def to_excel(self, wb: Workbook, row_no: int, concepts_by_iri:dict) -> None:
ws = wb["Collections"]
ws[f"A{row_no}"].value = config.curies_converter.compress(
self.uri, passthrough=True
Expand All @@ -538,12 +521,7 @@ def to_excel(self, wb: Workbook, row_no: int):
ws[f"A{row_no}"].style = "Hyperlink"
ws[f"B{row_no}"] = self.pref_label
ws[f"C{row_no}"] = self.definition
ws[f"D{row_no}"] = ",\n".join(
[
config.curies_converter.compress(uri, passthrough=True)
for uri in self.members
]
)
ws[f"D{row_no}"] = make_iri_qualifier_listing(self.members, concepts_by_iri)
ws[f"E{row_no}"] = self.provenance


Expand Down

0 comments on commit 4676909

Please sign in to comment.