Skip to content

Commit

Permalink
Merge branch 'master' of github.com:ckan/ckanext-dcat
Browse files Browse the repository at this point in the history
  • Loading branch information
amercader committed Oct 31, 2024
2 parents 9ba81ab + bf5cf51 commit e957890
Show file tree
Hide file tree
Showing 39 changed files with 7,266 additions and 132 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,8 @@ jobs:
pip install -r ckanext-harvest/requirements.txt
git clone https://github.com/ckan/ckanext-scheming
pip install -e ckanext-scheming
git clone https://github.com/ckan/ckanext-fluent
pip install -e ckanext-fluent
- name: Setup extension
run: |
ckan -c test.ini db init
Expand Down
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@ To run the tests do:

pytest --ckan-ini=test.ini ckanext/dcat/tests

Note that there are tests relying on having [ckanext-harvest](https://github.com/ckan/ckanext-harvest), [ckanext-scheming](https://github.com/ckan/ckanext-scheming) and [ckanext-fluent](https://github.com/ckan/ckanext-fluent) installed.

## Releases

To create a new release, follow these steps:
Expand Down
4 changes: 2 additions & 2 deletions ckanext/dcat/plugins/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,9 +183,9 @@ def before_dataset_index(self, dataset_dict):
# Index a flattened version
new_key = f'extras_{field["field_name"]}__{key}'
if not dataset_dict.get(new_key):
dataset_dict[new_key] = value
dataset_dict[new_key] = str(value)
else:
dataset_dict[new_key] += ' ' + value
dataset_dict[new_key] += ' ' + str(value)

subfields = dataset_dict.pop(field['field_name'], None)
if field['field_name'] == 'spatial_coverage':
Expand Down
3 changes: 3 additions & 0 deletions ckanext/dcat/profiles/__init__.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
from .base import RDFProfile, CleanedURIRef
from .base import (
CNT,
RDF,
XSD,
SKOS,
RDFS,
DCAT,
DCATAP,
DCATUS,
DCT,
ADMS,
VCARD,
Expand All @@ -21,5 +23,6 @@
from .euro_dcat_ap import EuropeanDCATAPProfile
from .euro_dcat_ap_2 import EuropeanDCATAP2Profile
from .euro_dcat_ap_3 import EuropeanDCATAP3Profile
from .dcat_us_3 import DCATUS3Profile
from .euro_dcat_ap_scheming import EuropeanDCATAPSchemingProfile
from .schemaorg import SchemaOrgProfile
177 changes: 157 additions & 20 deletions ckanext/dcat/profiles/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

from dateutil.parser import parse as parse_date
from rdflib import term, URIRef, BNode, Literal
from rdflib.namespace import Namespace, RDF, XSD, SKOS, RDFS
from rdflib.namespace import Namespace, RDF, XSD, SKOS, RDFS, ORG
from geomet import wkt, InvalidGeoJSONException

from ckantoolkit import config, url_for, asbool, aslist, get_action, ObjectNotFound
Expand All @@ -13,9 +13,11 @@
from ckanext.dcat.utils import DCAT_EXPOSE_SUBCATALOGS
from ckanext.dcat.validators import is_year, is_year_month, is_date

CNT = Namespace("http://www.w3.org/2011/content#")
DCT = Namespace("http://purl.org/dc/terms/")
DCAT = Namespace("http://www.w3.org/ns/dcat#")
DCATAP = Namespace("http://data.europa.eu/r5r/")
DCATUS = Namespace("http://resources.data.gov/ontology/dcat-us#")
ADMS = Namespace("http://www.w3.org/ns/adms#")
VCARD = Namespace("http://www.w3.org/2006/vcard/ns#")
FOAF = Namespace("http://xmlns.com/foaf/0.1/")
Expand All @@ -27,9 +29,11 @@
SPDX = Namespace("http://spdx.org/rdf/terms#")

namespaces = {
"cnt": CNT,
"dct": DCT,
"dcat": DCAT,
"dcatap": DCATAP,
"dcatus": DCATUS,
"adms": ADMS,
"vcard": VCARD,
"foaf": FOAF,
Expand All @@ -39,6 +43,7 @@
"locn": LOCN,
"gsp": GSP,
"owl": OWL,
"org": ORG,
"spdx": SPDX,
}

Expand Down Expand Up @@ -69,7 +74,7 @@ class URIRefOrLiteral(object):
Like CleanedURIRef, this is a factory class.
"""

def __new__(cls, value):
def __new__(cls, value, lang=None):
try:
stripped_value = value.strip()
if isinstance(value, str) and (
Expand All @@ -83,10 +88,10 @@ def __new__(cls, value):
# URI is fine, return the object
return uri_obj
else:
return Literal(value)
return Literal(value, lang=lang)
except Exception:
# In case something goes wrong: use Literal
return Literal(value)
return Literal(value, lang=lang)


class CleanedURIRef(object):
Expand Down Expand Up @@ -123,6 +128,8 @@ class RDFProfile(object):

_dataset_schema = None

_form_languages = None

# Cache for mappings of licenses URL/title to ID built when needed in
# _license().
_licenceregister_cache = None
Expand All @@ -145,6 +152,9 @@ def __init__(self, graph, dataset_type="dataset", compatibility_mode=False):

self.compatibility_mode = compatibility_mode

self._default_lang = config.get("ckan.locale_default", "en")


try:
schema_show = get_action("scheming_dataset_schema_show")
try:
Expand All @@ -157,6 +167,9 @@ def __init__(self, graph, dataset_type="dataset", compatibility_mode=False):
except KeyError:
pass

if self._dataset_schema:
self._form_languages = self._dataset_schema.get("form_languages")

def _datasets(self):
"""
Generator that returns all DCAT datasets on the graph
Expand Down Expand Up @@ -201,21 +214,40 @@ def _object(self, subject, predicate):
return _object
return None

def _object_value(self, subject, predicate):
def _object_value(self, subject, predicate, multilingual=False):
"""
Given a subject and a predicate, returns the value of the object
Both subject and predicate must be rdflib URIRef or BNode objects
If found, the string representation is returned, else an empty string
If multilingual is True, a dict with the language codes as keys will be
returned for each language found. e.g.
{
"en": "Dataset title",
"es": "Título del conjunto de datos"
}
If one of the languages defined in `form_languages` in the schema is not
found in the graph, an empty string will be returned.
{
"en": "Dataset title",
"es": ""
}
"""
default_lang = config.get("ckan.locale_default", "en")
if multilingual:
return self._object_value_multilingual(subject, predicate)
fallback = ""
for o in self.g.objects(subject, predicate):
if isinstance(o, Literal):
if o.language and o.language == default_lang:
if o.language and o.language == self._default_lang:
return str(o)
# Use first object as fallback if no object with the default language is available
# Use first object as fallback if no object with the default
# language is available
elif fallback == "":
fallback = str(o)
elif len(list(self.g.objects(o, RDFS.label))):
Expand All @@ -224,6 +256,31 @@ def _object_value(self, subject, predicate):
return str(o)
return fallback

def _object_value_multilingual(self, subject, predicate):
    """
    Given a subject and a predicate, returns a dict with the language codes
    as keys and the string value of the object in that language as values

    Both subject and predicate must be rdflib URIRef or BNode objects

    Literals are stored under their own language tag, or under the default
    language if they have none. For objects that are not Literals, their
    rdfs:label values are used if there are any, otherwise the string
    representation of the object itself is stored under the default
    language.

    If several values share the same language the last one found wins.

    Languages defined in `form_languages` in the schema that were not found
    in the graph are added with an empty string as value.
    """
    out = {}
    for o in self.g.objects(subject, predicate):

        if isinstance(o, Literal):
            out[o.language or self._default_lang] = str(o)
            continue

        # Query the labels only once and reuse the result
        labels = list(self.g.objects(o, RDFS.label))
        if not labels:
            out[self._default_lang] = str(o)
            continue

        for label in labels:
            # A label is not guaranteed to be a Literal, and URIRef / BNode
            # objects don't have a `language` attribute
            lang = getattr(label, "language", None) or self._default_lang
            out[lang] = str(label)

    for lang in self._form_languages or []:
        out.setdefault(lang, "")

    return out

def _object_value_multiple_predicate(self, subject, predicates):
"""
Given a subject and a list of predicates, returns the value of the object
Expand Down Expand Up @@ -301,10 +358,45 @@ def _object_value_list(self, subject, predicate):
Both subject and predicate must be rdflib URIRef or BNode objects
If no values found, returns an empty string
If no values found, returns an empty list
"""
return [str(o) for o in self.g.objects(subject, predicate)]

def _object_value_list_multilingual(self, subject, predicate):
    """
    Given a subject and a predicate, returns a dict with the language codes
    as keys and the list of object values as values. e.g.

    {
        "en": ["Oaks", "Pines"],
        "es": ["Robles", "Pinos"],
    }

    If one of the languages defined in `form_languages` in the schema is not
    found in the graph, an empty list will be returned for it.

    {
        "en": ["Oaks", "Pines"],
        "es": [],
    }

    Objects without a language (untagged Literals, URIRefs, BNodes) are
    added to the list of the default language.

    Both subject and predicate must be rdflib URIRef or BNode objects

    If no values are found and no `form_languages` are defined, returns an
    empty dict
    """
    out = {}
    for o in self.g.objects(subject, predicate):
        # Only Literals have a `language` attribute, accessing it directly
        # on a URIRef or BNode raises an AttributeError
        lang = getattr(o, "language", None) or self._default_lang
        out.setdefault(lang, []).append(str(o))

    for lang in self._form_languages or []:
        out.setdefault(lang, [])

    return out

def _get_vcard_property_value(
self, subject, predicate, predicate_string_property=None
):
Expand Down Expand Up @@ -718,6 +810,9 @@ def _read_list_value(self, value):
items = value.split(",")
else:
items = [value] # Normal text value
elif isinstance(value, ((int, float, complex))):
items = [value] # number

return items

def _add_spatial_value_to_graph(self, spatial_ref, predicate, value):
Expand Down Expand Up @@ -786,18 +881,25 @@ def _add_statement_to_graph(self, data_dict, key, subject, predicate, _class=Non
"""
value = self._get_dict_value(data_dict, key)
if value:
_object = URIRefOrLiteral(value)
if isinstance(_object, Literal):
statement_ref = BNode()
self.g.add((subject, predicate, statement_ref))
if _class:
self.g.add((statement_ref, RDF.type, _class))
self.g.add((statement_ref, RDFS.label, _object))

if isinstance(value, dict):
_objects = []
for lang in value:
_objects.append(URIRefOrLiteral(value[lang], lang))
else:
self.g.add((subject, predicate, _object))
if _class:
self.g.add((_object, RDF.type, _class))
_objects = [URIRefOrLiteral(value)]
statement_ref = None
for _object in _objects:
if isinstance(_object, Literal):
if not statement_ref:
statement_ref = BNode()
self.g.add((subject, predicate, statement_ref))
if _class:
self.g.add((statement_ref, RDF.type, _class))
self.g.add((statement_ref, RDFS.label, _object))
else:
self.g.add((subject, predicate, _object))
if _class:
self.g.add((_object, RDF.type, _class))

def _schema_field(self, key):
"""
Expand All @@ -823,6 +925,32 @@ def _schema_resource_field(self, key):
if field["field_name"] == key:
return field

def _multilingual_dataset_fields(self):
"""
Return a list of field names in the dataset schema that have multilingual
values (i.e. fields with at least one validator whose name starts with
"fluent")
"""
return self._multilingual_fields(entity="dataset")

def _multilingual_resource_fields(self):
"""
Return a list of field names in the resource schema that have multilingual
values (i.e. fields with at least one validator whose name starts with
"fluent")
"""
return self._multilingual_fields(entity="resource")

def _multilingual_fields(self, entity="dataset"):
    """
    Return a list of field names of the given entity ("dataset" or
    "resource") that have multilingual values

    A field is considered multilingual if at least one of the validators
    listed in its `validators` key starts with "fluent".

    Returns an empty list if there is no schema available, or if the
    schema does not define any fields for the entity.
    """
    if not self._dataset_schema:
        return []

    out = []
    # Don't fail on schemas that lack the `<entity>_fields` key
    for field in self._dataset_schema.get(f"{entity}_fields") or []:
        validators = field.get("validators") or ""
        if any(v.startswith("fluent") for v in validators.split()):
            out.append(field["field_name"])
    return out

def _set_dataset_value(self, dataset_dict, key, value):
"""
Sets the value for a given key in a CKAN dataset dict
Expand Down Expand Up @@ -949,7 +1077,16 @@ def _add_triple_from_dict(
elif value and date_value:
self._add_date_triple(subject, predicate, value, _type)
elif value:
# If it is a dict, we assume it's a fluent multilingual field
if isinstance(value, dict):
# We assume that all translated field values are Literals
for lang, translated_value in value.items():
object = Literal(translated_value, datatype=_datatype, lang=lang)
self.g.add((subject, predicate, object))
return

# Normal text value

# ensure URIRef items are preprocessed (space removal/url encoding)
if _type == URIRef:
_type = CleanedURIRef
Expand Down
Loading

0 comments on commit e957890

Please sign in to comment.