Skip to content

Commit

Permalink
Merge branch 'develop'
Browse files Browse the repository at this point in the history
  • Loading branch information
DaniFdezAlvarez committed May 9, 2024
2 parents 3cb395f + b0cac9c commit 809f506
Show file tree
Hide file tree
Showing 17 changed files with 132 additions and 13 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,7 @@ You must provide at least an input: a file, a string, an endpoint, a remote grap
* disable_endpoint_cache (default False). By default, if sheXer is told to consume triples from an endpoint, it will make some SPARQL queries and store the results in a local graph. If this parameter is set to True, sheXer won't save that content locally. This will help to reduce main memory usage, but will decrease the performance, as sheXer will need to make more SPARQL queries to the endpoint.
* namespaces_dict (default None): dictionary in which the keys are namespaces and the values are their expected prefixes in the outputs.
* input_format (default "NT"): the format of the graph which is going to be computed. The default value is const.NT. IMPORTANT: currently, sheXer does not guess input format, so ensure you specify the format here in case you are not providing n-triples content. In case you provide a combined input (several files, several URLs...) they all should have the same format. If you work against an endpoint, then this param does not have any effect.
* compression_mode (default None). Only when you are working with local files, if they are compressed, you do not need to uncompress to parse them. Currently supported formats are ZIP and GZ. Set compression_format to "zip" or "gz" to work with such files. Each gz file will be assumed to contain a single graph file. Each zip file will be assumed to be a directory containing one or more graph files. In case the zip contains several files, they will be all parsed and merged (they should have the same format, indicated with input_format). In every case, sheXer won't write any uncompressed content to your disk.
* compression_mode (default None). Only when you are working with local files, if they are compressed, you do not need to uncompress them to parse them. Currently supported formats are ZIP, GZ, and XZ. Set compression_mode to "zip", "gz", or "xz" to work with such files. Each gz or xz file will be assumed to contain a single graph file. Each zip file will be assumed to be a directory containing one or more graph files. In case the zip contains several files, they will be all parsed and merged (they should have the same format, indicated with input_format). In every case, sheXer won't write any uncompressed content to your disk.

#### Params to tune the shexing process

Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@ rdflib==6.0.2
SPARQLWrapper==1.8.4
wlighter==1.0.1
plantuml==0.3.0
python-xz==0.5.0
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,12 @@ def read(file_path):
setup(
name = 'shexer',
packages = find_packages(exclude=["*.local_code.*"]), # this must be the same as the name above
version = '2.5.0',
version = '2.5.1',
description = 'Automatic schema extraction for RDF graphs',
author = 'Daniel Fernandez-Alvarez',
author_email = '[email protected]',
url = 'https://github.com/DaniFdezAlvarez/shexer',
download_url = 'https://github.com/DaniFdezAlvarez/shexer/archive/2.5.0.tar.gz',
download_url = 'https://github.com/DaniFdezAlvarez/shexer/archive/2.5.1.tar.gz',
keywords = ['testing', 'shexer', 'shexerp3', "rdf", "shex", "shacl", "schema"],
long_description = read('README.md'),
long_description_content_type='text/markdown',
Expand Down
1 change: 1 addition & 0 deletions shexer/consts.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
#COMPRESSION FORMATS
ZIP = "zip"
GZ = "gz"
XZ = "xz"

# FREQUENCY MODES

Expand Down
5 changes: 4 additions & 1 deletion shexer/io/graph/yielder/base_triples_yielder.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,9 @@
from shexer.io.line_reader.raw_string_line_reader import RawStringLineReader
from shexer.io.line_reader.gz_line_reader import GzFileLineReader
from shexer.io.line_reader.zip_file_line_reader import ZipFileLineReader
from shexer.io.line_reader.xz_line_reader import XzFileLineReader
from shexer.utils.obj_references import check_just_one_not_none
from shexer.consts import ZIP, GZ
from shexer.consts import ZIP, GZ, XZ

class BaseTriplesYielder(object):

Expand All @@ -25,6 +26,8 @@ def _decide_line_reader(self, raw_graph, source_file,
elif compression_mode == ZIP:
return ZipFileLineReader(zip_archive=zip_base_archive,
zip_target=source_file)
elif compression_mode == XZ:
return XzFileLineReader(xz_file=source_file)
else:
raise ValueError("Unsupported compression mode: {}".format(compression_mode))

Expand Down
6 changes: 4 additions & 2 deletions shexer/io/graph/yielder/rdflib_triple_yielder.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
from rdflib.graph import Graph, URIRef, Literal, BNode
from shexer.io.graph.yielder.base_triples_yielder import BaseTriplesYielder
from shexer.consts import N3, TURTLE, RDF_XML, NT, JSON_LD, ZIP, GZ
from shexer.consts import N3, TURTLE, RDF_XML, NT, JSON_LD, ZIP, GZ, XZ

from shexer.model.Literal import Literal as model_Literal
from shexer.model.IRI import IRI as model_IRI
from shexer.model.bnode import BNode as model_BNode
from shexer.model.property import Property as model_Property

from shexer.utils.uri import decide_literal_type
from shexer.utils.compression import get_content_gz_file, get_content_zip_internal_file
from shexer.utils.compression import get_content_gz_file, get_content_zip_internal_file, get_content_xz_file

_SUPPORTED_FORMATS = [N3, TURTLE, RDF_XML, NT, JSON_LD]

Expand Down Expand Up @@ -155,6 +155,8 @@ def _parse_compressed_files(self, rdflib_graph):
rdflib_graph.parse(data=get_content_zip_internal_file(base_archive=self._zip_archive_file,
target_file=self._source),
format=self._input_format)
elif self._compression_mode == XZ:
rdflib_graph.parse(data=get_content_xz_file(self._source), format=self._input_format)
else:
raise ValueError("Unknown compression format")

Expand Down
1 change: 0 additions & 1 deletion shexer/io/line_reader/gz_line_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,4 @@ def __init__(self, gz_file):
def read_lines(self):
    """Yield the lines of the gzip file one by one, decoded as UTF-8.

    The file referenced by ``self._gz_file`` is opened with ``gzip.open``
    in mode ``"r"``, so each item read from the stream is a ``bytes``
    line that is decoded before being yielded. The stream is only opened
    once the generator starts being consumed.

    :return: generator of ``str`` lines.
    """
    with gzip.open(self._gz_file, "r") as in_stream:
        for a_line in in_stream:
            # Decode exactly once per line (a leftover local used to
            # decode every line twice and discard the first result).
            yield a_line.decode("utf-8")
12 changes: 12 additions & 0 deletions shexer/io/line_reader/xz_line_reader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
from xz import open as xzopen


class XzFileLineReader(object):
    """Line reader that walks the content of an xz-compressed file.

    The value received in the constructor is handed, untouched, to
    ``xz.open`` (imported in this module as ``xzopen``) when the lines
    are requested.
    """

    def __init__(self, xz_file):
        # Nothing is opened here: the reference is just stored, and the
        # file is only opened when read_lines() is iterated.
        self._xz_file = xz_file

    def read_lines(self):
        """Yield every line of the xz file decoded as a UTF-8 string.

        :return: generator of ``str`` lines.
        """
        with xzopen(self._xz_file, "r") as compressed_stream:
            for raw_line in compressed_stream:
                decoded_line = raw_line.decode("utf-8")
                yield decoded_line
6 changes: 3 additions & 3 deletions shexer/shaper.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from shexer.utils.obj_references import check_just_one_not_none

from shexer.consts import SHEXC, SHACL_TURTLE, NT, TSV_SPO, N3, TURTLE, TURTLE_ITER, \
RDF_XML, FIXED_SHAPE_MAP, JSON_LD, RDF_TYPE, SHAPES_DEFAULT_NAMESPACE, ZIP, GZ, \
RDF_XML, FIXED_SHAPE_MAP, JSON_LD, RDF_TYPE, SHAPES_DEFAULT_NAMESPACE, ZIP, GZ, XZ, \
ALL_EXAMPLES, CONSTRAINT_EXAMPLES, SHAPE_EXAMPLES
from shexer.utils.factories.class_profiler_factory import get_class_profiler
from shexer.utils.factories.instance_tracker_factory import get_instance_tracker
Expand Down Expand Up @@ -430,11 +430,11 @@ def _check_input_format(input_format):

@staticmethod
def _check_compression_mode(compression_mode, url_endpoint, url_graph_input, list_of_url_input):
if compression_mode not in [ZIP, GZ, None]:
if compression_mode not in [ZIP, GZ, XZ, None]:
raise ValueError("Unknownk compression mode: {}. "
"The currently supported compression formats are {}.".format(
compression_mode,
", ".join([ZIP, GZ])))
", ".join([ZIP, GZ, XZ])))
if compression_mode is not None and (url_endpoint is not None or url_graph_input is not None or list_of_url_input is not None):
raise ValueError("You've chosed some compression mode ({}) to work with remote sources."
"Currently, sheXer can only parse compressed local files".format(compression_mode))
Expand Down
11 changes: 9 additions & 2 deletions shexer/utils/compression.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,15 @@
import gzip
from gzip import open as gzopen
from zipfile import ZipFile
from xz import open as xzopen


def get_content_xz_file(xz_path):
    """Read an xz-compressed file in one go and return what ``read()`` gives.

    :param xz_path: reference to the xz file; it is passed as-is to
        ``xz.open`` (imported here as ``xzopen``) with mode ``"r"``.
    :return: the complete uncompressed content returned by the stream.
    """
    with xzopen(xz_path, "r") as compressed_stream:
        whole_content = compressed_stream.read()
    return whole_content


def get_content_gz_file(gz_path):
    """Return the whole uncompressed content of a gzip file.

    The file is opened a single time with ``gzip.open`` (imported in this
    module as ``gzopen``) in mode ``"r"``, so the result is a ``bytes``
    object holding everything the stream contains.

    :param gz_path: path of the gzip file to read.
    :return: uncompressed content as ``bytes``.
    """
    with gzopen(gz_path, "r") as in_stream:
        return in_stream.read()


Expand Down
Binary file added test/t_files/compression/t_graph_1.json.xz
Binary file not shown.
Binary file added test/t_files/compression/t_graph_1.n3.xz
Binary file not shown.
Binary file added test/t_files/compression/t_graph_1.nt.xz
Binary file not shown.
Binary file added test/t_files/compression/t_graph_1.tsv.xz
Binary file not shown.
Binary file added test/t_files/compression/t_graph_1.ttl.xz
Binary file not shown.
Binary file added test/t_files/compression/t_graph_1.xml.xz
Binary file not shown.
96 changes: 95 additions & 1 deletion test/test_compression_mode.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from test.t_utils import file_vs_str_tunned_comparison
import os.path as pth

from shexer.consts import TURTLE_ITER, GZ, ZIP, N3, TURTLE, RDF_XML, TSV_SPO, NT, JSON_LD
from shexer.consts import TURTLE_ITER, GZ, ZIP, XZ, N3, TURTLE, RDF_XML, TSV_SPO, NT, JSON_LD



Expand Down Expand Up @@ -199,6 +199,100 @@ def test_json_zip(self):
self.assertTrue(file_vs_str_tunned_comparison(file_path=G1_ALL_CLASSES_NO_COMMENTS,
str_target=str_result))


######################## xz

def _assert_xz_input_yields_expected_shapes(self, file_name, input_format):
    """Shex an xz-compressed fixture and compare it with the reference shapes.

    Shared body of every xz test below: they only differ in the fixture
    they read and in the input format they declare. The name does not
    start with ``test`` so the test loader does not collect it as a case.

    :param file_name: name of the fixture, appended to ``_BASE_DIR``.
    :param input_format: value passed to ``Shaper`` as ``input_format``.
    """
    shaper = Shaper(
        graph_file_input=_BASE_DIR + file_name,
        namespaces_dict=default_namespaces(),
        all_classes_mode=True,
        input_format=input_format,
        disable_comments=True,
        compression_mode=XZ
    )
    str_result = shaper.shex_graph(string_output=True)
    self.assertTrue(file_vs_str_tunned_comparison(file_path=G1_ALL_CLASSES_NO_COMMENTS,
                                                  str_target=str_result))

def test_ttl_iter_xz(self):
    # Turtle fixture declared as TURTLE_ITER.
    self._assert_xz_input_yields_expected_shapes("t_graph_1.ttl.xz", TURTLE_ITER)

def test_n3_xz(self):
    self._assert_xz_input_yields_expected_shapes("t_graph_1.n3.xz", N3)

def test_json_xz(self):
    self._assert_xz_input_yields_expected_shapes("t_graph_1.json.xz", JSON_LD)

def test_ttl_rdflib_xz(self):
    # Same turtle fixture as test_ttl_iter_xz, declared as TURTLE instead.
    self._assert_xz_input_yields_expected_shapes("t_graph_1.ttl.xz", TURTLE)

def test_xml_xz(self):
    self._assert_xz_input_yields_expected_shapes("t_graph_1.xml.xz", RDF_XML)

def test_tsv_spo_xz(self):
    self._assert_xz_input_yields_expected_shapes("t_graph_1.tsv.xz", TSV_SPO)

def test_nt_xz(self):
    self._assert_xz_input_yields_expected_shapes("t_graph_1.nt.xz", NT)

######################## Wrong params

def test_unknown_mode(self):
Expand Down

0 comments on commit 809f506

Please sign in to comment.