Merge branch 'develop'

weso · May 9, 2024 · 809f506 · 809f506
2 parents 3cb395f + b0cac9c
commit 809f506
Show file tree

Hide file tree

Showing 17 changed files with 132 additions and 13 deletions.
diff --git a/README.md b/README.md
@@ -192,7 +192,7 @@ You must provide at least an input: a file, a string, an endpoint, a remote grap
 * disable_endpoint_cache (default False). By default, if sheXer is told to consume triples from an endpoint, it will make some SPARQL queries and store the results in a local graph. If this parameter is set to True, sheXer won't save that content locally. This will help to reduce main memory usage, but will decrease the performance, as sheXer will need to make more SPARQL queries to the endpoint.
 * namespaces_dict (default None): dictionary in which the keys are namespaces and the values are their expected prefixes in the outputs. 
 * input_format (default "NT"): the format of the graph which is going to be computed. The default value is const.NT. IMPORTANT: currently, sheXer does not guess input format, so ensure you specify the format here in case you are not providing n-triples content. In case you provide a combined input (several files, several URLs...) they all should have the same format. If you work against an endpoint, then this param does not have any effect.
-* compression_mode (default None). Only when you are working with local files, if they are compressed, you do not need to uncompress to parse them. Currently supported formats are ZIP and GZ. Set compression_format to "zip" or "gz" to work with such files. Each gz file will be assumed to contain a single graph file. Each zip file will be assumed to be a directory containing one or more graph files. In case the zip contains several files, they will be all parsed and merged (they should have the same format, indicated with input_format). In every case, sheXer won't write any uncompressed content to your disk.
+* compression_mode (default None). Only when you are working with local files, if they are compressed, you do not need to uncompress to parse them. Currently supported formats are ZIP, GZ, and XZ. Set compression_mode to "zip", "gz", or "xz" to work with such files. Each gz or xz file will be assumed to contain a single graph file. Each zip file will be assumed to be a directory containing one or more graph files. In case the zip contains several files, they will be all parsed and merged (they should have the same format, indicated with input_format). In every case, sheXer won't write any uncompressed content to your disk.
 
 #### Params to tune the shexing process
 

diff --git a/requirements.txt b/requirements.txt
@@ -4,3 +4,4 @@ rdflib==6.0.2
 SPARQLWrapper==1.8.4
 wlighter==1.0.1
 plantuml==0.3.0
+python-xz==0.5.0
diff --git a/setup.py b/setup.py
@@ -8,12 +8,12 @@ def read(file_path):
 setup(
   name = 'shexer',
   packages = find_packages(exclude=["*.local_code.*"]), # this must be the same as the name above
-  version = '2.5.0',
+  version = '2.5.1',
   description = 'Automatic schema extraction for RDF graphs',
   author = 'Daniel Fernandez-Alvarez',
   author_email = '[email protected]',
   url = 'https://github.com/DaniFdezAlvarez/shexer',
-  download_url = 'https://github.com/DaniFdezAlvarez/shexer/archive/2.5.0.tar.gz',
+  download_url = 'https://github.com/DaniFdezAlvarez/shexer/archive/2.5.1.tar.gz',
   keywords = ['testing', 'shexer', 'shexerp3', "rdf", "shex", "shacl", "schema"],
   long_description = read('README.md'),
   long_description_content_type='text/markdown',

diff --git a/shexer/consts.py b/shexer/consts.py
@@ -25,6 +25,7 @@
 #COMPRESSION FORMATS
 ZIP = "zip"
 GZ = "gz"
+XZ = "xz"
 
 # FREQUENCY MODES
 

diff --git a/shexer/io/graph/yielder/base_triples_yielder.py b/shexer/io/graph/yielder/base_triples_yielder.py
@@ -3,8 +3,9 @@
 from shexer.io.line_reader.raw_string_line_reader import RawStringLineReader
 from shexer.io.line_reader.gz_line_reader import GzFileLineReader
 from shexer.io.line_reader.zip_file_line_reader import ZipFileLineReader
+from shexer.io.line_reader.xz_line_reader import XzFileLineReader
 from shexer.utils.obj_references import check_just_one_not_none
-from shexer.consts import ZIP, GZ
+from shexer.consts import ZIP, GZ, XZ
 
 class BaseTriplesYielder(object):
 
@@ -25,6 +26,8 @@ def _decide_line_reader(self, raw_graph, source_file,
         elif compression_mode == ZIP:
             return ZipFileLineReader(zip_archive=zip_base_archive,
                                      zip_target=source_file)
+        elif compression_mode == XZ:
+            return XzFileLineReader(xz_file=source_file)
         else:
             raise ValueError("Unsupported compression mode: {}".format(compression_mode))
 

diff --git a/shexer/io/graph/yielder/rdflib_triple_yielder.py b/shexer/io/graph/yielder/rdflib_triple_yielder.py
@@ -1,14 +1,14 @@
 from rdflib.graph import Graph, URIRef, Literal, BNode
 from shexer.io.graph.yielder.base_triples_yielder import BaseTriplesYielder
-from shexer.consts import N3, TURTLE, RDF_XML, NT, JSON_LD, ZIP, GZ
+from shexer.consts import N3, TURTLE, RDF_XML, NT, JSON_LD, ZIP, GZ, XZ
 
 from shexer.model.Literal import Literal as model_Literal
 from shexer.model.IRI import IRI as model_IRI
 from shexer.model.bnode import BNode as model_BNode
 from shexer.model.property import Property as model_Property
 
 from shexer.utils.uri import decide_literal_type
-from shexer.utils.compression import get_content_gz_file, get_content_zip_internal_file
+from shexer.utils.compression import get_content_gz_file, get_content_zip_internal_file, get_content_xz_file
 
 _SUPPORTED_FORMATS = [N3, TURTLE, RDF_XML, NT, JSON_LD]
 
@@ -155,6 +155,8 @@ def _parse_compressed_files(self, rdflib_graph):
             rdflib_graph.parse(data=get_content_zip_internal_file(base_archive=self._zip_archive_file,
                                                                   target_file=self._source),
                                format=self._input_format)
+        elif self._compression_mode == XZ:
+            rdflib_graph.parse(data=get_content_xz_file(self._source), format=self._input_format)
         else:
             raise ValueError("Unknown compression format")
 

diff --git a/shexer/io/line_reader/gz_line_reader.py b/shexer/io/line_reader/gz_line_reader.py
@@ -8,5 +8,4 @@ def __init__(self, gz_file):
     def read_lines(self):  # generator: yields every line of the gzip stream as a UTF-8 decoded string
         with gzip.open(self._gz_file, "r") as in_stream:
             for a_line in in_stream:
-                a = a_line.decode("utf-8")
                 yield a_line.decode("utf-8")
diff --git a/shexer/io/line_reader/xz_line_reader.py b/shexer/io/line_reader/xz_line_reader.py
@@ -0,0 +1,12 @@
+from xz import open as xzopen
+
+
+class XzFileLineReader(object):  # line reader that yields the lines of a file opened through xz.open
+
+    def __init__(self, xz_file):  # xz_file: value later handed to xzopen() as the file to read
+        self._xz_file = xz_file
+
+    def read_lines(self):  # generator: yields one decoded string per line of the stream
+        with xzopen(self._xz_file, "r") as in_stream:  # the with-block scopes the stream to this iteration
+            for a_line in in_stream:
+                yield a_line.decode("utf-8")  # every item is decoded as UTF-8 before being yielded
diff --git a/shexer/shaper.py b/shexer/shaper.py
@@ -1,7 +1,7 @@
 from shexer.utils.obj_references import check_just_one_not_none
 
 from shexer.consts import SHEXC, SHACL_TURTLE, NT, TSV_SPO, N3, TURTLE, TURTLE_ITER, \
-    RDF_XML, FIXED_SHAPE_MAP, JSON_LD, RDF_TYPE, SHAPES_DEFAULT_NAMESPACE, ZIP, GZ, \
+    RDF_XML, FIXED_SHAPE_MAP, JSON_LD, RDF_TYPE, SHAPES_DEFAULT_NAMESPACE, ZIP, GZ, XZ, \
     ALL_EXAMPLES, CONSTRAINT_EXAMPLES, SHAPE_EXAMPLES
 from shexer.utils.factories.class_profiler_factory import get_class_profiler
 from shexer.utils.factories.instance_tracker_factory import get_instance_tracker
@@ -430,11 +430,11 @@ def _check_input_format(input_format):
 
     @staticmethod
     def _check_compression_mode(compression_mode, url_endpoint, url_graph_input, list_of_url_input):
-        if compression_mode not in [ZIP, GZ, None]:
+        if compression_mode not in [ZIP, GZ, XZ, None]:
             raise ValueError("Unknownk compression mode: {}. "
                              "The currently supported compression formats are {}.".format(
                 compression_mode,
-                ", ".join([ZIP, GZ])))
+                ", ".join([ZIP, GZ, XZ])))
         if compression_mode is not None and (url_endpoint is not None or url_graph_input is not None or list_of_url_input is not None):
             raise ValueError("You've chosed some compression mode ({}) to work with remote sources."
                              "Currently, sheXer can only parse compressed local files".format(compression_mode))

diff --git a/shexer/utils/compression.py b/shexer/utils/compression.py
@@ -1,8 +1,15 @@
-import gzip
+from gzip import open as gzopen
 from zipfile import ZipFile
+from xz import open as xzopen
+
+
+def get_content_xz_file(xz_path):  # returns whatever a single read() on the xz stream opened at xz_path yields
+    with xzopen(xz_path, "r") as in_stream:
+        return in_stream.read()  # one read() call with no size argument
+
 
 def get_content_gz_file(gz_path):  # returns whatever a single read() on the gzip stream opened at gz_path yields
-    with gzip.open(gz_path, "r") as in_stream:
+    with gzopen(gz_path, "r") as in_stream:
         return in_stream.read()
 
 

diff --git a/test/t_files/compression/t_graph_1.json.xz b/test/t_files/compression/t_graph_1.json.xz
diff --git a/test/t_files/compression/t_graph_1.n3.xz b/test/t_files/compression/t_graph_1.n3.xz
diff --git a/test/t_files/compression/t_graph_1.nt.xz b/test/t_files/compression/t_graph_1.nt.xz
diff --git a/test/t_files/compression/t_graph_1.tsv.xz b/test/t_files/compression/t_graph_1.tsv.xz
diff --git a/test/t_files/compression/t_graph_1.ttl.xz b/test/t_files/compression/t_graph_1.ttl.xz
diff --git a/test/t_files/compression/t_graph_1.xml.xz b/test/t_files/compression/t_graph_1.xml.xz
diff --git a/test/test_compression_mode.py b/test/test_compression_mode.py
@@ -4,7 +4,7 @@
 from test.t_utils import file_vs_str_tunned_comparison
 import os.path as pth
 
-from shexer.consts import TURTLE_ITER, GZ, ZIP, N3, TURTLE, RDF_XML, TSV_SPO, NT, JSON_LD
+from shexer.consts import TURTLE_ITER, GZ, ZIP, XZ, N3, TURTLE, RDF_XML, TSV_SPO, NT, JSON_LD
 
 
 
@@ -199,6 +199,100 @@ def test_json_zip(self):
         self.assertTrue(file_vs_str_tunned_comparison(file_path=G1_ALL_CLASSES_NO_COMMENTS,
                                                       str_target=str_result))
 
+
+    ########################  xz
+
+    def test_ttl_iter_xz(self):  # t_graph_1.ttl.xz read with input_format=TURTLE_ITER and compression_mode=XZ
+        shaper = Shaper(
+            graph_file_input=_BASE_DIR + "t_graph_1.ttl.xz",
+            namespaces_dict=default_namespaces(),
+            all_classes_mode=True,
+            input_format=TURTLE_ITER,
+            disable_comments=True,
+            compression_mode=XZ
+        )
+        str_result = shaper.shex_graph(string_output=True)  # shapes are returned as a string instead of written to a file
+        self.assertTrue(file_vs_str_tunned_comparison(file_path=G1_ALL_CLASSES_NO_COMMENTS,  # compared against the G1_ALL_CLASSES_NO_COMMENTS reference file
+                                                      str_target=str_result))
+
+    def test_n3_xz(self):  # t_graph_1.n3.xz read with input_format=N3 and compression_mode=XZ
+        shaper = Shaper(
+            graph_file_input=_BASE_DIR + "t_graph_1.n3.xz",
+            namespaces_dict=default_namespaces(),
+            all_classes_mode=True,
+            input_format=N3,
+            disable_comments=True,
+            compression_mode=XZ
+        )
+        str_result = shaper.shex_graph(string_output=True)  # shapes are returned as a string instead of written to a file
+        self.assertTrue(file_vs_str_tunned_comparison(file_path=G1_ALL_CLASSES_NO_COMMENTS,  # compared against the G1_ALL_CLASSES_NO_COMMENTS reference file
+                                                      str_target=str_result))
+
+    def test_json_xz(self):  # t_graph_1.json.xz read with input_format=JSON_LD and compression_mode=XZ
+        shaper = Shaper(
+            graph_file_input=_BASE_DIR + "t_graph_1.json.xz",
+            namespaces_dict=default_namespaces(),
+            all_classes_mode=True,
+            input_format=JSON_LD,
+            disable_comments=True,
+            compression_mode=XZ
+        )
+        str_result = shaper.shex_graph(string_output=True)  # shapes are returned as a string instead of written to a file
+        self.assertTrue(file_vs_str_tunned_comparison(file_path=G1_ALL_CLASSES_NO_COMMENTS,  # compared against the G1_ALL_CLASSES_NO_COMMENTS reference file
+                                                      str_target=str_result))
+
+    def test_ttl_rdflib_xz(self):  # t_graph_1.ttl.xz read with input_format=TURTLE (not TURTLE_ITER) and compression_mode=XZ
+        shaper = Shaper(
+            graph_file_input=_BASE_DIR + "t_graph_1.ttl.xz",
+            namespaces_dict=default_namespaces(),
+            all_classes_mode=True,
+            input_format=TURTLE,
+            disable_comments=True,
+            compression_mode=XZ
+        )
+        str_result = shaper.shex_graph(string_output=True)  # shapes are returned as a string instead of written to a file
+        self.assertTrue(file_vs_str_tunned_comparison(file_path=G1_ALL_CLASSES_NO_COMMENTS,  # compared against the G1_ALL_CLASSES_NO_COMMENTS reference file
+                                                      str_target=str_result))
+
+    def test_xml_xz(self):  # t_graph_1.xml.xz read with input_format=RDF_XML and compression_mode=XZ
+        shaper = Shaper(
+            graph_file_input=_BASE_DIR + "t_graph_1.xml.xz",
+            namespaces_dict=default_namespaces(),
+            all_classes_mode=True,
+            input_format=RDF_XML,
+            disable_comments=True,
+            compression_mode=XZ
+        )
+        str_result = shaper.shex_graph(string_output=True)  # shapes are returned as a string instead of written to a file
+        self.assertTrue(file_vs_str_tunned_comparison(file_path=G1_ALL_CLASSES_NO_COMMENTS,  # compared against the G1_ALL_CLASSES_NO_COMMENTS reference file
+                                                      str_target=str_result))
+
+    def test_tsv_spo_xz(self):  # t_graph_1.tsv.xz read with input_format=TSV_SPO and compression_mode=XZ
+        shaper = Shaper(
+            graph_file_input=_BASE_DIR + "t_graph_1.tsv.xz",
+            namespaces_dict=default_namespaces(),
+            all_classes_mode=True,
+            input_format=TSV_SPO,
+            disable_comments=True,
+            compression_mode=XZ
+        )
+        str_result = shaper.shex_graph(string_output=True)  # shapes are returned as a string instead of written to a file
+        self.assertTrue(file_vs_str_tunned_comparison(file_path=G1_ALL_CLASSES_NO_COMMENTS,  # compared against the G1_ALL_CLASSES_NO_COMMENTS reference file
+                                                      str_target=str_result))
+
+    def test_nt_xz(self):  # t_graph_1.nt.xz read with input_format=NT and compression_mode=XZ
+        shaper = Shaper(
+            graph_file_input=_BASE_DIR + "t_graph_1.nt.xz",
+            namespaces_dict=default_namespaces(),
+            all_classes_mode=True,
+            input_format=NT,
+            disable_comments=True,
+            compression_mode=XZ
+        )
+        str_result = shaper.shex_graph(string_output=True)  # shapes are returned as a string instead of written to a file
+        self.assertTrue(file_vs_str_tunned_comparison(file_path=G1_ALL_CLASSES_NO_COMMENTS,  # compared against the G1_ALL_CLASSES_NO_COMMENTS reference file
+                                                      str_target=str_result))
+
     ########################  Wrong params
 
     def test_unknown_mode(self):