-
Notifications
You must be signed in to change notification settings - Fork 566
/
Copy pathberkeleydb_example.py
138 lines (105 loc) · 4.04 KB
/
berkeleydb_example.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
"""
BerkeleyDB in use as a persistent Graph store.
Example 1: simple actions
* creating a ConjunctiveGraph using the BerkeleyDB Store
* adding triples to it
* counting them
* closing the store, emptying the graph
* re-opening the store using the same DB files
* getting the same count of triples as before
Example 2: larger data
* loads multiple graphs downloaded from GitHub into a BerkeleyDB-backed graph stored in the folder gsq_vocabs.
* does not delete the DB at the end so you can see it on disk
"""
import os
import tempfile
from rdflib import ConjunctiveGraph, Literal, Namespace
from rdflib.plugins.stores.berkeleydb import has_bsddb
from rdflib.store import NO_STORE, VALID_STORE
def example_1():
    """Create a ConjunctiveGraph and perform some BerkeleyDB tasks with it.

    Opens (creating it if needed) a BerkeleyDB store in a temporary location,
    adds two triples, closes the store, re-opens it from the same DB files and
    prints the triple count again. The temporary DB files are removed when the
    function finishes, including when one of the steps raises.
    """
    # Keep the DB in a sub-folder of a managed temporary directory: the
    # sub-folder does not exist yet, so the "create" branch below is taken,
    # and the context manager removes every DB file on exit, even on error.
    with tempfile.TemporaryDirectory() as tmp_dir:
        path = os.path.join(tmp_dir, "db")

        # Declare we are using a BerkeleyDB Store
        graph = ConjunctiveGraph("BerkeleyDB")

        # Open previously created store, or create it if it doesn't exist yet
        # (always doesn't exist in this example as using temp file location)
        rt = graph.open(path, create=False)

        if rt == NO_STORE:
            # There is no underlying BerkeleyDB infrastructure, so create it
            print("Creating new DB")
            graph.open(path, create=True)
        else:
            print("Using existing DB")
            assert rt == VALID_STORE, "The underlying store is corrupt"

        try:
            print("Triples in graph before add:", len(graph))
            print("(will always be 0 when using temp file for DB)")

            # Now we'll add some triples to the graph & commit the changes
            EG = Namespace("http://example.net/test/")  # noqa: N806
            graph.bind("eg", EG)

            graph.add((EG["pic:1"], EG.name, Literal("Jane & Bob")))
            graph.add((EG["pic:2"], EG.name, Literal("Squirrel in Tree")))

            graph.commit()

            print("Triples in graph after add:", len(graph))
            print("(should be 2)")

            # display the graph in Turtle
            print(graph.serialize())
        finally:
            # close when done, otherwise BerkeleyDB will leak lock entries.
            graph.close()

        # reopen the graph
        graph = ConjunctiveGraph("BerkeleyDB")
        graph.open(path, create=False)
        try:
            print("Triples still in graph:", len(graph))
            print("(should still be 2)")
        finally:
            graph.close()
def example_2():
    """Load SKOS vocabularies from GitHub into a BerkeleyDB-backed graph.

    The graph is stored in the local folder 'gsq_vocabs', which is not deleted
    afterwards so the DB files can be inspected on disk.

    Should print out the number of triples after each load, e.g.:
        177
        248
        289
        379
        421
        628
        764
        813
        965
        1381
        9666
        9719
        ...

    Returns:
        None when loading completes. If the request for the GitHub tree
        listing fails with an HTTPError, the tuple
        ``(status_code, error_message, None)`` is returned instead.
    """
    import base64
    import json
    from urllib.error import HTTPError
    from urllib.request import Request, urlopen

    g = ConjunctiveGraph("BerkeleyDB")
    g.open("gsq_vocabs", create=True)

    try:
        # The tree URL references a specific commit rather than the moving
        # ".../git/trees/master" listing -- presumably so the triple counts
        # shown in the docstring stay reproducible.
        gsq_vocabs = "https://api.github.com/repos/geological-survey-of-queensland/vocabularies/git/trees/cd7244d39337c1f4ef164b1cf1ea1f540a7277db"
        try:
            res = urlopen(Request(gsq_vocabs, headers={"Accept": "application/json"}))
        except HTTPError as e:
            return e.code, str(e), None

        # Read the listing and release the connection straight away.
        with res:
            data = res.read()
            encoding = res.info().get_content_charset("utf-8")

        j = json.loads(data.decode(encoding))
        for v in j["tree"]:
            # process the element in GitHub result if it's a Turtle file
            if v["path"].endswith(".ttl"):
                # for each file, call it by URL, decode it and parse it into the graph
                with urlopen(v["url"]) as r:
                    content = json.loads(r.read().decode())["content"]
                g.parse(data=base64.b64decode(content).decode(), format="turtle")
                print(len(g))

        print("loading complete")
    finally:
        # Always close the store (also on the early HTTPError return),
        # otherwise BerkeleyDB will leak lock entries.
        g.close()
# Script entry point: the examples are only run when BerkeleyDB is available;
# otherwise nothing is executed.
if __name__ == "__main__" and has_bsddb:
    example_1()
    example_2()