-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtranslate.py
128 lines (116 loc) · 6.03 KB
/
translate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import datetime
import json
import logging
import os
import pathlib
from glob import glob

try:
    # cElementTree was removed in Python 3.9; only old interpreters still have it.
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET

import xmltodict
from json2html import *


class Translate:
    """Turn IGSN sample/section XML records into HTML pages plus a sitemap.

    Every ``*.xml`` file found below ``xmldir`` is converted into an HTML page
    that renders the record as a table and embeds a JSON-LD description of the
    sample in its ``<head>``.  The pages are written below ``htmldir`` using
    the same sub-directory layout, and their paths are collected in
    ``self.sitemap`` so that ``generate_sitemap`` can list them afterwards.

    ``xmldir`` and ``htmldir`` are resolved relative to the directory that
    contains this module (``self.root_path``).
    """

    # JSON-LD context published for the IGSN registration schema.
    IGSN_CONTEXT = ("https://raw.githubusercontent.com/IGSN/igsn-json/master/"
                    "schema.igsn.org/json/registration/v0.1/context.jsonld")

    # Namespace every sitemap document must declare on its root element.
    SITEMAP_NS = "http://www.sitemaps.org/schemas/sitemap/0.9"

    # (path keyword, resource type URI) pairs, checked in this order.
    _RESOURCE_TYPES = (
        ('samples', 'http://pid.geoscience.gov.au/def/voc/ga/sampletype/borehole_specimen'),
        ('sections', 'http://vocabulary.odm2.org/specimentype/coreSection'),
    )

    def __init__(self, domain, xmldir, htmldir):
        """Store the configuration and locate the module's own directory.

        Args:
            domain: Base URL (scheme, host and optional port) under which the
                generated pages are served; used for the sitemap URLs.
            xmldir: Directory holding the source XML files.
            htmldir: Directory that receives the generated HTML files.
        """
        self.domain = domain
        self.xml_dir = xmldir
        self.html_dir = htmldir
        # Paths of the generated pages, relative to ``root_path``.
        self.sitemap = []
        self.root_path = os.path.dirname(os.path.abspath(__file__))
        self.root_folder = os.path.basename(self.root_path)
        self.logger = logging.getLogger(self.__class__.__name__)

    def parseXMLs(self):
        """Convert every supported XML file into an HTML page.

        Files are classified by the keywords ``cores``, ``samples`` and
        ``sections`` appearing in their path below the configured XML
        directory.  Core records and unrecognised files are skipped.  Each
        generated page is appended to ``self.sitemap``.
        """
        xml_root = os.path.join(self.root_path, self.xml_dir)
        html_root = os.path.join(self.root_path, self.html_dir)
        # os.walk yields directories in arbitrary order; sort so that the
        # pages and the sitemap entries come out the same way on every run.
        xml_files = sorted(
            found
            for dirpath, _dirnames, _filenames in os.walk(xml_root)
            for found in glob(os.path.join(dirpath, '*.xml'))
        )
        for xml_file in xml_files:
            rel_path = os.path.relpath(xml_file, xml_root)
            # Classify on the configured directory plus the sub-path only, so
            # the location this project happens to be installed in cannot
            # influence the result.
            resource_type = self._resource_type_for(
                os.path.join(self.xml_dir, rel_path))
            if resource_type is None:
                continue

            # Read bytes: the XML parser must not depend on the locale's
            # default text encoding.
            with open(xml_file, 'rb') as fd:
                raw = xmltodict.parse(fd.read())
            # The JSON round trip turns xmltodict's mappings into plain dicts.
            raw = json.loads(json.dumps(raw))

            json_dict, jsonld = self.convert_sample_section(resource_type, raw)
            if not (json_dict and jsonld):
                continue

            html = self.generate_html(json_dict, jsonld)
            # Mirror the XML sub-path below the HTML directory and swap only
            # the file extension.
            html_path = os.path.splitext(
                os.path.join(html_root, rel_path))[0] + '.html'
            sub_folder = os.path.dirname(html_path)
            if not os.path.exists(sub_folder):
                os.makedirs(sub_folder, exist_ok=True)
                self.logger.debug('Creating html files sub-dir - %s', sub_folder)
            with open(html_path, 'w', encoding='utf-8') as html_file:
                html_file.write(str(html))
            # Relative to the module directory (not the current working
            # directory) because generate_sitemap prefixes ``root_folder``.
            # e.g. pages\357\samples\357_70_C_1_1_5030293.html
            self.sitemap.append(os.path.relpath(html_path, self.root_path))
        self.logger.info('Total html files generated - %s', len(self.sitemap))

    def _resource_type_for(self, path):
        """Return the resource type URI for *path*, or None to skip the file."""
        if 'cores' in path:
            # TODO: core/hole records are not converted yet; their type would
            # be http://vocabulary.odm2.org/specimentype/core
            return None
        for keyword, resource_type in self._RESOURCE_TYPES:
            if keyword in path:
                return resource_type
        return None  # TODO site?

    @staticmethod
    def _as_list(value):
        """Normalise an xmltodict child (None, single item or list) to a list."""
        if value is None:
            return []
        if isinstance(value, list):
            return value
        return [value]

    @staticmethod
    def _node_text(node):
        """Return the text of an xmltodict node.

        xmltodict represents an element with attributes as a dict whose text
        is stored under ``#text``, and an element without attributes as a
        plain string.
        """
        if isinstance(node, dict):
            return node.get('#text')
        if isinstance(node, str):
            return node
        return None

    def convert_core_hole(self, raw):
        """Placeholder for converting a core/hole record (not implemented).

        The record is expected to hold a ``core`` element (with ``@ID``,
        ``coreDetails`` and a ``hole`` child carrying its own ``@ID``), but no
        JSON-LD mapping has been defined yet.

        Returns:
            Always ``(None, None)``, which callers treat as "nothing to write".
        """
        # TODO jsonld
        return None, None

    def convert_sample_section(self, resource_type, raw):
        """Build the JSON-LD description of a sample or section record.

        Args:
            resource_type: URI stored as the record's ``additionalType``.
            raw: Parsed XML document as a dict with a top-level ``sample`` key.

        Returns:
            ``(sample, jsonld)`` where ``sample`` is the record dict taken from
            *raw* and ``jsonld`` the JSON-LD dict, or ``(None, None)`` when the
            document has no ``sample`` element or no sample number.
            ``jsonld["log"]`` is a dict for a single log element, a list of
            dicts for several, and is absent when the record has none.
        """
        sample = raw.get('sample') if isinstance(raw, dict) else None
        if not isinstance(sample, dict):
            self.logger.warning('No sample element found; record skipped')
            return None, None

        igsn = self._node_text(sample.get('sampleNumber'))
        if not igsn:
            self.logger.warning('Sample has no sampleNumber; record skipped')
            return None, None

        registrant = sample.get('registrant')
        if not isinstance(registrant, dict):
            registrant = {}

        jsonld = {
            "@context": self.IGSN_CONTEXT,
            "@id": 'http://igsn.org/' + igsn,
            "@type": "Sample",
            "additionalType": resource_type,
            "igsn": 'http://hdl.handle.net/' + igsn,
            "registrant": {"name": registrant.get('registrantName')},
        }

        # A single relation arrives as one dict, several as a list, e.g.
        # {'@relatedIdentifierType': 'handle', '@relationType': 'IsPartOf',
        #  '#text': '10273/IBCR0357ESJ0001'}
        related_block = sample.get('relatedResourceIdentifiers')
        identifiers = (related_block.get('relatedIdentifier')
                       if isinstance(related_block, dict) else None)
        related = []
        for item in self._as_list(identifiers):
            if not isinstance(item, dict):
                # Element without attributes: only its text is available.
                item = {'#text': item}
            related.append({
                "identifier": {
                    "id": item.get('#text'),
                    "kind": item.get('@relatedIdentifierType'),
                },
                "relationship": item.get('@relationType'),
            })
        jsonld["related"] = related

        log_block = sample.get('log')
        log_elements = (log_block.get('logElement')
                        if isinstance(log_block, dict) else None)
        log_entries = [
            {
                "type": entry.get('@event'),
                "timestamp": entry.get('@timeStamp'),
                "comment": entry.get('@comment'),
            }
            for entry in self._as_list(log_elements)
            if isinstance(entry, dict)
        ]
        if len(log_entries) == 1:
            jsonld["log"] = log_entries[0]
        elif log_entries:
            jsonld["log"] = log_entries

        return sample, jsonld

    def generate_html(self, json_dict, jsonld):
        """Render a record as an HTML page with embedded JSON-LD.

        Args:
            json_dict: Record data rendered as an HTML table in the body.
            jsonld: JSON-LD description placed in a ``<script>`` tag in the
                head.

        Returns:
            The complete HTML document as a string.
        """
        # Escape "<" so that record text such as "</script>" cannot terminate
        # the script element early; "\u003c" is an equivalent JSON escape.
        payload = json.dumps(jsonld).replace('<', '\\u003c')
        script = '<script type="application/ld+json">{}</script>'.format(payload)
        table = json2html.convert(json=json_dict)
        # The page is written as UTF-8, so declare that to the browser.
        head = '<html><head><meta charset="utf-8">' + script + '</head>'
        body = '<body>' + table + '</body></html>'
        return head + body

    def generate_sitemap(self, output_path='sitemap.xml'):
        """Write a sitemap listing every page recorded in ``self.sitemap``.

        Args:
            output_path: File to write; defaults to ``sitemap.xml`` in the
                current working directory.
        """
        root = ET.Element('urlset', {'xmlns': self.SITEMAP_NS})
        last_modified = datetime.datetime.now().strftime("%Y-%m-%d")
        base_url = self.domain.rstrip('/')
        for site in self.sitemap:
            # Paths recorded on Windows use backslashes; URLs need slashes.
            page = site.replace('\\', '/')
            # Join with exactly one slash between the parts, whether or not
            # the configured domain ends with one.
            page_url = '/'.join(
                part for part in (base_url, self.root_folder, page) if part)
            url = ET.SubElement(root, "url")
            ET.SubElement(url, "loc").text = page_url
            ET.SubElement(url, "lastmod").text = last_modified
        tree = ET.ElementTree(root)
        tree.write(output_path, encoding='utf-8', xml_declaration=True)