Showing 10 changed files with 526 additions and 0 deletions.
@@ -0,0 +1,32 @@
import fs from 'fs'
import https from 'https'

fs.readJsonSync = path => JSON.parse(fs.readFileSync(path))

const folderPath = 'sources/metadata'
const { sources } = fs.readJsonSync('sources.json')

// Download one metadata record, following a single 302 redirect if necessary
const download = source => new Promise((resolve, reject) => {
  const outputPath = folderPath + '/' + source.split('/').at(-1) + '.xml'
  const file = fs.createWriteStream(outputPath)

  https.get(source, res => {
    if (res.statusCode === 302) {
      https.get(res.headers.location, res => {
        if (res.statusCode === 200) res.pipe(file).on('close', resolve)
        else reject(res.statusCode)
        res.resume()
      })
    } else reject(res.statusCode)
    res.resume()
  })
})

// Create the output folder hierarchy one level at a time
folderPath.split('/').forEach((_, index, array) => {
  let folder = array.slice(0, index + 1).join('/')
  if (!fs.existsSync(folder)) fs.mkdirSync(folder)
})

sources.forEach(async (source, index) => {
  await download(source)
  console.log(index + 1, '/', sources.length, 'downloaded')
})
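For context, sources.json is read here as a plain object with a top-level sources array of metadata record URLs; the last path segment of each URL is reused as the local file name. A minimal sketch of that file, with a placeholder URL rather than a real one:

{
  "sources": [
    "https://example.org/csw/records/some-dataset-id"
  ]
}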
@@ -0,0 +1,69 @@
import fs from 'fs'
import Parser from 'xml2js'
import https from 'https'

fs.readJsonSync = path => JSON.parse(fs.readFileSync(path))

const folderPath = 'sources/adjusted'
const { sources } = fs.readJsonSync('sources.json')

// Walk down the xml2js structure, unwrapping the one-element array at every key
function get(o, ...keys) {
  keys.forEach(key => o = o[key][0])
  return o
}

folderPath.split('/').forEach((_, index, array) => {
  let folder = array.slice(0, index + 1).join('/')
  if (!fs.existsSync(folder)) fs.mkdirSync(folder)
})

// Fetch the INSPIRE theme registry entry and resolve with its annex
const annex = theme => new Promise((resolve, reject) => {
  https.get(theme + '/' + theme.split('/').at(-1) + '.en.json', (res) => {
    if (res.statusCode !== 200) return reject(res.statusMessage)
    let body = []

    res.on('data', data => body.push(data.toString()))
    res.on('end', () => resolve(JSON.parse(body.join('')).theme.annex))
  })
})

sources.forEach(source => {
  Parser.parseString(fs.readFileSync('sources/metadata/' + source.split('/').at(-1) + '.xml'), (err, data) => {
    if (err) return console.error(err)
    let metadata = data['gmd:MD_Metadata']

    let citation = get(metadata, 'gmd:identificationInfo', 'gmd:MD_DataIdentification', 'gmd:citation', 'gmd:CI_Citation')
    let title = get(citation, 'gmd:title', 'gco:CharacterString')
    let doi = get(citation, 'gmd:identifier', 'gmd:MD_Identifier', 'gmd:code', 'gmx:Anchor')['$']['xlink:href']

    let identificationInfo = get(metadata, 'gmd:identificationInfo', 'gmd:MD_DataIdentification')
    let descriptiveKeywords = identificationInfo['gmd:descriptiveKeywords']
      .map(keyword => get(keyword, 'gmd:MD_Keywords', 'gmd:keyword'))
      .filter(keyword => keyword['gmx:Anchor'])
      .map(keyword => get(keyword, 'gmx:Anchor')['$']['xlink:href'])

    let resourceConstraints = identificationInfo['gmd:resourceConstraints']
      .map(keyword => get(keyword, 'gmd:MD_LegalConstraints', 'gmd:otherConstraints', 'gmx:Anchor')['$']['xlink:href'])

    let theme = descriptiveKeywords.find(keyword => keyword.startsWith('http://inspire.ec.europa.eu/theme/'))
    // manually verified that all links support https
    theme = theme.replace('http', 'https')

    let onLines = get(metadata, 'gmd:distributionInfo', 'gmd:MD_Distribution', 'gmd:transferOptions', 'gmd:MD_DigitalTransferOptions')['gmd:onLine']
      .map(onLine => get(onLine, 'gmd:CI_OnlineResource', 'gmd:linkage', 'gmd:URL'))

    let desc = onLines.find(onLine => onLine.endsWith('.pdf') && onLine.includes('gpkg'))
    let gpkg = onLines.find(onLine => onLine.endsWith('.gpkg'))

    annex(theme).then(annex => {
      fs.writeFileSync(folderPath + '/' + source.split('/').at(-1) + '.json', JSON.stringify({
        title, doi,
        theme,
        annex,
        desc,
        gpkg,
        resourceConstraints,
      }, null, 2))
    })
  })
})
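The get helper above exists because xml2js, with its default options, wraps every child element in a one-element array. A minimal sketch of that unwrapping, using simplified element names rather than the real gmd/gco tags:

import Parser from 'xml2js'

Parser.parseString('<citation><title><CharacterString>Example</CharacterString></title></citation>', (err, data) => {
  // data.citation is { title: [ { CharacterString: [ 'Example' ] } ] },
  // so get(data.citation, 'title', 'CharacterString') returns 'Example'
  console.log(data.citation.title[0].CharacterString[0])
})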
@@ -0,0 +1,28 @@
import fs from 'fs'
import https from 'https'

fs.readJsonSync = path => JSON.parse(fs.readFileSync(path))

const folderPath = 'sources/descriptions'
const { sources } = fs.readJsonSync('sources.json')

const download = url => new Promise((resolve, reject) => {
  const outputPath = folderPath + '/' + url.split('/').at(-1)
  const file = fs.createWriteStream(outputPath)

  https.get(url, res => {
    if (res.statusCode === 200) res.pipe(file).on('close', resolve)
    else reject(res.statusCode)
    res.resume()
  })
})

folderPath.split('/').forEach((_, index, array) => {
  let folder = array.slice(0, index + 1).join('/')
  if (!fs.existsSync(folder)) fs.mkdirSync(folder)
})

sources.forEach(source => {
  let metadata = fs.readJsonSync('sources/adjusted/' + source.split('/').at(-1) + '.json')
  download(metadata.desc)
})
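The description downloads above are started without being awaited, so failures and completion go unreported. If completion logging were wanted, one possible variant (not part of the original script) is to collect the promises:

// possible variant: wait for all description downloads to finish
Promise.all(sources.map(source => {
  let metadata = fs.readJsonSync('sources/adjusted/' + source.split('/').at(-1) + '.json')
  return download(metadata.desc)
})).then(() => console.log(sources.length, 'descriptions downloaded'))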
@@ -0,0 +1,131 @@
import fs from 'fs'
import PDF2Tree from 'pdf2tree'

fs.readJsonSync = path => JSON.parse(fs.readFileSync(path))

const folderPath = 'out'
const { sources } = fs.readJsonSync('sources.json')

// Recursively pull the decoded text strings out of a node's Texts runs
function decode(o) {
  if (Array.isArray(o)) return o.map(decode)
  if (o?.Texts) return o.Texts.map(t => decodeURIComponent(t.R[0].T))
  return o
}

// Join the text runs of a cell into one string, collapsing doubled spaces and doubled quotes
function join(a) {
  return a.join(' ').replaceAll('  ', ' ').replaceAll('""', '"')
}

function decjoin(o) {
  return decode(o).map(join)
}

// First two cell entries as { value, description }
function vd(a) {
  a = decjoin(a)
  return {
    value: a[0],
    description: a[1],
  }
}

// Convert 'DD.MM.YYYY' (after a fixed 7-character prefix) to 'YYYY-MM-DD'
function formatDate(str) {
  let splitted = str.slice(7).replaceAll('. ', '.').split('.')
  splitted.reverse()
  return splitted.join('-')
}

const vda = ([value, description]) => ({ value, description })

function valDesc(row, i) {
  return Array.isArray(row[i]) ? row[i].map(vd) : [vd(row.slice(i))]
}

sources.forEach(async source => {
  let metadata = fs.readJsonSync('sources/adjusted/' + source.split('/').at(-1) + '.json')
  let pdf = metadata.desc.split('/').at(-1)
  console.log(pdf)
  let pdf2tree = new PDF2Tree()
  pdf2tree.maxStrokeWidth = 1
  pdf2tree.maxGapWidth = 0.1
  let json = await pdf2tree.loadPDF('sources/descriptions/' + pdf)
  let tree = json.Tree.slice(2).flat().filter(n => !n.Texts)

  // Tables that continue on the next page are merged into the previous table
  for (let i = 1; i < tree.length; i++) {
    let row = 0, col = 0
    if (isFinite(decode(tree[i][row][col])[0])) continue
    tree.splice(i - 1, 2, tree[i - 1].concat(tree[i]))
  }

  let layers = tree.map(table => {
    let result = {}
    let row0 = decode(table[0][0])
    let row1 = decode(table[1][0])

    let includesRules = row0.length != 4

    result.layer = row0.find(v => v.startsWith('Abgabebeschreibung')).replaceAll('  ', ' ').split(' ')[1]
    let version = row0[1]
    if (includesRules) version = row1[0]
    result.version = formatDate(version)

    let i = 2
    if (includesRules) i = 3

    if (decode(table[i]).length == 6) includesRules = true
    result.datatype = null

    result.columns = table.slice(i).map(row => {
      let drow = decjoin(row)
      let column = {
        column: drow[0].replaceAll(' ', ''),
        description: drow[1],
        datatype: drow[2],
      }
      column.values = []
      column.rules = []
      if (includesRules) {
        let multipleRules = Array.isArray(row[3])
        let valuesNested = Array.isArray(row[4])
        if (multipleRules) {
          if (valuesNested)
            column.values = row[4].map(decjoin).map(vda)

          row[3].map(rule => {
            let features = decode(rule)[0]
            let values = decode(rule)[1]
            if (Array.isArray(rule[1])) {
              values = decode(rule[1]).map(v => v[0][0])
              if (!valuesNested) column.values.push(...rule[1].map(decjoin).map(vda))
            }
            column.rules.push({
              features,
              values,
            })
          })
        } else {
          column.values = valDesc(row, 4)
        }
      } else {
        column.values = valDesc(row, 3)
      }
      return column
    })

    return result
  })

  layers.forEach(layer => layer.datatype = layer.columns[1].values[0].value)

  let { title, theme, annex, resourceConstraints } = metadata

  let data = {
    title,
    theme,
    annex,
    layers,
    sources: [metadata.doi, theme],
    resourceConstraints,
  }
  fs.writeFileSync(folderPath + '/' + pdf.slice(0, -3) + 'json', JSON.stringify(data, null, 2))
})
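As a sketch of what formatDate assumes: the version cell appears to carry a day.month.year date behind a fixed 7-character prefix, with optional spaces after the dots. The steps below are self-contained; the input string and its 'Stand: ' prefix are assumptions for illustration, not text taken from an actual PDF:

let s = 'Stand: 01. 02. 2023'                               // hypothetical cell text
let splitted = s.slice(7).replaceAll('. ', '.').split('.')  // ['01', '02', '2023']
splitted.reverse()
console.log(splitted.join('-'))                             // '2023-02-01'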
@@ -0,0 +1,45 @@
import fs from 'fs'
import sqlite3 from 'sqlite3'

// gpkg files are not automatically downloaded
// you will find the links in sources/adjusted
// then put the files directly in sources/gpkg

fs.readJsonSync = path => JSON.parse(fs.readFileSync(path))

const folderPath = 'out'
const { sources } = fs.readJsonSync('sources.json')

// choose the file you want to validate
let i = 6
let source = sources[i]

let metadata = fs.readJsonSync('sources/adjusted/' + source.split('/').at(-1) + '.json')

let outPath = folderPath + '/' + metadata.desc.split('/').at(-1).slice(0, -4) + '.json'
let desc = fs.readJsonSync(outPath)

let gpkg = metadata.gpkg.split('/').at(-1)

let db = new sqlite3.Database('sources/gpkg/' + gpkg, err => {
  if (err) return console.error(err.message)
  console.log('Connected to db')
  desc.layers.forEach(layer => {
    let rows = []
    db.each(`pragma table_info('${layer.layer}')`, (err, row) => {
      if (err) return console.error(err.message)
      rows.push(row)
    }, () => {
      let names = rows.map(row => row.name)
      let errors = 0
      layer.columns.forEach(column => {
        if (!names.includes(column.column)) {
          errors++
          console.warn('column', column.column, 'in layer', layer.layer, 'missing')
        }
      })
      if (!errors) console.log('no errors in layer', layer.layer)
    })
  })
  db.close()
})
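For reference, pragma table_info('<table>') yields one row per column of the named table; only the name field is used above. A minimal stand-alone sketch that lists the column names of one layer, with the .gpkg file and layer name as hypothetical placeholders:

import sqlite3 from 'sqlite3'

let db = new sqlite3.Database('sources/gpkg/example.gpkg')      // hypothetical file name
db.all(`pragma table_info('example_layer')`, (err, rows) => {   // hypothetical layer name
  if (err) return console.error(err.message)
  // each row has the shape { cid, name, type, notnull, dflt_value, pk }
  console.log(rows.map(row => row.name))
})
db.close()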
@@ -0,0 +1,18 @@
import fs from 'fs'

let descriptions = fs.readdirSync('out').map(file => JSON.parse(fs.readFileSync('out/' + file)))

const sum = a => a.reduce((p, c) => p + c, 0)

let stats = descriptions.map(description => {
  let result = []
  result.push(description.title.split(' - ').at(-1).split(' ')[0])
  result.push(description.layers.length)
  result.push(sum(description.layers.map(layer => layer.columns.length)))
  result.push(sum(description.layers.map(layer => sum(layer.columns.map(column => column.values.length)))))
  result.push(sum(description.layers.map(layer => sum(layer.columns.map(column => column.rules.length)))))
  return result
})

stats.unshift(['Geopackage', 'Number of Layers', 'Number of Columns', 'Number of Values', 'Number of Rules'])
fs.writeFileSync('stats.csv', stats.map(r => r.join(',')).join('\n'))