Commit 650027f

Added project files

jogemu authored May 14, 2023
1 parent 02fbc99 commit 650027f
Showing 10 changed files with 526 additions and 0 deletions.
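The scripts are numbered to run in sequence. A plausible invocation, assuming Node.js with the xml2js, pdf2tree and sqlite3 packages installed, would be:

node 00fetchMetadata.js
node 01adjustMetadata.js
node 02fetchDescriptions.js
node 03generate.js   # requires the PDFs fetched in step 02
node 04validate.js   # requires manually downloaded .gpkg files (see its header comment)
node 05stats.js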
32 changes: 32 additions & 0 deletions 00fetchMetadata.js
@@ -0,0 +1,32 @@
import fs from 'fs'
import https from 'https'

// Convenience helper: read and parse a JSON file synchronously.
fs.readJsonSync = path => JSON.parse(fs.readFileSync(path))

const folderPath = 'sources/metadata'
const { sources } = fs.readJsonSync('sources.json')

// Download one metadata record to sources/metadata/<name>.xml,
// following a single 302 redirect if the server issues one.
const download = source => new Promise((resolve, reject) => {
  const outputPath = folderPath + '/' + source.split('/').at(-1) + '.xml'
  const file = fs.createWriteStream(outputPath)

  https.get(source, res => {
    if (res.statusCode === 302) https.get(res.headers.location, res => {
      if (res.statusCode === 200) res.pipe(file).on('close', resolve)
      else reject(res.statusCode)
      res.resume()
    })
    else reject(res.statusCode)
    res.resume()
  })
})

// Create each segment of the output path that does not exist yet.
folderPath.split('/').forEach((_, index, array) => {
  let folder = array.slice(0, index + 1).join('/')
  if (!fs.existsSync(folder)) fs.mkdirSync(folder)
})

// forEach does not await across iterations, so all downloads start
// concurrently; the logged number is the source's index, not a completion count.
sources.forEach(async (source, index) => {
  await download(source)
  console.log(index + 1, '/', sources.length, 'downloaded')
})
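
All scripts read the same sources.json (not shown above). Judging from the code, it must look roughly like this, with placeholder URLs standing in for real records:

{
  "sources": [
    "https://example.org/records/dataset-a",
    "https://example.org/records/dataset-b"
  ]
}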
69 changes: 69 additions & 0 deletions 01adjustMetadata.js
@@ -0,0 +1,69 @@
import fs from 'fs'
import Parser from 'xml2js'
import https from 'https'

fs.readJsonSync = path => JSON.parse(fs.readFileSync(path))

const folderPath = 'sources/adjusted'
const { sources } = fs.readJsonSync('sources.json')

// Walk nested xml2js output: each key holds an array, take its first element.
function get(o, ...keys) {
  keys.forEach(key => o = o[key][0])
  return o
}

folderPath.split('/').forEach((_, index, array) => {
  let folder = array.slice(0, index + 1).join('/')
  if (!fs.existsSync(folder)) fs.mkdirSync(folder)
})

// Fetch the theme's English JSON from the registry and resolve its annex.
const annex = theme => new Promise((resolve, reject) => {
  https.get(theme + '/' + theme.split('/').at(-1) + '.en.json', (res) => {
    if (res.statusCode !== 200) return reject(res.statusMessage)
    let body = []

    res.on('data', data => body.push(data.toString()))
    // Concatenate the chunks without separators before parsing.
    res.on('end', () => resolve(JSON.parse(body.join('')).theme.annex))
  });
})

sources.forEach(source => {
  Parser.parseString(fs.readFileSync('sources/metadata/' + source.split('/').at(-1) + '.xml'), (err, data) => {
    if (err) return console.error(err)
    let metadata = data['gmd:MD_Metadata']

    let citation = get(metadata, 'gmd:identificationInfo', 'gmd:MD_DataIdentification', 'gmd:citation', 'gmd:CI_Citation')
    let title = get(citation, 'gmd:title', 'gco:CharacterString')
    let doi = get(citation, 'gmd:identifier', 'gmd:MD_Identifier', 'gmd:code', 'gmx:Anchor')['$']['xlink:href']

    let identificationInfo = get(metadata, 'gmd:identificationInfo', 'gmd:MD_DataIdentification')
    let descriptiveKeywords = identificationInfo['gmd:descriptiveKeywords']
      .map(keyword => get(keyword, 'gmd:MD_Keywords', 'gmd:keyword'))
      .filter(keyword => keyword['gmx:Anchor'])
      .map(keyword => get(keyword, 'gmx:Anchor')['$']['xlink:href'])

    let resourceConstraints = identificationInfo['gmd:resourceConstraints']
      .map(keyword => get(keyword, 'gmd:MD_LegalConstraints', 'gmd:otherConstraints', 'gmx:Anchor')['$']['xlink:href'])

    let theme = descriptiveKeywords.find(keyword => keyword.startsWith('http://inspire.ec.europa.eu/theme/'))
    // manually verified that all links support https
    theme = theme.replace('http://', 'https://')

    let onLines = get(metadata, 'gmd:distributionInfo', 'gmd:MD_Distribution', 'gmd:transferOptions', 'gmd:MD_DigitalTransferOptions')['gmd:onLine']
      .map(onLine => get(onLine, 'gmd:CI_OnlineResource', 'gmd:linkage', 'gmd:URL'))

    let desc = onLines.find(onLine => onLine.endsWith('.pdf') && onLine.includes('gpkg'))
    let gpkg = onLines.find(onLine => onLine.endsWith('.gpkg'))

    annex(theme).then(annex => {
      fs.writeFileSync(folderPath + '/' + source.split('/').at(-1) + '.json', JSON.stringify({
        title, doi,
        theme,
        annex,
        desc,
        gpkg,
        resourceConstraints,
      }, null, 2))
    })
  })
})
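
Each adjusted file written above therefore has the following shape; the values here are illustrative placeholders derived from the keys in the code, not a real record:

{
  "title": "…",
  "doi": "https://doi.org/…",
  "theme": "https://inspire.ec.europa.eu/theme/…",
  "annex": "I",
  "desc": "https://…/….pdf",
  "gpkg": "https://…/….gpkg",
  "resourceConstraints": ["https://…"]
}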
28 changes: 28 additions & 0 deletions 02fetchDescriptions.js
@@ -0,0 +1,28 @@
import fs from 'fs'
import https from 'https'

fs.readJsonSync = path => JSON.parse(fs.readFileSync(path))

const folderPath = 'sources/descriptions'
const { sources } = fs.readJsonSync('sources.json')

// Download one PDF description; unlike 00fetchMetadata.js, no redirect handling.
const download = url => new Promise((resolve, reject) => {
  const outputPath = folderPath + '/' + url.split('/').at(-1)
  const file = fs.createWriteStream(outputPath)

  https.get(url, res => {
    if (res.statusCode === 200) res.pipe(file).on('close', resolve)
    else reject(res.statusCode)
    res.resume()
  })
})

folderPath.split('/').forEach((_, index, array) => {
  let folder = array.slice(0, index + 1).join('/')
  if (!fs.existsSync(folder)) fs.mkdirSync(folder)
})

sources.forEach(source => {
  let metadata = fs.readJsonSync('sources/adjusted/' + source.split('/').at(-1) + '.json')
  // Log non-200 responses instead of leaving the rejection unhandled.
  download(metadata.desc).catch(console.error)
})
131 changes: 131 additions & 0 deletions 03generate.js
@@ -0,0 +1,131 @@
import fs from 'fs'
import PDF2Tree from 'pdf2tree'

fs.readJsonSync = path => JSON.parse(fs.readFileSync(path))

const folderPath = 'out'
const { sources } = fs.readJsonSync('sources.json')

// Ensure the output folder exists.
if (!fs.existsSync(folderPath)) fs.mkdirSync(folderPath)

// Recursively extract the URL-encoded text runs from a pdf2tree node.
function decode(o) {
  if (Array.isArray(o)) return o.map(decode)
  if (o?.Texts) return o.Texts.map(t => decodeURIComponent(t.R[0].T))
  return o
}

// Join the text fragments of one cell; normalize non-breaking spaces
// (common in extracted PDF text) and doubled quotes.
function join(a) {
  return a.join(' ').replaceAll('\u00a0', ' ').replaceAll('""', '"')
}

function decjoin(o) {
  return decode(o).map(join)
}

// Turn a decoded row tail into a { value, description } pair.
function vd(a) {
  a = decjoin(a)
  return {
    value: a[0],
    description: a[1],
  }
}

// Convert a date cell like 'Stand: 14.05.2023' (7-character prefix assumed)
// from DD.MM.YYYY to YYYY-MM-DD.
function formatDate(str) {
  let splitted = str.slice(7).replaceAll('. ', '.').split('.')
  splitted.reverse()
  return splitted.join('-')
}

// Like vd, but for an already-decoded [value, description] pair.
const vda = ([value, description]) => ({ value, description })

// Values either sit in a nested array at column i or span the rest of the row.
function valDesc(row, i) {
  return Array.isArray(row[i]) ? row[i].map(vd) : [vd(row.slice(i))]
}

sources.forEach(async source => {
  let metadata = fs.readJsonSync('sources/adjusted/' + source.split('/').at(-1) + '.json')
  let pdf = metadata.desc.split('/').at(-1)
  console.log(pdf)
  let pdf2tree = new PDF2Tree()
  pdf2tree.maxStrokeWidth = 1
  pdf2tree.maxGapWidth = 0.1
  let json = await pdf2tree.loadPDF('sources/descriptions/' + pdf)
  // Drop the first two nodes, flatten the pages and keep only tables.
  let tree = json.Tree.slice(2).flat().filter(n => !n.Texts)

  // Merge tables that continue on the next page (their first cell is not
  // numeric). Re-check the same index after a merge so tables spanning
  // more than two pages are merged completely.
  for (let i = 1; i < tree.length; i++) {
    let row = 0, col = 0
    if (isFinite(decode(tree[i][row][col])[0])) continue
    tree.splice(i - 1, 2, tree[i - 1].concat(tree[i]))
    i--
  }

  let layers = tree.map(table => {
    let result = {}
    let row0 = decode(table[0][0])
    let row1 = decode(table[1][0])

    // A four-cell header row means the table has no rules column.
    let includesRules = row0.length != 4

    // The header cell reads 'Abgabebeschreibung <layer name>'.
    result.layer = row0.find(v => v.startsWith('Abgabebeschreibung')).replaceAll('\u00a0', ' ').split(' ')[1]
    let version = row0[1]
    if (includesRules) version = row1[0]
    result.version = formatDate(version)

    // Column rows start after the header block.
    let i = 2
    if (includesRules) i = 3

    // Some tables only reveal the rules column in their body rows.
    if (decode(table[i]).length == 6) includesRules = true
    result.datatype = null // filled in below from the columns

    result.columns = table.slice(i).map(row => {
      let drow = decjoin(row)
      let column = {
        column: drow[0].replaceAll(' ', ''), // names may be wrapped in the PDF
        description: drow[1],
        datatype: drow[2],
      }
      column.values = []
      column.rules = []
      if (includesRules) {
        let multipleRules = Array.isArray(row[3])
        let valuesNested = Array.isArray(row[4])
        if (multipleRules) {
          if (valuesNested)
            column.values = row[4].map(decjoin).map(vda)

          row[3].map(rule => {
            let features = decode(rule)[0]
            let values = decode(rule)[1]
            if (Array.isArray(rule[1])) {
              values = decode(rule[1]).map(v => v[0][0])
              if (!valuesNested) column.values.push(...rule[1].map(decjoin).map(vda))
            }
            column.rules.push({
              features,
              values,
            })
          })
        } else {
          column.values = valDesc(row, 4)
        }
      } else {
        column.values = valDesc(row, 3)
      }
      return column
    })

    return result
  })

  // A layer's datatype is the first value of its second column.
  layers.forEach(layer => layer.datatype = layer.columns[1].values[0].value)

  let { title, theme, annex, resourceConstraints } = metadata

  let data = {
    title,
    theme,
    annex,
    layers,
    sources: [metadata.doi, theme],
    resourceConstraints,
  }
  // Swap the .pdf extension for .json.
  fs.writeFileSync(folderPath + '/' + pdf.slice(0, -3) + 'json', JSON.stringify(data, null, 2))
})
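
Each generated file in out/ then has roughly this structure (keys taken from the code above, values replaced by placeholders):

{
  "title": "…",
  "theme": "https://inspire.ec.europa.eu/theme/…",
  "annex": "I",
  "layers": [{
    "layer": "…",
    "version": "2023-05-14",
    "datatype": "…",
    "columns": [{
      "column": "…",
      "description": "…",
      "datatype": "…",
      "values": [{ "value": "…", "description": "…" }],
      "rules": [{ "features": "…", "values": ["…"] }]
    }]
  }],
  "sources": ["https://doi.org/…", "https://inspire.ec.europa.eu/theme/…"],
  "resourceConstraints": ["…"]
}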
45 changes: 45 additions & 0 deletions 04validate.js
@@ -0,0 +1,45 @@
import fs from 'fs'
import sqlite3 from 'sqlite3'

// The .gpkg files are not downloaded automatically: find the links
// in sources/adjusted and place the files directly in sources/gpkg.

fs.readJsonSync = path => JSON.parse(fs.readFileSync(path))

const folderPath = 'out'
const { sources } = fs.readJsonSync('sources.json')

// Index of the source to validate; edit as needed.
let i = 6
let source = sources[i]

let metadata = fs.readJsonSync('sources/adjusted/' + source.split('/').at(-1) + '.json')

let outPath = folderPath + '/' + metadata.desc.split('/').at(-1).slice(0, -4) + '.json'
let desc = fs.readJsonSync(outPath)

let gpkg = metadata.gpkg.split('/').at(-1)

// A GeoPackage is an SQLite database, so its schema can be inspected directly.
let db = new sqlite3.Database('sources/gpkg/' + gpkg, err => {
  if (err) return console.error(err.message)
  console.log('Connected to db')
  desc.layers.forEach(layer => {
    let rows = []
    // Compare the documented columns against the actual table schema.
    db.each(`pragma table_info('${layer.layer}')`, (err, row) => {
      if (err) return console.error(err.message)
      rows.push(row)
    }, () => {
      let names = rows.map(row => row.name)
      let errors = 0
      layer.columns.forEach(column => {
        if (!names.includes(column.column)) {
          errors++
          console.warn('column', column.column, 'in layer', layer.layer, 'missing')
        }
      })
      if (!errors) console.log('no errors in layer', layer.layer)
    })
  })
  // sqlite3 queues statements, so close() waits for the queries above to finish.
  db.close()
})
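
For reference, db.each yields one object per column of the inspected table, of which only name is used above; a typical pragma table_info row (illustrative values) looks like:

{ cid: 0, name: 'fid', type: 'INTEGER', notnull: 1, dflt_value: null, pk: 1 }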
18 changes: 18 additions & 0 deletions 05stats.js
@@ -0,0 +1,18 @@
import fs from 'fs'

let descriptions = fs.readdirSync('out').map(file => JSON.parse(fs.readFileSync('out/' + file)))

const sum = a => a.reduce((p, c) => p + c, 0)

// One CSV row per description: geopackage name plus layer, column,
// value and rule counts.
let stats = descriptions.map(description => {
  let result = []
  // The geopackage name is the first word after the last ' - ' in the title.
  result.push(description.title.split(' - ').at(-1).split(' ')[0])
  result.push(description.layers.length)
  result.push(sum(description.layers.map(layer => layer.columns.length)))
  result.push(sum(description.layers.map(layer => sum(layer.columns.map(column => column.values.length)))))
  result.push(sum(description.layers.map(layer => sum(layer.columns.map(column => column.rules.length)))))
  return result
})

stats.unshift(['Geopackage', 'Number of Layers', 'Number of Columns', 'Number of Values', 'Number of Rules'])
fs.writeFileSync('stats.csv', stats.map(r => r.join(',')).join('\n'))
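
The resulting stats.csv begins with the header row above; a data row (values invented purely for illustration) would look like:

Geopackage,Number of Layers,Number of Columns,Number of Values,Number of Rules
example,12,340,1050,87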