Commit 650027f

Added project files

jogemu authored May 14, 2023
1 parent 02fbc99 commit 650027f
Showing 10 changed files with 526 additions and 0 deletions.
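The scripts are numbered to run in sequence. A plausible invocation, assuming Node.js with the xml2js, pdf2tree and sqlite3 packages installed, would be:

node 00fetchMetadata.js
node 01adjustMetadata.js
node 02fetchDescriptions.js
node 03generate.js   # requires the PDFs fetched in step 02
node 04validate.js   # requires manually downloaded .gpkg files (see its header comment)
node 05stats.js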
32 changes: 32 additions & 0 deletions 00fetchMetadata.js
@@ -0,0 +1,32 @@
import fs from 'fs'
import https from 'https'

// Convenience helper: read and parse a JSON file synchronously.
fs.readJsonSync = path => JSON.parse(fs.readFileSync(path))

const folderPath = 'sources/metadata'
const { sources } = fs.readJsonSync('sources.json')

// Download one metadata record to sources/metadata/<name>.xml,
// following a single 302 redirect if the server issues one.
const download = source => new Promise((resolve, reject) => {
  const outputPath = folderPath + '/' + source.split('/').at(-1) + '.xml'
  const file = fs.createWriteStream(outputPath)

  https.get(source, res => {
    if (res.statusCode === 302) https.get(res.headers.location, res => {
      if (res.statusCode === 200) res.pipe(file).on('close', resolve)
      else reject(res.statusCode)
      res.resume()
    })
    else reject(res.statusCode)
    res.resume()
  })
})

// Create each segment of the output path that does not exist yet.
folderPath.split('/').forEach((_, index, array) => {
  let folder = array.slice(0, index + 1).join('/')
  if (!fs.existsSync(folder)) fs.mkdirSync(folder)
})

// forEach does not await across iterations, so all downloads start
// concurrently; the logged number is the source's index, not a completion count.
sources.forEach(async (source, index) => {
  await download(source)
  console.log(index + 1, '/', sources.length, 'downloaded')
})
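
All scripts read the same sources.json (not shown above). Judging from the code, it must look roughly like this, with placeholder URLs standing in for real records:

{
  "sources": [
    "https://example.org/records/dataset-a",
    "https://example.org/records/dataset-b"
  ]
}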
69 changes: 69 additions & 0 deletions 01adjustMetadata.js
@@ -0,0 +1,69 @@
import fs from 'fs'
import Parser from 'xml2js'
import https from 'https'

fs.readJsonSync = path => JSON.parse(fs.readFileSync(path))

const folderPath = 'sources/adjusted'
const { sources } = fs.readJsonSync('sources.json')

// Walk nested xml2js output: each key holds an array, take its first element.
function get(o, ...keys) {
  keys.forEach(key => o = o[key][0])
  return o
}

folderPath.split('/').forEach((_, index, array) => {
  let folder = array.slice(0, index + 1).join('/')
  if (!fs.existsSync(folder)) fs.mkdirSync(folder)
})

// Fetch the theme's English JSON from the registry and resolve its annex.
const annex = theme => new Promise((resolve, reject) => {
  https.get(theme + '/' + theme.split('/').at(-1) + '.en.json', (res) => {
    if (res.statusCode !== 200) return reject(res.statusMessage)
    let body = []

    res.on('data', data => body.push(data.toString()))
    // Concatenate the chunks without separators before parsing.
    res.on('end', () => resolve(JSON.parse(body.join('')).theme.annex))
  });
})

sources.forEach(source => {
  Parser.parseString(fs.readFileSync('sources/metadata/' + source.split('/').at(-1) + '.xml'), (err, data) => {
    if (err) return console.error(err)
    let metadata = data['gmd:MD_Metadata']

    let citation = get(metadata, 'gmd:identificationInfo', 'gmd:MD_DataIdentification', 'gmd:citation', 'gmd:CI_Citation')
    let title = get(citation, 'gmd:title', 'gco:CharacterString')
    let doi = get(citation, 'gmd:identifier', 'gmd:MD_Identifier', 'gmd:code', 'gmx:Anchor')['$']['xlink:href']

    let identificationInfo = get(metadata, 'gmd:identificationInfo', 'gmd:MD_DataIdentification')
    let descriptiveKeywords = identificationInfo['gmd:descriptiveKeywords']
      .map(keyword => get(keyword, 'gmd:MD_Keywords', 'gmd:keyword'))
      .filter(keyword => keyword['gmx:Anchor'])
      .map(keyword => get(keyword, 'gmx:Anchor')['$']['xlink:href'])

    let resourceConstraints = identificationInfo['gmd:resourceConstraints']
      .map(keyword => get(keyword, 'gmd:MD_LegalConstraints', 'gmd:otherConstraints', 'gmx:Anchor')['$']['xlink:href'])

    let theme = descriptiveKeywords.find(keyword => keyword.startsWith('http://inspire.ec.europa.eu/theme/'))
    // manually verified that all links support https
    theme = theme.replace('http://', 'https://')

    let onLines = get(metadata, 'gmd:distributionInfo', 'gmd:MD_Distribution', 'gmd:transferOptions', 'gmd:MD_DigitalTransferOptions')['gmd:onLine']
      .map(onLine => get(onLine, 'gmd:CI_OnlineResource', 'gmd:linkage', 'gmd:URL'))

    let desc = onLines.find(onLine => onLine.endsWith('.pdf') && onLine.includes('gpkg'))
    let gpkg = onLines.find(onLine => onLine.endsWith('.gpkg'))

    annex(theme).then(annex => {
      fs.writeFileSync(folderPath + '/' + source.split('/').at(-1) + '.json', JSON.stringify({
        title, doi,
        theme,
        annex,
        desc,
        gpkg,
        resourceConstraints,
      }, null, 2))
    })
  })
})
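
Each adjusted file written above therefore has the following shape; the values here are illustrative placeholders derived from the keys in the code, not a real record:

{
  "title": "…",
  "doi": "https://doi.org/…",
  "theme": "https://inspire.ec.europa.eu/theme/…",
  "annex": "I",
  "desc": "https://…/….pdf",
  "gpkg": "https://…/….gpkg",
  "resourceConstraints": ["https://…"]
}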
28 changes: 28 additions & 0 deletions 02fetchDescriptions.js
@@ -0,0 +1,28 @@
import fs from 'fs'
import https from 'https'

fs.readJsonSync = path => JSON.parse(fs.readFileSync(path))

const folderPath = 'sources/descriptions'
const { sources } = fs.readJsonSync('sources.json')

// Download one PDF description; unlike 00fetchMetadata.js, no redirect handling.
const download = url => new Promise((resolve, reject) => {
  const outputPath = folderPath + '/' + url.split('/').at(-1)
  const file = fs.createWriteStream(outputPath)

  https.get(url, res => {
    if (res.statusCode === 200) res.pipe(file).on('close', resolve)
    else reject(res.statusCode)
    res.resume()
  })
})

folderPath.split('/').forEach((_, index, array) => {
  let folder = array.slice(0, index + 1).join('/')
  if (!fs.existsSync(folder)) fs.mkdirSync(folder)
})

sources.forEach(source => {
  let metadata = fs.readJsonSync('sources/adjusted/' + source.split('/').at(-1) + '.json')
  // Log non-200 responses instead of leaving the rejection unhandled.
  download(metadata.desc).catch(console.error)
})
131 changes: 131 additions & 0 deletions 03generate.js
@@ -0,0 +1,131 @@
import fs from 'fs'
import PDF2Tree from 'pdf2tree'

fs.readJsonSync = path => JSON.parse(fs.readFileSync(path))

const folderPath = 'out'
const { sources } = fs.readJsonSync('sources.json')

// Ensure the output folder exists.
if (!fs.existsSync(folderPath)) fs.mkdirSync(folderPath)

// Recursively extract the URL-encoded text runs from a pdf2tree node.
function decode(o) {
  if (Array.isArray(o)) return o.map(decode)
  if (o?.Texts) return o.Texts.map(t => decodeURIComponent(t.R[0].T))
  return o
}

// Join the text fragments of one cell; normalize non-breaking spaces
// (common in extracted PDF text) and doubled quotes.
function join(a) {
  return a.join(' ').replaceAll('\u00a0', ' ').replaceAll('""', '"')
}

function decjoin(o) {
  return decode(o).map(join)
}

// Turn a decoded row tail into a { value, description } pair.
function vd(a) {
  a = decjoin(a)
  return {
    value: a[0],
    description: a[1],
  }
}

// Convert a date cell like 'Stand: 14.05.2023' (7-character prefix assumed)
// from DD.MM.YYYY to YYYY-MM-DD.
function formatDate(str) {
  let splitted = str.slice(7).replaceAll('. ', '.').split('.')
  splitted.reverse()
  return splitted.join('-')
}

// Like vd, but for an already-decoded [value, description] pair.
const vda = ([value, description]) => ({ value, description })

// Values either sit in a nested array at column i or span the rest of the row.
function valDesc(row, i) {
  return Array.isArray(row[i]) ? row[i].map(vd) : [vd(row.slice(i))]
}

sources.forEach(async source => {
  let metadata = fs.readJsonSync('sources/adjusted/' + source.split('/').at(-1) + '.json')
  let pdf = metadata.desc.split('/').at(-1)
  console.log(pdf)
  let pdf2tree = new PDF2Tree()
  pdf2tree.maxStrokeWidth = 1
  pdf2tree.maxGapWidth = 0.1
  let json = await pdf2tree.loadPDF('sources/descriptions/' + pdf)
  // Drop the first two nodes, flatten the pages and keep only tables.
  let tree = json.Tree.slice(2).flat().filter(n => !n.Texts)

  // Merge tables that continue on the next page (their first cell is not
  // numeric). Re-check the same index after a merge so tables spanning
  // more than two pages are merged completely.
  for (let i = 1; i < tree.length; i++) {
    let row = 0, col = 0
    if (isFinite(decode(tree[i][row][col])[0])) continue
    tree.splice(i - 1, 2, tree[i - 1].concat(tree[i]))
    i--
  }

  let layers = tree.map(table => {
    let result = {}
    let row0 = decode(table[0][0])
    let row1 = decode(table[1][0])

    // A four-cell header row means the table has no rules column.
    let includesRules = row0.length != 4

    // The header cell reads 'Abgabebeschreibung <layer name>'.
    result.layer = row0.find(v => v.startsWith('Abgabebeschreibung')).replaceAll('\u00a0', ' ').split(' ')[1]
    let version = row0[1]
    if (includesRules) version = row1[0]
    result.version = formatDate(version)

    // Column rows start after the header block.
    let i = 2
    if (includesRules) i = 3

    // Some tables only reveal the rules column in their body rows.
    if (decode(table[i]).length == 6) includesRules = true
    result.datatype = null // filled in below from the columns

    result.columns = table.slice(i).map(row => {
      let drow = decjoin(row)
      let column = {
        column: drow[0].replaceAll(' ', ''), // names may be wrapped in the PDF
        description: drow[1],
        datatype: drow[2],
      }
      column.values = []
      column.rules = []
      if (includesRules) {
        let multipleRules = Array.isArray(row[3])
        let valuesNested = Array.isArray(row[4])
        if (multipleRules) {
          if (valuesNested)
            column.values = row[4].map(decjoin).map(vda)

          row[3].map(rule => {
            let features = decode(rule)[0]
            let values = decode(rule)[1]
            if (Array.isArray(rule[1])) {
              values = decode(rule[1]).map(v => v[0][0])
              if (!valuesNested) column.values.push(...rule[1].map(decjoin).map(vda))
            }
            column.rules.push({
              features,
              values,
            })
          })
        } else {
          column.values = valDesc(row, 4)
        }
      } else {
        column.values = valDesc(row, 3)
      }
      return column
    })

    return result
  })

  // A layer's datatype is the first value of its second column.
  layers.forEach(layer => layer.datatype = layer.columns[1].values[0].value)

  let { title, theme, annex, resourceConstraints } = metadata

  let data = {
    title,
    theme,
    annex,
    layers,
    sources: [metadata.doi, theme],
    resourceConstraints,
  }
  // Swap the .pdf extension for .json.
  fs.writeFileSync(folderPath + '/' + pdf.slice(0, -3) + 'json', JSON.stringify(data, null, 2))
})
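
Each generated file in out/ then has roughly this structure (keys taken from the code above, values replaced by placeholders):

{
  "title": "…",
  "theme": "https://inspire.ec.europa.eu/theme/…",
  "annex": "I",
  "layers": [{
    "layer": "…",
    "version": "2023-05-14",
    "datatype": "…",
    "columns": [{
      "column": "…",
      "description": "…",
      "datatype": "…",
      "values": [{ "value": "…", "description": "…" }],
      "rules": [{ "features": "…", "values": ["…"] }]
    }]
  }],
  "sources": ["https://doi.org/…", "https://inspire.ec.europa.eu/theme/…"],
  "resourceConstraints": ["…"]
}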
45 changes: 45 additions & 0 deletions 04validate.js
@@ -0,0 +1,45 @@
import fs from 'fs'
import sqlite3 from 'sqlite3'

// The .gpkg files are not downloaded automatically: find the links
// in sources/adjusted and place the files directly in sources/gpkg.

fs.readJsonSync = path => JSON.parse(fs.readFileSync(path))

const folderPath = 'out'
const { sources } = fs.readJsonSync('sources.json')

// Index of the source to validate; edit as needed.
let i = 6
let source = sources[i]

let metadata = fs.readJsonSync('sources/adjusted/' + source.split('/').at(-1) + '.json')

let outPath = folderPath + '/' + metadata.desc.split('/').at(-1).slice(0, -4) + '.json'
let desc = fs.readJsonSync(outPath)

let gpkg = metadata.gpkg.split('/').at(-1)

// A GeoPackage is an SQLite database, so its schema can be inspected directly.
let db = new sqlite3.Database('sources/gpkg/' + gpkg, err => {
  if (err) return console.error(err.message)
  console.log('Connected to db')
  desc.layers.forEach(layer => {
    let rows = []
    // Compare the documented columns against the actual table schema.
    db.each(`pragma table_info('${layer.layer}')`, (err, row) => {
      if (err) return console.error(err.message)
      rows.push(row)
    }, () => {
      let names = rows.map(row => row.name)
      let errors = 0
      layer.columns.forEach(column => {
        if (!names.includes(column.column)) {
          errors++
          console.warn('column', column.column, 'in layer', layer.layer, 'missing')
        }
      })
      if (!errors) console.log('no errors in layer', layer.layer)
    })
  })
  // sqlite3 queues statements, so close() waits for the queries above to finish.
  db.close()
})
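
For reference, db.each yields one object per column of the inspected table, of which only name is used above; a typical pragma table_info row (illustrative values) looks like:

{ cid: 0, name: 'fid', type: 'INTEGER', notnull: 1, dflt_value: null, pk: 1 }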
18 changes: 18 additions & 0 deletions 05stats.js
@@ -0,0 +1,18 @@
import fs from 'fs'

let descriptions = fs.readdirSync('out').map(file => JSON.parse(fs.readFileSync('out/' + file)))

const sum = a => a.reduce((p, c) => p + c, 0)

// One CSV row per description: geopackage name plus layer, column,
// value and rule counts.
let stats = descriptions.map(description => {
  let result = []
  // The geopackage name is the first word after the last ' - ' in the title.
  result.push(description.title.split(' - ').at(-1).split(' ')[0])
  result.push(description.layers.length)
  result.push(sum(description.layers.map(layer => layer.columns.length)))
  result.push(sum(description.layers.map(layer => sum(layer.columns.map(column => column.values.length)))))
  result.push(sum(description.layers.map(layer => sum(layer.columns.map(column => column.rules.length)))))
  return result
})

stats.unshift(['Geopackage', 'Number of Layers', 'Number of Columns', 'Number of Values', 'Number of Rules'])
fs.writeFileSync('stats.csv', stats.map(r => r.join(',')).join('\n'))
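
The resulting stats.csv begins with the header row above; a data row (values invented purely for illustration) would look like:

Geopackage,Number of Layers,Number of Columns,Number of Values,Number of Rules
example,12,340,1050,87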