-
Notifications
You must be signed in to change notification settings - Fork 20
/
process_raw_data.js
87 lines (77 loc) · 2.81 KB
/
process_raw_data.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import {
xlsx,
readXLSX,
writeCSV,
readCSV,
readJSON
} from "https://deno.land/x/[email protected]/mod.ts";
import { parse as parseYaml } from "https://deno.land/[email protected]/encoding/yaml.ts";
import { parse } from "https://deno.land/[email protected]/encoding/csv.ts";
import { unZipFromFile } from "https://deno.land/x/[email protected]/mod.ts";
import { walk, emptyDir } from "https://deno.land/[email protected]/fs/mod.ts";
import { gunzipFile } from "https://deno.land/x/[email protected]/mod.ts";
/*
 * Converts one raw data file into `processed/<name>.csv`.
 *
 * The first CLI argument is the path of the raw file. Its base name (the text
 * before the first ".") is looked up in the `name` column of
 * `data_manifest.csv`; the matching row supplies `format` and, for Excel
 * files, the sheet / top-row / country-column / year-column settings.
 * Files that have no manifest row are skipped.
 */

// Extensions recognised inside a zip archive. Matched against the end of the
// member path so that multi-part suffixes such as ".csv.gz" can be detected.
const ARCHIVE_MEMBER_EXTENSIONS = ["csv.gz", "xlsx", "csv", "tsv", "yaml", "json"];

let inputFilename = Deno.args[0];
if (!inputFilename) {
  console.error("Usage: process_raw_data.js <path/to/raw-file>");
  Deno.exit(1);
}

// Take the last path segment so "raw/foo.csv", "./raw/foo.csv" and "foo.csv"
// all resolve to the manifest name "foo".
const inputFilenameRoot = inputFilename.split("/").pop().split(".")[0];
const outputFilename = `processed/${inputFilenameRoot}.csv`;

const manifest = await readCSV("data_manifest.csv");
const config = manifest.find((d) => d.name === inputFilenameRoot);
// No manifest entry for this file: nothing to do.
if (!config) Deno.exit();
console.log("config", config);

let fileType = config.format;
let fileContents;

try {
  if (fileType === "zip") {
    // Extract into a scratch directory and process the first supported member.
    await emptyDir("./tmp");
    await unZipFromFile(inputFilename, "./tmp");
    for await (const file of walk("./tmp")) {
      const extension = ARCHIVE_MEMBER_EXTENSIONS.find((ext) =>
        file.path.endsWith(`.${ext}`)
      );
      if (extension) {
        fileType = extension;
        inputFilename = file.path;
        break;
      }
    }
  }

  if (fileType === "csv") {
    fileContents = await readCSV(inputFilename);
  } else if (fileType === "tsv") {
    fileContents = await readCSV(inputFilename, {
      separator: "\t",
    });
  } else if (fileType === "csv.gz") {
    // Decompressed straight to the output path. `fileContents` stays unset so
    // the CSV writer at the bottom is skipped, and (unlike an early
    // Deno.exit()) the ./tmp cleanup below still runs.
    await gunzipFile(inputFilename, outputFilename);
  } else if (fileType === "json") {
    fileContents = await readJSON(inputFilename);
  } else if (fileType === "yaml") {
    const yaml = await Deno.readFile(inputFilename);
    const decoder = new TextDecoder("utf-8");
    fileContents = await parseYaml(decoder.decode(yaml));
  } else if (fileType === "xlsx") {
    const workbook = await readXLSX(inputFilename);
    // Manifest numbers are 1-based; convert to 0-based indexes.
    const sheetData =
      workbook.Sheets[workbook.SheetNames[config["Excel sheet #"] - 1]];
    const csvString = await xlsx.utils.sheet_to_csv(sheetData); // can use to_json, to_txt, to_html, to_formulae
    const rows = csvString.split("\n");
    // Drop any rows above the header row (defaults to the first row).
    const filteredRows = rows.slice((config["Excel sheet top row #"] || 1) - 1);
    const data = await parse(filteredRows.join("\n"), { skipFirstRow: true });
    // NOTE(review): a plain split on "," mis-splits header cells that contain
    // quoted commas — confirm headers never do.
    const columns = filteredRows[0].split(",");
    const countryKey = config["Country Column #"]
      ? columns[config["Country Column #"] - 1]
      : "";
    const yearKey = config["Year Column #"]
      ? columns[config["Year Column #"] - 1]
      : "";
    // Copy the configured country/year columns into fixed-name fields.
    fileContents = data.map((d) => ({
      ...d,
      data_country: d[countryKey],
      data_year: d[yearKey],
    }));
  }
} catch (e) {
  // Best-effort: report the failure and fall through without writing output.
  console.error(`Failed to process ${inputFilename}:`, e);
} finally {
  // Always clear the scratch directory, whichever branch ran.
  await emptyDir("./tmp");
}

if (fileContents) await writeCSV(outputFilename, fileContents);