-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpost-process.ts
91 lines (59 loc) · 2.3 KB
/
post-process.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
"use strict";
import path from 'path';
import fs from 'fs/promises';
import util from 'util';
import { protos } from '@google-cloud/vision';
import { isDefined } from './common';
// Lift util.inspect's default depth limit so that nested objects are rendered
// in full rather than being abbreviated past a fixed nesting level.
util.inspect.defaultOptions.depth = Infinity;
/**
 * Converts Cloud Vision OCR result files into one plain-text file per page.
 *
 * Every `*.json` entry found directly in `ocrOutputDir` is parsed as an
 * `IAnnotateFileResponse`. For each element of its `responses` array, the
 * value of `fullTextAnnotation.text` is written to `<pagesDir>/page-NNNN.txt`,
 * where `NNNN` is `context.pageNumber` left-padded with zeros to four digits.
 * A response without text is reported and produces an empty file.
 * `pagesDir` is created (including missing parent directories) when needed.
 *
 * @param ocrOutputDir - directory containing the OCR `.json` result files
 * @param pagesDir - directory that receives the per-page `.txt` files
 * @throws Error when a file has no `responses` field or when a response has
 *   no `context.pageNumber`. File-system and JSON parsing errors propagate
 *   unchanged.
 */
const run = async (ocrOutputDir: string, pagesDir: string): Promise<void> => {
  console.log(`ocrOutputDir = ${ocrOutputDir}`);
  console.log(`pagesDir = ${pagesDir}`);

  const pageFileNameBase = path.join(pagesDir, 'page-');
  // Zero-pad the page number so output files sort lexicographically by page.
  const getPageFileName = (pageNumber: number) =>
    `${pageFileNameBase}${pageNumber.toString().padStart(4, '0')}.txt`;

  const files = await fs.readdir(ocrOutputDir);
  const jsonFiles = files.filter(name => name.endsWith('.json'));

  // ensure output dir exists or create it (including any parent dirs if needed)
  await fs.mkdir(pagesDir, { recursive: true });

  for (const file of jsonFiles) {
    console.log(`processing ${file}`);

    const dataString = await fs.readFile(path.join(ocrOutputDir, file), {
      encoding: 'utf-8',
    });
    const data: protos.google.cloud.vision.v1.IAnnotateFileResponse = JSON.parse(dataString);

    if (!isDefined(data.responses)) {
      throw new Error(`${file}: responses field not defined`);
    }
    console.log(` - total responses length = ${data.responses.length}`);

    // entries() yields index and element together, so no unchecked
    // `responses[i]` lookup is needed; the index is kept for diagnostics.
    for (const [i, response] of data.responses.entries()) {
      const pageNumber = response.context?.pageNumber;
      const text = response.fullTextAnnotation?.text;

      if (!isDefined(pageNumber)) {
        throw new Error(`${file}: responses[${i}].context.pageNumber is undefined`);
      }
      if (!isDefined(text)) {
        // Not fatal: a page without text still gets an (empty) output file.
        console.log(` > an empty page ${pageNumber} is detected (${file}: responses[${i}].fullTextAnnotation.text is undefined)`);
      }

      const outputFile = getPageFileName(pageNumber);
      await fs.writeFile(outputFile, text ?? '');
      console.log(` > written ${outputFile}`);
    }
  }

  console.log(`finished`);
};
// Command-line entry point.
// process.argv layout:
//   [0] - path to node (Node.js interpreter)
//   [1] - path to script
//   [2] - ocrOutputDir
//   [3] - pagesDir
const [, , ocrOutputDirArg, pagesDirArg] = process.argv;

// Both positional arguments are required; print usage and stop otherwise.
if (!(isDefined(ocrOutputDirArg) && isDefined(pagesDirArg))) {
  console.error('usage: {ocrOutputDir} {pagesDir}');
  process.exit(1);
}

// Run the conversion and translate its outcome into the process exit code.
void (async () => {
  try {
    await run(ocrOutputDirArg, pagesDirArg);
    console.log('script finished');
    process.exit(0);
  } catch (err) {
    console.error('an error occurred while running script', err);
    process.exit(1);
  }
})();