-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathprocessCorpora.js
75 lines (64 loc) · 2.09 KB
/
processCorpora.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
const fs = require('fs');
const path = require('path');
const { countFileLines, promptLoop } = require('./helpers');
const { saveStatsToDisk } = require('./processStats');
/**
 * Pause until the operator has run corpora-creator by hand.
 * (create-corpora is not spawned from here because it eats too much memory.)
 *
 * Builds the exact `create-corpora` command for this release and hands it to
 * promptLoop together with a single accepted answer, 'corpora-complete'.
 *
 * @param {string} releaseName name of current release
 */
const processCorpora = async (releaseName) => {
  const releaseDir = path.join(__dirname, releaseName);
  const tsvPath = path.join(releaseDir, 'clips.tsv');

  // The blank entry yields the empty line between the command and the
  // instruction; the final entry keeps its trailing space after the `>`.
  const query = [
    'In a separate shell, run the following command:',
    `create-corpora -f ${tsvPath} -d ${releaseDir} -v`,
    '',
    "When that has completed, return to this shell and type 'corpora-complete' and hit enter > ",
  ].join('\n');

  // Nothing to do on confirmation; the handler only has to exist.
  // NOTE(review): presumably promptLoop re-prompts until a listed answer is
  // typed - confirm in ./helpers.
  const acknowledge = () => {};

  await promptLoop(query, { 'corpora-complete': acknowledge });
};
/**
 * Helper function to create stats object with test/dev/train bucket counts
 *
 * For every locale, each `.tsv` file found in `<releaseName>/<locale>` is
 * measured with countFileLines; that count minus one (never below zero) is
 * stored under the file name with its trailing `.tsv` removed. After each
 * locale the cumulative result is passed to saveStatsToDisk.
 *
 * @param {array} releaseLocales array of locale names
 * @param {string} releaseName name of current release (also the directory
 *   that is joined with each locale name to find its TSV files)
 *
 * @return {Promise<Object>} resolves to a stats object shaped like
 *   `{ [locale]: { buckets: { [bucketName]: lineCount } } }`
 */
const countBuckets = async (releaseLocales, releaseName) => {
  const tsvExtension = '.tsv';
  const buckets = {};

  for (const locale of releaseLocales) {
    const localePath = path.join(releaseName, locale);

    // readdirSync is synchronous, so there is nothing to await here
    const tsvFiles = fs
      .readdirSync(localePath)
      .filter((file) => file.endsWith(tsvExtension));

    // Count number of lines in each TSV file for this locale (concurrently)
    const counts = await Promise.all(
      tsvFiles.map(async (fileName) => {
        const lineCount = await countFileLines(path.join(localePath, fileName));
        // One line is discounted (presumably the TSV header row - confirm);
        // an empty file must not produce a negative count.
        return Math.max(lineCount - 1, 0);
      }),
    );

    // Build the locale object to match stats formatting
    const localeBuckets = {};
    tsvFiles.forEach((fileName, index) => {
      // Strip only the trailing extension. Splitting on '.tsv' would cut at
      // its first occurrence, so 'dev.tsv' and 'dev.tsv.old.tsv' would both
      // become 'dev' and overwrite each other.
      const bucketName = fileName.slice(0, -tsvExtension.length);
      localeBuckets[bucketName] = counts[index];
    });
    buckets[locale] = { buckets: localeBuckets };

    // Save the stats gathered so far to disk after every locale
    saveStatsToDisk(releaseName, { locales: buckets });
  }

  return buckets;
};
// Expose both functions as this module's CommonJS exports.
module.exports = {
countBuckets,
processCorpora,
};