-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlambda-ocr.js
143 lines (118 loc) · 3.59 KB
/
lambda-ocr.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
var AWS = require('aws-sdk');
var path = require('path');
var uuid = require('node-uuid');
var Promise = require('bluebird');
var ocr = require('./ocr');
var signedRequest = require('./signedRequest');
// Single S3 client shared by the handler (getObject) and the copy/delete helpers below.
var s3 = new AWS.S3();
exports.handler = function (event, context) {
event.Records.forEach(function (record) {
var bucket = record.s3.bucket.name;
var urlencodedKey = record.s3.object.key;
console.log('event from bucket', bucket, 'with object key', urlencodedKey);
var key = decodeURIComponent(urlencodedKey.replace(/\+/g, ' '));
if (key.indexOf("/") === -1) {
context.fail("All comics need to go in a folder. The folder is used as the doctype in ES");
}
var splitKey = key.split("/");
var bucketFolder = splitKey[0];
var bucketFilename = splitKey[1];
var imageUuid = uuid.v4();
var destination = {
bucket: "bekkops-comicsearcher-www",
key: "comics" + "/" + imageUuid
};
var eventObject = { Bucket: bucket, Key: key };
var imageStream = s3.getObject(eventObject).createReadStream();
runOcr(imageStream)
.then(copyS3Object({
CopySource: bucket + '/' + urlencodedKey,
Bucket: destination.bucket,
Key: destination.key
}))
.then(postToEs({
uuid: imageUuid,
url: "https://s3-eu-west-1.amazonaws.com/" + destination.bucket + "/" + destination.key,
filename: bucketFilename,
doctype: bucketFolder
}))
.then(deleteS3Object(eventObject))
.then(function (results) {
console.log(JSON.stringify(results));
context.succeed();
})
.catch(function (err) {
console.error(err);
context.fail();
});
});
};
/**
 * Runs OCR on an image stream once the `ocr` module has emitted "authed".
 *
 * The listener is registered with `once` so that each call leaves no
 * listener behind on the shared emitter and a later "authed" event cannot
 * re-run OCR on a stream that has already been consumed.
 *
 * NOTE(review): the promise only settles from inside the "authed" listener;
 * if "authed" was emitted before this call it would never settle - confirm
 * against the ocr module.
 *
 * @param {Object} imageS3Stream - Stream handed unchanged to `ocr.runOcr`.
 * @returns {Promise} Resolves with `{ ocrText: text }`; rejects with the
 *     error reported by `ocr.runOcr`.
 */
function runOcr (imageS3Stream) {
    return new Promise(function (resolve, reject) {
        ocr.events.once("authed", function () {
            ocr.runOcr(imageS3Stream, function (err, text) {
                if (err) {
                    return reject(err);
                }
                return resolve({ ocrText: text });
            });
        });
    });
}
/**
 * Builds a promise-chain step that copies an S3 object via `s3.copyObject`.
 *
 * @param {Object} params - Arguments passed unchanged to `s3.copyObject`.
 * @returns {Function} Step taking the upstream `results` and returning a
 *     promise that resolves with those same `results` after the copy, or
 *     rejects with an Error whose `cause` is the error from S3.
 */
function copyS3Object (params) {
    var paramsStr = JSON.stringify(params);
    return function (results) {
        return new Promise(function (resolve, reject) {
            s3.copyObject(params, function (err, data) {
                if (err) {
                    var errMsg = "Could not move file " + paramsStr;
                    console.error(err, errMsg);
                    // reject() only takes one argument, so wrap the message
                    // and the original error into a single Error.
                    var failure = new Error(errMsg);
                    failure.cause = err;
                    return reject(failure);
                }
                console.log("Moved file", paramsStr);
                return resolve(results);
            });
        });
    };
}
/**
 * Builds a promise-chain step that posts the OCR document through
 * `signedRequest.post`.
 *
 * @param {Object} params - `uuid`, `url` and `filename` are copied into the
 *     document; `doctype` becomes the last segment of the request path.
 * @returns {Function} Step taking the upstream `results` (its `ocrText` is
 *     read), storing the response body on `results.esResponse` and
 *     resolving with that same `results` object.
 */
function postToEs (params) {
    return function (results) {
        // The raw text is stored twice (ocrtext / ocrtext_raw) next to the cleaned variant.
        var payload = {
            uuid: params.uuid,
            url: params.url,
            filename: params.filename,
            ocrtext: results.ocrText,
            correctedtext: cleanOcrText(results.ocrText),
            ocrtext_raw: results.ocrText
        };
        var endpoint = path.join('/', "comics", params.doctype);
        var request = signedRequest.post(endpoint, payload);

        return request.then(function (responseBody) {
            results.esResponse = responseBody;
            return results;
        });
    };
}
/**
 * Builds a promise-chain step that deletes an S3 object via `s3.deleteObject`.
 *
 * @param {Object} params - Arguments passed unchanged to `s3.deleteObject`.
 * @returns {Function} Step taking the upstream `results` and returning a
 *     promise that resolves with those same `results` after the delete, or
 *     rejects with an Error whose `cause` is the error from S3.
 */
function deleteS3Object (params) {
    var paramsStr = JSON.stringify(params);
    return function (results) {
        return new Promise(function (resolve, reject) {
            s3.deleteObject(params, function (err, data) {
                if (err) {
                    var errMsg = "Could not delete " + paramsStr;
                    console.error(err, errMsg);
                    // reject() only takes one argument, so wrap the message
                    // and the original error into a single Error.
                    var failure = new Error(errMsg);
                    failure.cause = err;
                    return reject(failure);
                }
                console.log("Deleted uploaded", paramsStr);
                return resolve(results);
            });
        });
    };
}
/**
 * Normalizes raw OCR output into a single line of plain words.
 *
 * CRLF pairs become a space, every character that is not an ASCII letter,
 * one of æøåÆØÅ, a digit or whitespace becomes a space, and finally each run
 * of whitespace is collapsed to one space. The result is not trimmed.
 *
 * @param {string} dirtyText - Raw OCR text.
 * @returns {string} The cleaned text.
 */
function cleanOcrText (dirtyText) {
    var withoutLineBreaks = dirtyText.replace(/\r\n/g, " ");
    var allowedCharsOnly = withoutLineBreaks.replace(/[^a-zA-ZæøåÆØÅ0-9\s]/g, " ");
    return allowedCharsOnly.replace(/\s+/g, " ");
}