-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathduplicates.js
63 lines (54 loc) · 1.52 KB
/
duplicates.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
var AWS = require('aws-sdk');
var config = require('./config.json');
// Endpoint that the `/_search` request further down is sent to, built from the
// `esUrl` value in config.json (presumably the Elasticsearch domain URL — confirm
// against config.json).
var endpoint = new AWS.Endpoint(config.esUrl);
// Query that matches every document, capped at 5000 hits. It is not sent by
// default; assign it to `doc` below to run the search across the whole index.
var matchAllQuery = {
  "query": {
    "match_all": {}
  },
  "size": 5000
};

// Full-text query for "hms" over the `ocrtext` and `correctedtext` fields.
var textSearchQuery = {
  "query": {
    "multi_match": {
      "fields": ["ocrtext", "correctedtext"],
      "query": "hms",
      "minimum_should_match": "80%"
    }
  }
};

// Request body that is serialized and sent to `/_search`. Previously both
// queries were declared as `var doc`, so the first one was silently overwritten;
// the choice is now explicit.
var doc = textSearchQuery;
/**
 * Groups Elasticsearch search hits by their OCR text and reports the groups
 * that look like duplicates.
 *
 * A group is reported when the same `ocrtext` value occurs in more than one
 * and fewer than five hits (the comics only come from 4 sources, so larger
 * groups are not treated as duplicates). The number of groups and the groups
 * themselves are written with console.log.
 *
 * @param {string} results - Raw JSON text of a search response; it must
 *   contain a `hits.hits` array whose items carry `_source.ocrtext`,
 *   `_source.uuid` and `_source.filename`.
 * @returns {Array<Array<{uuid: *, name: *}>>} One array per duplicated OCR
 *   text, each holding the uuid and filename of every hit in that group.
 * @throws {SyntaxError} If `results` is not valid JSON.
 * @throws {TypeError} If the parsed response has no `hits.hits` array.
 */
function findduplicates (results) {
  // Null-prototype dictionary: OCR text such as "constructor" or "__proto__"
  // must not collide with properties inherited from Object.prototype.
  var groups = Object.create(null);
  var parsed = JSON.parse(results);

  parsed.hits.hits.forEach(function (hit) {
    var source = hit._source;
    var key = source.ocrtext;
    // Every entry has the same shape, including the first one of a group
    // (which used to be stored as a bare uuid string).
    var entry = {uuid: source.uuid, name: source.filename};
    if (groups[key]) {
      groups[key].push(entry);
    } else {
      groups[key] = [entry];
    }
  });

  var vals = Object.keys(groups).map(function (key) {
    return groups[key];
  });
  // Only got comics from 4 sources, so a real duplicate set has 2 to 4 members.
  var filtered = vals.filter(function (group) {
    return group.length > 1 && group.length < 5;
  });

  console.log("Duplicates found: " + filtered.length);
  console.log(filtered);
  return filtered;
}
// Build a POST to `/_search` on the configured endpoint, carrying the query
// held in `doc` as a JSON body.
var req = new AWS.HttpRequest(endpoint);
req.path = "/_search";
req.body = JSON.stringify(doc);
req.method = 'POST';
req.region = "eu-west-1";
req.headers['presigned-expires'] = false;
req.headers['Host'] = endpoint.host;
// Elasticsearch 6.0 and later reject request bodies that do not declare a
// content type, so state explicitly that the body is JSON.
req.headers['Content-Type'] = 'application/json';

// Send the request and print the complete response body once it has arrived.
var send = new AWS.NodeHttpClient();
send.handleRequest(req, null, function (httpResp) {
  // Decode at the stream level so a multi-byte UTF-8 character that is split
  // across two chunks is not corrupted by per-chunk string conversion.
  httpResp.setEncoding('utf8');
  var body = '';
  httpResp.on('data', function (chunk) {
    body += chunk;
  });
  httpResp.on('end', function () {
    console.log(body);
    // Swap the line above for the call below to report duplicates instead of
    // dumping the raw response.
    // findduplicates(body);
  });
}, function (err) {
  // Transport-level failures (DNS, refused connection, timeout) are reported
  // here instead of going unhandled.
  console.error("Search request failed: " + err);
});