Merge pull request #426 from Gapminder/fix-dup-datapoint
fixed(rules): error in datapoint duplications
buchslava authored Jun 14, 2017
2 parents baeda45 + 0d24466 commit 2227b28
Showing 8 changed files with 70 additions and 70 deletions.
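For orientation before reading the diff: both duplicated-data-point rules stop keying their state by a per-rule Symbol in a shared Map and instead keep a single module-level storage object that is rebuilt via initStorage()/resetStorage(). The duplicate check itself now derives a readable composite key from the sorted primary-key dimensions plus the indicator column(s), rather than from an md5 of the raw key values. A minimal sketch of that key construction — the column names and sample record mirror the test fixture further down, and the numeric value is invented:

```ts
import { difference } from 'lodash';

// Descriptor fields as they appear in the rule below; the record value is made up.
const primaryKey = ['geo', 'year'];
const headers = ['geo', 'year', 'gas_production_bcf'];
const record: Record<string, string> = { geo: 'algeria', year: '1977', gas_production_bcf: '233.9' };

// Sort the key columns (copied here to avoid mutating the descriptor), pair each with its value,
// and append the non-key columns as the indicator name.
const sortedPrimaryKey = [...primaryKey].sort();
const dimensionData = sortedPrimaryKey.map(keyPart => `${keyPart}:${record[keyPart]}`).join(',');
const indicatorName = difference(headers, primaryKey).join(',');
const recordHash = `${dimensionData}@${indicatorName}`;

console.log(recordHash); // geo:algeria,year:1977@gas_production_bcf
```

Because the key now carries both the dimension names and the indicator, the duplicate reported in the translation test below reads as 'anno:2016,company:mic@company_size_string'.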
1 change: 1 addition & 0 deletions .gitignore
@@ -7,3 +7,4 @@ npm-debug.log
coverage
_results
_results-test
package-lock.json
2 changes: 1 addition & 1 deletion package.json
@@ -12,7 +12,7 @@
"test-travis": "istanbul cover mocha _mocha -- -R spec --timeout 200000 --compilers ts:ts-node/register --recursive test/**/*.spec.ts && codecov",
"changelog": "conventional-changelog -i CHANGELOG.md -s -p angular",
"github-release": "conventional-github-releaser -p angular",
"build": "tsc && touch lib/package.json && echo \\{\\\"version\\\": \\\"1.6.1\\\"\\} > lib/package.json",
"build": "tsc && touch lib/package.json && echo \\{\\\"version\\\": \\\"1.6.2\\\"\\} > lib/package.json",
"prepublish": "npm run build",
"preversion": "npm test",
"version": "npm run changelog && git add CHANGELOG.md",
64 changes: 30 additions & 34 deletions src/ddf-rules/data-point-rules/duplicated-data-point-key.ts
@@ -2,58 +2,54 @@ import { uniq, isEmpty, difference } from 'lodash';
import { DUPLICATED_DATA_POINT_KEY } from '../registry';
import { Issue } from '../issue';

const md5 = require('md5');
let storage: any;

let storage: Map<string, any> = new Map();

const initRuleInStorage = (ruleKey: string) => {
if (!storage.has(ruleKey)) {
storage.set(ruleKey, {
hash: new Set(),
duplicatedHashes: [],
content: {}
});
const initStorage = () => {
if (storage && storage.hash) {
storage.hash.clear();
}

storage = {
hash: new Set(),
duplicatedHashes: [],
content: {}
};
};

initStorage();

export const rule = {
resetStorage: () => {
storage.clear();
initStorage();
},
aggregateRecord: (dataPointDescriptor, ruleKey) => {
const ruleKeyString = Symbol.keyFor(ruleKey);

initRuleInStorage(ruleKeyString);

const keyData = dataPointDescriptor.fileDescriptor.primaryKey.map(keyPart => dataPointDescriptor.record[keyPart]).join(',');
const indicatorKey = difference(dataPointDescriptor.fileDescriptor.headers, dataPointDescriptor.fileDescriptor.primaryKey).join(',');
const recordHash = `${keyData}@${indicatorKey}`;

if (storage.get(ruleKeyString).hash.has(recordHash)) {
storage.get(ruleKeyString).duplicatedHashes.push(recordHash);
aggregateRecord: (dataPointDescriptor) => {
const sortedPrimaryKey = dataPointDescriptor.fileDescriptor.primaryKey.sort();
const dimensionData = sortedPrimaryKey.map(keyPart => `${keyPart}:${dataPointDescriptor.record[keyPart]}`).join(',');
const indicatorName = difference(dataPointDescriptor.fileDescriptor.headers, dataPointDescriptor.fileDescriptor.primaryKey).join(',');
const recordHash = `${dimensionData}@${indicatorName}`;

if (storage.hash.has(recordHash)) {
storage.duplicatedHashes.push(recordHash);
}

storage.get(ruleKeyString).hash.add(recordHash);
storage.hash.add(recordHash);

if (!storage.get(ruleKeyString).content[recordHash]) {
storage.get(ruleKeyString).content[recordHash] = [];
if (!storage.content[recordHash]) {
storage.content[recordHash] = [];
}

storage.get(ruleKeyString).content[recordHash].push({
storage.content[recordHash].push({
file: dataPointDescriptor.fileDescriptor.file,
record: dataPointDescriptor.record,
line: dataPointDescriptor.line
});
},
aggregativeRule: (dataPointDescriptor, ruleKey) => {
const ruleKeyString = Symbol.keyFor(ruleKey);

initRuleInStorage(ruleKeyString);

const duplicates: string[] = <string[]>uniq(storage.get(ruleKeyString).duplicatedHashes);
aggregativeRule: (dataPointDescriptor) => {
const duplicates: string[] = <string[]>uniq(storage.duplicatedHashes);
const data: any[] = [];

for (const hash of duplicates) {
data.push(storage.get(ruleKeyString).content[hash]);
data.push(storage.content[hash]);
}

let issue = null;
@@ -62,7 +58,7 @@ export const rule = {
issue = new Issue(DUPLICATED_DATA_POINT_KEY).setPath(dataPointDescriptor.fileDescriptor.fullPath).setData(data);
}

storage.delete(ruleKeyString);
initStorage();

return issue;
}
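Together with the caller changes in src/index.ts and src/shared.ts further down, the rewritten rule is now driven in a plain reset → aggregate → summarize sequence, with no rule key passed around. A self-contained sketch of that sequence, assuming the file above is imported from the repository root; the fullPath, line numbers and indicator values are hypothetical:

```ts
import { rule } from './src/ddf-rules/data-point-rules/duplicated-data-point-key';

const fileDescriptor = {
  file: 'ddf--datapoints--gas_production_bcf--by--geo--year.csv',
  fullPath: '/data/ddf--datapoints--gas_production_bcf--by--geo--year.csv', // hypothetical path
  primaryKey: ['geo', 'year'],
  headers: ['geo', 'year', 'gas_production_bcf']
};

// Two records sharing the same (geo, year) key, i.e. a duplicated data point.
const rows = [
  { record: { geo: 'algeria', year: '1977', gas_production_bcf: '233.9' }, line: 7 },
  { record: { geo: 'algeria', year: '1977', gas_production_bcf: '245.3' }, line: 9 }
];

rule.resetStorage();

let descriptor: any;
for (const { record, line } of rows) {
  descriptor = { fileDescriptor, record, line };
  rule.aggregateRecord(descriptor); // note: no ruleKey argument any more
}

// Returns an Issue listing both offending records, or null when nothing was duplicated.
const issue = rule.aggregativeRule(descriptor);
```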
@@ -1,40 +1,41 @@
import { uniq, isEmpty } from 'lodash';
import { difference, uniq, isEmpty } from 'lodash';
import { DUPLICATED_DATA_POINT_TRANSLATION_KEY } from '../registry';
import { Issue } from '../issue';

const md5 = require('md5');
let storage: any;

let storage: Map<string, any> = new Map();
const initStorage = () => {
if (storage && storage.hash) {
storage.hash.clear();
}

storage = {
hash: new Set(),
duplicatedPrimaryKeys: []
};
};

initStorage();

export const rule = {
resetStorage: () => {
storage.clear();
initStorage();
},
isTranslation: true,
aggregateRecord: (dataPointDescriptor, ruleKey) => {
const ruleKeyString = Symbol.keyFor(ruleKey);

if (!storage.has(ruleKeyString)) {
storage.set(ruleKeyString, {
hash: new Set(),
duplicatedPrimaryKeys: []
});
}

const keyData = dataPointDescriptor.fileDescriptor.primaryKey
.map(keyPart => dataPointDescriptor.record[keyPart])
.join(',');
const recordHash = md5(keyData);

if (storage.get(ruleKeyString).hash.has(recordHash)) {
storage.get(ruleKeyString).duplicatedPrimaryKeys.push(keyData);
aggregateRecord: (dataPointDescriptor) => {
const sortedPrimaryKey = dataPointDescriptor.fileDescriptor.primaryKey.sort();
const dimensionData = sortedPrimaryKey.map(keyPart => `${keyPart}:${dataPointDescriptor.record[keyPart]}`).join(',');
const indicatorName = difference(dataPointDescriptor.fileDescriptor.headers, dataPointDescriptor.fileDescriptor.primaryKey).join(',');
const recordHash = `${dimensionData}@${indicatorName}`;

if (storage.hash.has(recordHash)) {
storage.duplicatedPrimaryKeys.push(recordHash);
}

storage.get(ruleKeyString).hash.add(recordHash);
storage.hash.add(recordHash);
},
aggregativeRule: (dataPointDescriptor, ruleKey) => {
const ruleKeyString = Symbol.keyFor(ruleKey);
const duplicates = uniq(storage.get(ruleKeyString).duplicatedPrimaryKeys);
aggregativeRule: (dataPointDescriptor) => {
const duplicates = uniq(storage.duplicatedPrimaryKeys);

let issue = null;

@@ -44,7 +45,7 @@ export const rule = {
.setData(duplicates);
}

storage.delete(ruleKeyString);
initStorage();

return issue;
}
2 changes: 1 addition & 1 deletion src/index.ts
@@ -57,7 +57,7 @@ export function createRecordBasedRuleProcessor(context, fileDescriptor, resultHa
fileDescriptor,
record,
line
}, context.ruleKey);
});
}
}
},
2 changes: 1 addition & 1 deletion src/shared.ts
@@ -18,7 +18,7 @@ function createRecordAggregationProcessor(context, ddfDataSet, fileDescriptor, r
}

if (isAggregativeRule(ddfRules[key])) {
ddfRules[key].aggregateRecord({ddfDataSet, fileDescriptor, record, line}, key);
ddfRules[key].aggregateRecord({ddfDataSet, fileDescriptor, record, line});
}
});
};
2 changes: 2 additions & 0 deletions test/data-point-rules.spec.ts
@@ -214,6 +214,7 @@ describe('rules for data points', () => {
data: [
[
{
file: 'ddf--datapoints--gas_production_bcf--by--geo--year.csv',
record: {
geo: 'algeria',
year: '1977',
@@ -222,6 +223,7 @@
line: 7
},
{
file: 'ddf--datapoints--gas_production_bcf--by--geo--year.csv',
record: {
geo: 'algeria',
year: '1977',
14 changes: 7 additions & 7 deletions test/translation-rules.spec.ts
@@ -1,17 +1,17 @@
import * as chai from 'chai';
import {endsWith, isEqual, head, flattenDeep, compact, isEmpty} from 'lodash';
import {parallelLimit} from 'async';
import {createRecordBasedRuleProcessor} from '../src/index';
import {DdfDataSet} from '../src/ddf-definitions/ddf-data-set';
import { endsWith, isEqual, head, flattenDeep, compact, isEmpty } from 'lodash';
import { parallelLimit } from 'async';
import { createRecordBasedRuleProcessor } from '../src/index';
import { DdfDataSet } from '../src/ddf-definitions/ddf-data-set';
import {
UNEXPECTED_TRANSLATION_HEADER,
UNEXPECTED_TRANSLATIONS_DATA,
UNEXPECTED_DATA_POINT_TRANSLATIONS_DATA,
DUPLICATED_DATA_POINT_TRANSLATION_KEY,
DUPLICATED_TRANSLATION_KEY
} from '../src/ddf-rules/registry';
import {allRules} from '../src/ddf-rules';
import {Issue} from '../src/ddf-rules/issue';
import { allRules } from '../src/ddf-rules';
import { Issue } from '../src/ddf-rules/issue';

const CONCURRENT_OPERATIONS_AMOUNT = 30;
const expect = chai.expect;
@@ -265,7 +265,7 @@ describe('translation rules', () => {
const results = compact(tempResults);
const EXPECTED_RESULT = {
path: 'lang/nl-nl/ddf--datapoints--company_size_string--by--company--anno.csv',
data: ['mic,2016']
data: ['anno:2016,company:mic@company_size_string']
};

expect(results.length).to.equal(1);
