Skip to content

Commit

Permalink
Implement card deduplication
Browse files Browse the repository at this point in the history
  • Loading branch information
D0ugins committed Mar 29, 2022
1 parent fb0aab8 commit 23b23ee
Show file tree
Hide file tree
Showing 9 changed files with 239 additions and 11 deletions.
3 changes: 2 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
"mammoth": "^1.4.19",
"node-pandoc-promise": "^0.0.6",
"p-ratelimit": "^1.0.1",
"redis": "^4.0.4",
"sqlite3": "^5.0.0",
"tmp-promise": "^3.0.2",
"typescript-collections": "^1.3.3",
Expand Down Expand Up @@ -52,4 +53,4 @@
"tscpaths": "^0.0.9",
"typescript": "^4.5.2"
}
}
}
1 change: 1 addition & 0 deletions prisma/schema.prisma
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ model Evidence {
file File? @relation(fields: [fileId], references: [id])
fileId Int?
parent Int? // Null if unprocessed, same as id if no parent
}

model File {
Expand Down
6 changes: 4 additions & 2 deletions src/actions/addEvidence.ts
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
import { Prisma } from '@prisma/client';
import { db } from 'app/lib';
import { db, DedupTask, TypedEvent } from 'app/lib';
import { omit } from 'lodash';

type EvidenceData = Omit<Prisma.EvidenceCreateInput, 'file'> & { file: Prisma.FileWhereUniqueInput };
export const onAddEvidence = new TypedEvent<DedupTask>();

export default async (data: EvidenceData): Promise<void> => {
await db.evidence.upsert({
const evidence = await db.evidence.upsert({
where: {
gid: data.gid,
},
Expand All @@ -22,5 +23,6 @@ export default async (data: EvidenceData): Promise<void> => {
},
});

await new Promise((resolve) => onAddEvidence.emit({ text: evidence.fulltext, id: evidence.id, callback: resolve }));
return;
};
13 changes: 13 additions & 0 deletions src/constants/index.ts
Original file line number Diff line number Diff line change
@@ -1 +1,14 @@
// Maximum number of files parsed concurrently; lets parsing continue while other files are waiting on a database response
export const CONCURRENT_PARSERS = 10;

/* Allow small differences between matching cards, e.g. when part of the cite ended up attached to the start of the card */
// Number of sentences at the start or end of a card that may differ while the match is still treated as reaching that edge
export const EDGE_TOLERANCE = 1;
// When a card contains almost all of another card, this is the number of sentences of the contained card
// that may be missing from its start or end while it is still treated as matching in full
export const INSIDE_TOLERANCE = 1;
/*
Regex used to split text into sentences
Matches a run of punctuation (. ? !) that is followed by optional digits, whitespace and a capital letter,
which allows citation numbers directly after the punctuation (ex. Sample text.123 Next sentence)
The lookahead is not consumed, so the digits stay at the start of the following sentence
Because the punctuation is in a capturing group, String.prototype.split also emits the captured mark as its own array element
Will fail in some weird cases, but should be good enough
*/
export const SENTENCE_REGEX = /([.?!])+(?=\d*\s+[A-Z])/;
14 changes: 11 additions & 3 deletions src/lib/db.ts
Original file line number Diff line number Diff line change
@@ -1,13 +1,21 @@
import { PrismaClient } from '@prisma/client';
import { createClient } from 'redis';

// add prisma to the NodeJS global type
// add prisma and redis to the NodeJS global type
// Shape of the NodeJS global object once the database clients have been stored on it
interface CustomNodeJsGlobal extends NodeJS.Global {
// Cached Prisma client
prisma: PrismaClient;
// Cached redis client; the type is taken from whatever createClient returns
redis: ReturnType<typeof createClient>;
}

// Prevent multiple instances of Prisma Client in development
// Prevent multiple instances of databases in development
declare const global: CustomNodeJsGlobal;

// Reuse the clients stored on the global object when they exist, otherwise create new ones
export const db = global.prisma || new PrismaClient();
export const redis = global.redis || createClient();

if (process.env.NODE_ENV === 'development') global.prisma = db;
// The client may be the one cached on the global object by an earlier evaluation of this module,
// in which case it is already connected; calling connect() again on an open client rejects.
if (!redis.isOpen) redis.connect();

// In development, store both clients on the global object so that the `global.prisma || ...` and
// `global.redis || ...` lookups in this module find them if the module is evaluated again
if (process.env.NODE_ENV === 'development') {
global.prisma = db;
global.redis = redis;
}
71 changes: 71 additions & 0 deletions src/lib/debate-tools/duplicate.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
import { toPairs } from 'lodash';
import { redis } from 'app/lib/db';
import { createHash } from 'crypto';
import { EDGE_TOLERANCE, INSIDE_TOLERANCE, SENTENCE_REGEX } from 'app/constants';

// Per matched card id: indexes into the list of sentence lookups that bound where that card was seen (used by getMatching)
type CardMatches = Record<number, { start: number; end: number }>;
// A card handed to the deduplicator
export interface DedupTask {
// Text of the card; it is split into sentences for matching
text: string;
// Id of the card's evidence row
id: number;
// Called by the deduplicator once it has finished handling the task
callback: (value: unknown) => void;
}

/**
 * Splits card text into normalized sentences used as lookup keys.
 *
 * Each piece produced by SENTENCE_REGEX is reduced to its ASCII letters and lowercased;
 * pieces shorter than `cutoff` letters are dropped.
 *
 * @param text Card text; a null or undefined value yields undefined
 * @param cutoff Minimum number of letters a normalized sentence needs to be kept
 * @returns The normalized sentences in their original order, or undefined when there is no text
 */
export const getSentences = (text: string, cutoff = 20): string[] | undefined => {
  if (text == null) return undefined;

  const normalized: string[] = [];
  for (const piece of text.split(SENTENCE_REGEX)) {
    const letters = piece.replace(/[^A-Z]/gi, '').toLowerCase();
    if (letters.length >= cutoff) normalized.push(letters);
  }
  return normalized;
};

/*
  Redis stores small hashes in a memory efficient way, so grouping values into many small hash
  buckets drastically reduces the per-value overhead compared to one top-level key per value
  https://redis.io/topics/memory-optimization
*/
const getSentenceKey = (sentence: string): [string, string] => {
  const digest = createHash('md5').update(sentence).digest('base64');
  // Every base64 character carries 6 bits:
  //   characters 0-2 (top 18 bits) name the bucket, prefixed with 's'
  //   characters 3-8 (next 36 bits) are the field inside that bucket
  // 18 bits gives around 260k buckets; the original estimate is a few hundred items per bucket with the full dataset
  const bucket = `s${digest.substring(0, 3)}`;
  const field = digest.substring(3, 9);
  return [bucket, field];
};
/**
 * Redis accessors for the sentence -> card mapping.
 * The bucket and field for a sentence come from getSentenceKey.
 */
export const Sentence = {
  // Looks up the value stored for a sentence
  get: (sentence: string): Promise<string> => {
    const [bucket, field] = getSentenceKey(sentence);
    return redis.hGet(bucket, field);
  },
  // Stores `card` as the value for a sentence
  set: (sentence: string, card: number): Promise<number> => {
    const [bucket, field] = getSentenceKey(sentence);
    return redis.hSet(bucket, field, card);
  },
};

/**
 * Redis accessors for per-card info, bucketed 256 cards per hash.
 * The hash key is `i` + (cardId >> 8); the field name is the field letter + (cardId % 256).
 * Field 'p' holds the card's parent id, field 'l' the card's sentence count.
 */
export const Info = {
  get: (cardId: number, field: 'p' | 'l'): Promise<string> => {
    const bucket = `i${cardId >> 8}`;
    const slot = `${field}${cardId % 256}`;
    return redis.hGet(bucket, slot);
  },
  set: (cardId: number, field: 'p' | 'l', value: string | number): Promise<number> => {
    const bucket = `i${cardId >> 8}`;
    const slot = `${field}${cardId % 256}`;
    return redis.hSet(bucket, slot, value);
  },
};

/**
 * Records `parentId` as the parent of every card in `cardIds` in redis.
 *
 * @param cardIds Ids of the cards to re-parent
 * @param parentId Id of the parent card
 * @returns The pending redis commands: one info update per card, followed by the
 *   command adding the cards to the parent's child set (`c<parentId>`)
 */
export const setRedisParents = (cardIds: string[], parentId: number): Promise<number>[] => {
  const parentValue = parentId.toString();
  const pending: Promise<number>[] = [];

  // Point each card's info at the new parent
  for (const cardId of cardIds) pending.push(Info.set(Number(cardId), 'p', parentValue));
  // Add the cards to the parent's child list
  pending.push(redis.sAdd(`c${parentId}`, cardIds));

  return pending;
};
/**
 * Reads the members of a card's child set (`c<cardId>`) from redis.
 *
 * The call returns a promise, which is never nullish, so a `??` fallback on it could never
 * apply and has been removed; a set that does not exist is reported by redis as empty.
 *
 * @param cardId Id of the parent card
 * @returns The ids stored in the parent's child set
 */
export const getChildren = (cardId: string): Promise<string[]> => redis.sMembers(`c${cardId}`);

/**
 * Groups per-sentence lookup results by card and keeps the cards that look like real matches.
 *
 * @param matches One entry per sentence of the card being processed, in order: the id of the
 *   card already stored for that sentence, or null when the sentence is not known
 * @returns One entry per distinct card found in `matches`: the card id when the match is accepted,
 *   `false` when it is judged to be a false match. Empty when no sentence matched at all.
 */
export const getMatching = async (matches: (string | null)[]): Promise<(string | false)[]> => {
  // No matches: return an empty list so the result agrees with the declared return type.
  // `!= null` also treats undefined as "no match".
  if (!matches.some((el) => el != null)) return [];

  const lastIndex = matches.length - 1;

  // Track where each card was seen, in case there is a gap due to a typo or a hash collision
  const cards: CardMatches = {};
  for (let i = 0; i < matches.length; i++) {
    const id = matches[i];
    if (id == null) continue;

    const seen = cards[id];
    if (seen) {
      // Seen before: move the end of the match to the current sentence
      seen.end = i;
    } else {
      // First sighting: start here and provisionally end at the last sentence of the card
      // NOTE(review): a card seen only once therefore always counts as reaching the end — confirm this is intended
      cards[id] = { start: i, end: lastIndex };
    }
  }

  // Filter out probably false matches
  return Promise.all(
    toPairs(cards).map(async ([key, { start, end }]) => {
      // A match that begins within EDGE_TOLERANCE sentences of the start, or finishes within
      // EDGE_TOLERANCE sentences of the end, is probably a real match
      if (start <= EDGE_TOLERANCE || end >= lastIndex - EDGE_TOLERANCE) return key;

      // Otherwise the matched card should sit (almost) entirely inside this one: at most
      // INSIDE_TOLERANCE of its sentences may fall outside the matched span
      const cardLength = +(await Info.get(+key, 'l'));
      const missing = cardLength - (end - start + 1);
      return missing <= INSIDE_TOLERANCE && key;
    }),
  );
};
1 change: 1 addition & 0 deletions src/lib/debate-tools/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@ export * from './styles';
export * from './tokens';
export * from './parse';
export * from './id';
export * from './duplicate';
62 changes: 62 additions & 0 deletions src/modules/deduplicator/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
import { db } from 'app/lib';
import { getSentences, Sentence, Info, getChildren, getMatching, setRedisParents, DedupTask } from 'app/lib';
import { onAddEvidence } from 'app/actions/addEvidence';
import { filter, min, uniq } from 'lodash';
import { Queue } from 'typescript-collections';

// Cards waiting to be deduplicated; filled by the onAddEvidence listener and emptied by drain
const evidenceQueue = new Queue<DedupTask>();

/**
 * Sets `parentId` as the parent of the given cards in both redis and the database.
 * The redis commands are issued without being awaited; the database update is returned to the caller.
 */
function updateParents(cardIds: string[], parentId: number) {
  const numericIds = cardIds.map((cardId) => Number(cardId));

  setRedisParents(cardIds, parentId);
  return db.evidence.updateMany({
    where: { id: { in: numericIds } },
    data: { parent: parentId },
  });
}

/**
 * Works out which parent a card belongs to and records it.
 *
 * The card's sentences are looked up in redis. If they match existing cards, the card joins
 * their parent; if the matches have several different parents, those groups are merged under
 * the numerically smallest parent id. A card without matches becomes its own parent.
 *
 * @param task Card text and id (the callback is not used here)
 * @returns The database update from updateParents
 */
async function setParent({ text, id }: DedupTask) {
  let parent = id;
  const updates = [id.toString()];

  // Nothing to match on: the card is its own parent
  const sentences = getSentences(text);
  if (!sentences?.length) return updateParents(updates, parent);

  const existing = await Promise.all(sentences.map(Sentence.get));
  // filter drops the `false` entries (rejected matches) and tolerates an empty result
  const matching = filter(await getMatching(existing));

  // Store the sentence count; not awaited, the command only needs to be sent
  Info.set(id, 'l', sentences.length);
  if (matching.length) {
    const storedParents = await Promise.all(matching.map((card) => Info.get(+card, 'p')));
    // Compare parents as numbers: as strings '10' would sort below '9'.
    // Matches without a recorded parent are skipped instead of being coerced to id 0.
    const parentIds = uniq(storedParents.filter((stored) => stored != null).map(Number));

    // With a single parent this is that parent; with none the card stays its own parent
    parent = min(parentIds) ?? parent;

    if (parentIds.length > 1) {
      // In the rare case that several different parents were matched, merge them:
      // every child of a losing parent is moved to the chosen parent
      await Promise.all(
        parentIds
          .filter((candidate) => candidate !== parent)
          .map((candidate) => getChildren(candidate.toString()).then((children) => updates.push(...children))),
      );
    }
  }

  // Commands will be sent in order so there is no need to wait for responses
  sentences.forEach((sentence) => Sentence.set(sentence, parent));
  return updateParents(uniq(updates), parent);
}

// Queue every card announced through onAddEvidence; drain handles them one at a time
onAddEvidence.on((data) => evidenceQueue.enqueue(data));

/**
 * Processes the queue one task at a time, polling once a second while it is empty.
 *
 * A task that fails is logged and then treated as finished: its callback still runs and the
 * next task is still picked up. Otherwise one rejected task would stop the queue for good
 * and leave the caller waiting on the callback forever.
 */
const drain = (): void => {
  // TODO: Add chunks of unduplicated cards from db if queue is empty
  if (evidenceQueue.size() === 0) {
    setTimeout(drain, 1000);
    return;
  }

  const task = evidenceQueue.dequeue();
  // Doesn't actually wait for the parent to be set, just until the commands are sent
  setParent(task)
    .catch((error: unknown) => {
      console.error(`Failed to deduplicate card ${task.id}`, error);
    })
    .then(task.callback)
    .then(drain);
};

drain();
79 changes: 74 additions & 5 deletions yarn.lock
Original file line number Diff line number Diff line change
Expand Up @@ -512,6 +512,41 @@
call-me-maybe "^1.0.1"
glob-to-regexp "^0.3.0"

"@node-redis/[email protected]":
version "1.0.1"
resolved "https://registry.yarnpkg.com/@node-redis/bloom/-/bloom-1.0.1.tgz#144474a0b7dc4a4b91badea2cfa9538ce0a1854e"
integrity sha512-mXEBvEIgF4tUzdIN89LiYsbi6//EdpFA7L8M+DHCvePXg+bfHWi+ct5VI6nHUFQE5+ohm/9wmgihCH3HSkeKsw==

"@node-redis/[email protected]":
version "1.0.4"
resolved "https://registry.yarnpkg.com/@node-redis/client/-/client-1.0.4.tgz#fe185750df3bcc07524f63fe8dbc8d14d22d6cbb"
integrity sha512-IM/NRAqg7MvNC3bIRQipXGrEarunrdgvrbAzsd3ty93LSHi/M+ybQulOERQi8a3M+P5BL8HenwXjiIoKm6ml2g==
dependencies:
cluster-key-slot "1.1.0"
generic-pool "3.8.2"
redis-parser "3.0.0"
yallist "4.0.0"

"@node-redis/[email protected]":
version "1.0.0"
resolved "https://registry.yarnpkg.com/@node-redis/graph/-/graph-1.0.0.tgz#baf8eaac4a400f86ea04d65ec3d65715fd7951ab"
integrity sha512-mRSo8jEGC0cf+Rm7q8mWMKKKqkn6EAnA9IA2S3JvUv/gaWW/73vil7GLNwion2ihTptAm05I9LkepzfIXUKX5g==

"@node-redis/[email protected]":
version "1.0.2"
resolved "https://registry.yarnpkg.com/@node-redis/json/-/json-1.0.2.tgz#8ad2d0f026698dc1a4238cc3d1eb099a3bee5ab8"
integrity sha512-qVRgn8WfG46QQ08CghSbY4VhHFgaTY71WjpwRBGEuqGPfWwfRcIf3OqSpR7Q/45X+v3xd8mvYjywqh0wqJ8T+g==

"@node-redis/[email protected]":
version "1.0.3"
resolved "https://registry.yarnpkg.com/@node-redis/search/-/search-1.0.3.tgz#7c3d026bf994caf82019fd0c3924cfc09f041a29"
integrity sha512-rsrzkGWI84di/uYtEctS/4qLusWt0DESx/psjfB0TFpORDhe7JfC0h8ary+eHulTksumor244bXLRSqQXbFJmw==

"@node-redis/[email protected]":
version "1.0.2"
resolved "https://registry.yarnpkg.com/@node-redis/time-series/-/time-series-1.0.2.tgz#5dd3638374edd85ebe0aa6b0e87addc88fb9df69"
integrity sha512-HGQ8YooJ8Mx7l28tD7XjtB3ImLEjlUxG1wC1PAjxu6hPJqjPshUZxAICzDqDjtIbhDTf48WXXUcx8TQJB1XTKA==

"@nodelib/fs.stat@^1.1.2":
version "1.1.3"
resolved "https://registry.yarnpkg.com/@nodelib/fs.stat/-/fs.stat-1.1.3.tgz#2b5a3ab3f918cca48a8c754c08168e3f03eba61b"
Expand Down Expand Up @@ -1866,6 +1901,11 @@ cliui@^6.0.0:
strip-ansi "^6.0.0"
wrap-ansi "^6.2.0"

[email protected]:
version "1.1.0"
resolved "https://registry.yarnpkg.com/cluster-key-slot/-/cluster-key-slot-1.1.0.tgz#30474b2a981fb12172695833052bc0d01336d10d"
integrity sha512-2Nii8p3RwAPiFwsnZvukotvow2rIHM+yQ6ZcBXGHdniadkYGZYiGmkHJIbZPIV9nfv7m/U1IPMVVcAhoWFeklw==

co@^4.6.0:
version "4.6.0"
resolved "https://registry.yarnpkg.com/co/-/co-4.6.0.tgz#6ea6bdf3d853ae54ccb8e47bfa0bf3f9031fb184"
Expand Down Expand Up @@ -2938,6 +2978,11 @@ gauge@~2.7.3:
strip-ansi "^3.0.1"
wide-align "^1.1.0"

[email protected]:
version "3.8.2"
resolved "https://registry.yarnpkg.com/generic-pool/-/generic-pool-3.8.2.tgz#aab4f280adb522fdfbdc5e5b64d718d3683f04e9"
integrity sha512-nGToKy6p3PAbYQ7p1UlWl6vSPwfwU6TMSWK7TTu+WUY4ZjyZQGniGGt2oNVvyNSpyZYSB43zMXVLcBm08MTMkg==

gensync@^1.0.0-beta.2:
version "1.0.0-beta.2"
resolved "https://registry.yarnpkg.com/gensync/-/gensync-1.0.0-beta.2.tgz#32a6ee76c3d7f52d46b2b1ae5d93fea8580a25e0"
Expand Down Expand Up @@ -5278,6 +5323,30 @@ redent@^1.0.0:
indent-string "^2.1.0"
strip-indent "^1.0.1"

redis-errors@^1.0.0:
version "1.2.0"
resolved "https://registry.yarnpkg.com/redis-errors/-/redis-errors-1.2.0.tgz#eb62d2adb15e4eaf4610c04afe1529384250abad"
integrity sha1-62LSrbFeTq9GEMBK/hUpOEJQq60=

[email protected]:
version "3.0.0"
resolved "https://registry.yarnpkg.com/redis-parser/-/redis-parser-3.0.0.tgz#b66d828cdcafe6b4b8a428a7def4c6bcac31c8b4"
integrity sha1-tm2CjNyv5rS4pCin3vTGvKwxyLQ=
dependencies:
redis-errors "^1.0.0"

redis@^4.0.4:
version "4.0.4"
resolved "https://registry.yarnpkg.com/redis/-/redis-4.0.4.tgz#b567f82f59086df38433982f7f424b48e924ec7a"
integrity sha512-KaM1OAj/nGrSeybmmOWSMY0LXTGT6FVWgUZZrd2MYzXKJ+VGtqVaciGQeNMfZiQX+kDM8Ke4uttb54m2rm6V0A==
dependencies:
"@node-redis/bloom" "1.0.1"
"@node-redis/client" "1.0.4"
"@node-redis/graph" "1.0.0"
"@node-redis/json" "1.0.2"
"@node-redis/search" "1.0.3"
"@node-redis/time-series" "1.0.2"

regenerate@^1.2.1:
version "1.4.2"
resolved "https://registry.yarnpkg.com/regenerate/-/regenerate-1.4.2.tgz#b9346d8827e8f5a32f7ba29637d398b69014848a"
Expand Down Expand Up @@ -6533,16 +6602,16 @@ y18n@^4.0.0:
resolved "https://registry.yarnpkg.com/y18n/-/y18n-4.0.3.tgz#b5f259c82cd6e336921efd7bfd8bf560de9eeedf"
integrity sha512-JKhqTOwSrqNA1NY5lSztJ1GrBiUodLMmIZuLiDaMRJ+itFd+ABVE8XBjOvIWL+rSqNDC74LCSFmlb/U4UZ4hJQ==

[email protected], yallist@^4.0.0:
version "4.0.0"
resolved "https://registry.yarnpkg.com/yallist/-/yallist-4.0.0.tgz#9bb92790d9c0effec63be73519e11a35019a3a72"
integrity sha512-3wdGidZyq5PB084XLES5TpOSRA3wjXAlIWMhum2kRcv/41Sn2emQ0dycQW4uZXLejwKvg6EsvbdlVL+FYEct7A==

yallist@^3.0.0, yallist@^3.0.3:
version "3.1.1"
resolved "https://registry.yarnpkg.com/yallist/-/yallist-3.1.1.tgz#dbb7daf9bfd8bac9ab45ebf602b8cbad0d5d08fd"
integrity sha512-a4UGQaWPH59mOXUYnAG2ewncQS4i4F43Tv3JoAM+s2VDAmS9NsK8GpDMLrCHPksFT7h3K6TOoUNn2pb7RoXx4g==

yallist@^4.0.0:
version "4.0.0"
resolved "https://registry.yarnpkg.com/yallist/-/yallist-4.0.0.tgz#9bb92790d9c0effec63be73519e11a35019a3a72"
integrity sha512-3wdGidZyq5PB084XLES5TpOSRA3wjXAlIWMhum2kRcv/41Sn2emQ0dycQW4uZXLejwKvg6EsvbdlVL+FYEct7A==

yargs-parser@^18.1.2:
version "18.1.3"
resolved "https://registry.yarnpkg.com/yargs-parser/-/yargs-parser-18.1.3.tgz#be68c4975c6b2abf469236b0c870362fab09a7b0"
Expand Down

0 comments on commit 23b23ee

Please sign in to comment.