-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
9 changed files
with
239 additions
and
11 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,14 @@ | ||
// Max number of files being parsed concurrently, allows parsing to take place while waiting for database response | ||
export const CONCURRENT_PARSERS = 10; | ||
|
||
/* Allow small differences in matching cards to help with things like part of the cite being attached to the start of the card */ | ||
// If a card has EDGE_TOLERANCE different sentences at start or end, will be treated as if they matched all the way to the start or end | ||
export const EDGE_TOLERANCE = 1; | ||
// If a card has almost an entire card within it, with at most INSIDE_TOLERANCE sentences missing from the start or end, it will be treated as if the entire card matched | ||
export const INSIDE_TOLERANCE = 1; | ||
/* | ||
Regex used to split text into sentences | ||
Matches punctuation followed by (whitespace + capital letter) and allows citation numbers (ex. Sample text.123 Next sentence) | ||
Will fail in some weird cases, but should be good enough | ||
*/ | ||
export const SENTENCE_REGEX = /([.?!])+(?=\d*\s+[A-Z])/; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,13 +1,21 @@ | ||
import { PrismaClient } from '@prisma/client'; | ||
import { createClient } from 'redis'; | ||
|
||
// add prisma to the NodeJS global type | ||
// add prisma and redis to the NodeJS global type | ||
interface CustomNodeJsGlobal extends NodeJS.Global { | ||
prisma: PrismaClient; | ||
redis: ReturnType<typeof createClient>; | ||
} | ||
|
||
// Prevent multiple instances of Prisma Client in development | ||
// Prevent multiple instances of databases in development | ||
declare const global: CustomNodeJsGlobal; | ||
|
||
export const db = global.prisma || new PrismaClient(); | ||
export const redis = global.redis || createClient(); | ||
|
||
if (process.env.NODE_ENV === 'development') global.prisma = db; | ||
// Only open a connection for a client created by this module load. When the client was
// reused from `global` (development reloads) it has already been connected, and
// node-redis is expected to reject a second connect() on an open socket.
// `global.redis` is only assigned further down, so it is still unset on the first load.
if (!global.redis) redis.connect();
|
||
if (process.env.NODE_ENV === 'development') { | ||
global.prisma = db; | ||
global.redis = redis; | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
import { toPairs } from 'lodash'; | ||
import { redis } from 'app/lib/db'; | ||
import { createHash } from 'crypto'; | ||
import { EDGE_TOLERANCE, INSIDE_TOLERANCE, SENTENCE_REGEX } from 'app/constants'; | ||
|
||
/** First and last sentence index (inclusive) at which a matched card was seen. */
interface MatchSpan {
  start: number;
  end: number;
}

/** Matched card id -> span of sentence indexes where that card matched. */
type CardMatches = Record<number, MatchSpan>;

/** A queued card waiting to have its parent (duplicate group) worked out. */
export interface DedupTask {
  /** Id of the card being deduplicated. */
  id: number;
  /** Full text of the card; it is split into sentences for matching. */
  text: string;
  /** Invoked with the result once the task has been processed. */
  callback: (value: unknown) => void;
}
|
||
/**
 * Splits text into normalized sentences used for duplicate matching.
 *
 * Each piece produced by SENTENCE_REGEX is reduced to its letters (A-Z, case-insensitive)
 * and lower-cased; pieces shorter than `cutoff` characters after that are dropped.
 *
 * @param text Raw text to split.
 * @param cutoff Minimum normalized length a piece needs in order to be kept.
 * @returns The normalized sentences, or undefined when `text` is null/undefined.
 */
export const getSentences = (text: string, cutoff = 20): string[] | undefined => {
  if (text === null || text === undefined) return undefined;

  const normalized: string[] = [];
  for (const piece of text.split(SENTENCE_REGEX)) {
    const letters = piece.replace(/[^A-Z]/gi, '').toLowerCase();
    if (letters.length >= cutoff) normalized.push(letters);
  }
  return normalized;
};
|
||
/* | ||
Small hashes are stored in a memory efficient way in redis | ||
Storing data in buckets using hashes drastically reduces the overhead of storing each value | ||
https://redis.io/topics/memory-optimization | ||
*/ | ||
/**
 * Derives the redis hash location for a sentence from its MD5 digest.
 *
 * @param sentence Normalized sentence text.
 * @returns `[bucket, field]`: the bucket is 's' plus the first 3 base64 characters of the
 *   digest (18 bits), the field is the following 6 base64 characters (36 bits).
 */
const getSentenceKey = (sentence: string): [string, string] => {
  const digest = createHash('md5').update(sentence).digest('base64');
  const bucket = `s${digest.substring(0, 3)}`;
  const field = digest.substring(3, 9);
  return [bucket, field];
};
/** Redis accessors for the sentence -> card id lookup, stored in the buckets chosen by getSentenceKey. */
export const Sentence = {
  /** Reads the card id stored for a sentence. */
  get(sentence: string): Promise<string> {
    const [bucket, field] = getSentenceKey(sentence);
    return redis.hGet(bucket, field);
  },
  /** Stores `card` as the id associated with a sentence. */
  set(sentence: string, card: number): Promise<number> {
    const [bucket, field] = getSentenceKey(sentence);
    return redis.hSet(bucket, field, card);
  },
};
|
||
/** Builds the redis hash key and field for one per-card value: 256 consecutive card ids share a bucket. */
const infoLocation = (cardId: number, field: 'p' | 'l'): [string, string] => [
  `i${cardId >> 8}`,
  `${field}${cardId % 256}`,
];

/**
 * Redis accessors for per-card info.
 * In this file 'p' holds a card's parent id and 'l' its sentence count.
 */
export const Info = {
  /** Reads one info field of a card. */
  get(cardId: number, field: 'p' | 'l'): Promise<string> {
    const [bucket, key] = infoLocation(cardId, field);
    return redis.hGet(bucket, key);
  },
  /** Writes one info field of a card. */
  set(cardId: number, field: 'p' | 'l', value: string | number): Promise<number> {
    const [bucket, key] = infoLocation(cardId, field);
    return redis.hSet(bucket, key, value);
  },
};
|
||
/**
 * Records `parentId` as the parent of every card in `cardIds` and adds those cards to
 * the parent's child set (`c<parentId>`).
 *
 * @returns The pending redis writes: one per card info update, followed by the set insertion.
 */
export const setRedisParents = (cardIds: string[], parentId: number): Promise<number>[] => {
  const parentValue = parentId.toString();
  const pending: Promise<number>[] = [];

  // Update card infos with new parent
  for (const cardId of cardIds) pending.push(Info.set(+cardId, 'p', parentValue));
  // Add cards to parent's child list
  pending.push(redis.sAdd(`c${parentId}`, cardIds));

  return pending;
};
/**
 * Reads the members of a card's child set (`c<cardId>`).
 * The previous `?? Promise.resolve([])` fallback was removed: the promise returned by
 * redis.sMembers is never nullish, so that branch could not run.
 */
export const getChildren = (cardId: string): Promise<string[]> => redis.sMembers(`c${cardId}`);
|
||
/**
 * Decides which of the cards found for a card's sentences are probably real duplicates.
 *
 * @param matches One entry per sentence of the card being checked: the id stored for that
 *   sentence, or null when the sentence is unknown.
 * @returns null when nothing matched at all; otherwise one entry per distinct matched id,
 *   holding the id when the match is accepted and `false` when it is rejected.
 */
export const getMatching = async (matches: (string | null)[]): Promise<(string | false)[]> => {
  // If no matches (missing entries are tolerated whether they arrive as null or undefined)
  if (matches.every((el) => el == null)) return null;

  // Calculates length of match in case there is a gap due to typo or collision
  const cards: CardMatches = {};
  matches.forEach((id, i) => {
    if (id == null) return;
    const span = cards[id];
    // If new match, set current index as start and end at end of card, otherwise update end index
    if (span) span.end = i;
    else cards[id] = { start: i, end: matches.length - 1 };
  });

  // Filter out probably false matches
  return Promise.all(
    toPairs(cards).map(async ([key, { start, end }]) => {
      // If match starts at start or ends at end (within EDGE_TOLERANCE sentences) it is probably a real match.
      // The start check used to be `start >= EDGE_TOLERANCE`, which accepted almost every
      // match that did NOT begin at the start and so bypassed the filter below.
      const startsAtStart = start <= EDGE_TOLERANCE;
      const endsAtEnd = end >= matches.length - (EDGE_TOLERANCE + 1);
      if (startsAtStart || endsAtEnd) return key;

      // If it doesn't reach start or end, it should be the entire card inside this one.
      // NOTE(review): this only rejects spans longer than the stored card length ('l');
      // confirm whether a lower bound on the span was intended as well.
      return end - start - +(await Info.get(+key, 'l')) <= INSIDE_TOLERANCE && key;
    }),
  );
};
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
import { db } from 'app/lib'; | ||
import { getSentences, Sentence, Info, getChildren, getMatching, setRedisParents, DedupTask } from 'app/lib'; | ||
import { onAddEvidence } from 'app/actions/addEvidence'; | ||
import { filter, min, uniq } from 'lodash'; | ||
import { Queue } from 'typescript-collections'; | ||
|
||
const evidenceQueue = new Queue<DedupTask>(); | ||
|
||
// Update parents in redis and in the database.
// The redis writes are started without being awaited; the returned promise is the database update only.
function updateParents(cardIds: string[], parentId: number) {
  setRedisParents(cardIds, parentId);

  const numericIds = cardIds.map(Number);
  return db.evidence.updateMany({
    where: { id: { in: numericIds } },
    data: { parent: parentId },
  });
}
|
||
/**
 * Works out the parent of a newly added card and records it.
 *
 * The card's sentences are looked up in redis. If they match existing cards, the card
 * takes their parent; when the matches have different parents, those groups are merged
 * under the numerically smallest parent id. With no usable sentences or no matches the
 * card becomes its own parent.
 *
 * @returns The promise returned by updateParents.
 */
async function setParent({ text, id }: DedupTask) {
  let parent = id;
  const updates = [id.toString()];

  const sentences = getSentences(text);
  if (!sentences?.length) return updateParents(updates, parent);

  const existing = await Promise.all(sentences.map(Sentence.get));
  const matching = filter(await getMatching(existing));

  // Started without being awaited, like the sentence writes below
  Info.set(id, 'l', sentences.length);
  if (matching.length) {
    // Get the parents of all the matches as numbers and make sure they are unique.
    // Converting before comparing matters: lodash `min` on the raw strings compares
    // lexicographically, so '10' used to be picked over '9'.
    const parentValues = await Promise.all(matching.map((card) => Info.get(+card, 'p')));
    const matchParents = uniq(parentValues.map(Number));

    // If all matches have the same parent just set as parent
    if (matchParents.length === 1) parent = matchParents[0];
    else {
      // In rare case multiple different parents were matched, merge cards and update parents
      parent = +min(matchParents);

      await Promise.all(
        matchParents
          .filter((card) => card !== parent)
          .map(async (card) => {
            const children = await getChildren(card.toString());
            updates.push(...children);
          }),
      );
    }
  }

  // Commands will be sent in order so don't need to wait for responses
  sentences.forEach((sentence) => Sentence.set(sentence, parent));
  return updateParents(uniq(updates), parent);
}
|
||
// Every added card is queued for deduplication
onAddEvidence.on((data) => evidenceQueue.enqueue(data));

/**
 * Processes queued cards one at a time.
 * When the queue is empty it polls again after a second; otherwise it runs setParent on
 * the next task, passes the result to the task's callback and then moves on.
 */
const drain = () => {
  // TODO: Add chunks of unduplicated cards from db if queue is empty
  if (evidenceQueue.size() === 0) {
    setTimeout(drain, 1000);
    return;
  }

  // Doesn't actually wait for parent to be set, just till commands are sent
  const task = evidenceQueue.dequeue();
  setParent(task)
    .then(task.callback)
    // A failing task (or callback) previously left an unhandled rejection and drain was
    // never scheduled again, so the queue stalled for good. Log it and keep going.
    // The callback is still only invoked on success.
    .catch((err: unknown) => console.error(`Deduplication task for card ${task.id} failed`, err))
    .then(drain);
};

drain();
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -512,6 +512,41 @@ | |
call-me-maybe "^1.0.1" | ||
glob-to-regexp "^0.3.0" | ||
|
||
"@node-redis/[email protected]": | ||
version "1.0.1" | ||
resolved "https://registry.yarnpkg.com/@node-redis/bloom/-/bloom-1.0.1.tgz#144474a0b7dc4a4b91badea2cfa9538ce0a1854e" | ||
integrity sha512-mXEBvEIgF4tUzdIN89LiYsbi6//EdpFA7L8M+DHCvePXg+bfHWi+ct5VI6nHUFQE5+ohm/9wmgihCH3HSkeKsw== | ||
|
||
"@node-redis/[email protected]": | ||
version "1.0.4" | ||
resolved "https://registry.yarnpkg.com/@node-redis/client/-/client-1.0.4.tgz#fe185750df3bcc07524f63fe8dbc8d14d22d6cbb" | ||
integrity sha512-IM/NRAqg7MvNC3bIRQipXGrEarunrdgvrbAzsd3ty93LSHi/M+ybQulOERQi8a3M+P5BL8HenwXjiIoKm6ml2g== | ||
dependencies: | ||
cluster-key-slot "1.1.0" | ||
generic-pool "3.8.2" | ||
redis-parser "3.0.0" | ||
yallist "4.0.0" | ||
|
||
"@node-redis/[email protected]": | ||
version "1.0.0" | ||
resolved "https://registry.yarnpkg.com/@node-redis/graph/-/graph-1.0.0.tgz#baf8eaac4a400f86ea04d65ec3d65715fd7951ab" | ||
integrity sha512-mRSo8jEGC0cf+Rm7q8mWMKKKqkn6EAnA9IA2S3JvUv/gaWW/73vil7GLNwion2ihTptAm05I9LkepzfIXUKX5g== | ||
|
||
"@node-redis/[email protected]": | ||
version "1.0.2" | ||
resolved "https://registry.yarnpkg.com/@node-redis/json/-/json-1.0.2.tgz#8ad2d0f026698dc1a4238cc3d1eb099a3bee5ab8" | ||
integrity sha512-qVRgn8WfG46QQ08CghSbY4VhHFgaTY71WjpwRBGEuqGPfWwfRcIf3OqSpR7Q/45X+v3xd8mvYjywqh0wqJ8T+g== | ||
|
||
"@node-redis/[email protected]": | ||
version "1.0.3" | ||
resolved "https://registry.yarnpkg.com/@node-redis/search/-/search-1.0.3.tgz#7c3d026bf994caf82019fd0c3924cfc09f041a29" | ||
integrity sha512-rsrzkGWI84di/uYtEctS/4qLusWt0DESx/psjfB0TFpORDhe7JfC0h8ary+eHulTksumor244bXLRSqQXbFJmw== | ||
|
||
"@node-redis/[email protected]": | ||
version "1.0.2" | ||
resolved "https://registry.yarnpkg.com/@node-redis/time-series/-/time-series-1.0.2.tgz#5dd3638374edd85ebe0aa6b0e87addc88fb9df69" | ||
integrity sha512-HGQ8YooJ8Mx7l28tD7XjtB3ImLEjlUxG1wC1PAjxu6hPJqjPshUZxAICzDqDjtIbhDTf48WXXUcx8TQJB1XTKA== | ||
|
||
"@nodelib/fs.stat@^1.1.2": | ||
version "1.1.3" | ||
resolved "https://registry.yarnpkg.com/@nodelib/fs.stat/-/fs.stat-1.1.3.tgz#2b5a3ab3f918cca48a8c754c08168e3f03eba61b" | ||
|
@@ -1866,6 +1901,11 @@ cliui@^6.0.0: | |
strip-ansi "^6.0.0" | ||
wrap-ansi "^6.2.0" | ||
|
||
[email protected]: | ||
version "1.1.0" | ||
resolved "https://registry.yarnpkg.com/cluster-key-slot/-/cluster-key-slot-1.1.0.tgz#30474b2a981fb12172695833052bc0d01336d10d" | ||
integrity sha512-2Nii8p3RwAPiFwsnZvukotvow2rIHM+yQ6ZcBXGHdniadkYGZYiGmkHJIbZPIV9nfv7m/U1IPMVVcAhoWFeklw== | ||
|
||
co@^4.6.0: | ||
version "4.6.0" | ||
resolved "https://registry.yarnpkg.com/co/-/co-4.6.0.tgz#6ea6bdf3d853ae54ccb8e47bfa0bf3f9031fb184" | ||
|
@@ -2938,6 +2978,11 @@ gauge@~2.7.3: | |
strip-ansi "^3.0.1" | ||
wide-align "^1.1.0" | ||
|
||
[email protected]: | ||
version "3.8.2" | ||
resolved "https://registry.yarnpkg.com/generic-pool/-/generic-pool-3.8.2.tgz#aab4f280adb522fdfbdc5e5b64d718d3683f04e9" | ||
integrity sha512-nGToKy6p3PAbYQ7p1UlWl6vSPwfwU6TMSWK7TTu+WUY4ZjyZQGniGGt2oNVvyNSpyZYSB43zMXVLcBm08MTMkg== | ||
|
||
gensync@^1.0.0-beta.2: | ||
version "1.0.0-beta.2" | ||
resolved "https://registry.yarnpkg.com/gensync/-/gensync-1.0.0-beta.2.tgz#32a6ee76c3d7f52d46b2b1ae5d93fea8580a25e0" | ||
|
@@ -5278,6 +5323,30 @@ redent@^1.0.0: | |
indent-string "^2.1.0" | ||
strip-indent "^1.0.1" | ||
|
||
redis-errors@^1.0.0: | ||
version "1.2.0" | ||
resolved "https://registry.yarnpkg.com/redis-errors/-/redis-errors-1.2.0.tgz#eb62d2adb15e4eaf4610c04afe1529384250abad" | ||
integrity sha1-62LSrbFeTq9GEMBK/hUpOEJQq60= | ||
|
||
[email protected]: | ||
version "3.0.0" | ||
resolved "https://registry.yarnpkg.com/redis-parser/-/redis-parser-3.0.0.tgz#b66d828cdcafe6b4b8a428a7def4c6bcac31c8b4" | ||
integrity sha1-tm2CjNyv5rS4pCin3vTGvKwxyLQ= | ||
dependencies: | ||
redis-errors "^1.0.0" | ||
|
||
redis@^4.0.4: | ||
version "4.0.4" | ||
resolved "https://registry.yarnpkg.com/redis/-/redis-4.0.4.tgz#b567f82f59086df38433982f7f424b48e924ec7a" | ||
integrity sha512-KaM1OAj/nGrSeybmmOWSMY0LXTGT6FVWgUZZrd2MYzXKJ+VGtqVaciGQeNMfZiQX+kDM8Ke4uttb54m2rm6V0A== | ||
dependencies: | ||
"@node-redis/bloom" "1.0.1" | ||
"@node-redis/client" "1.0.4" | ||
"@node-redis/graph" "1.0.0" | ||
"@node-redis/json" "1.0.2" | ||
"@node-redis/search" "1.0.3" | ||
"@node-redis/time-series" "1.0.2" | ||
|
||
regenerate@^1.2.1: | ||
version "1.4.2" | ||
resolved "https://registry.yarnpkg.com/regenerate/-/regenerate-1.4.2.tgz#b9346d8827e8f5a32f7ba29637d398b69014848a" | ||
|
@@ -6533,16 +6602,16 @@ y18n@^4.0.0: | |
resolved "https://registry.yarnpkg.com/y18n/-/y18n-4.0.3.tgz#b5f259c82cd6e336921efd7bfd8bf560de9eeedf" | ||
integrity sha512-JKhqTOwSrqNA1NY5lSztJ1GrBiUodLMmIZuLiDaMRJ+itFd+ABVE8XBjOvIWL+rSqNDC74LCSFmlb/U4UZ4hJQ== | ||
|
||
[email protected], yallist@^4.0.0: | ||
version "4.0.0" | ||
resolved "https://registry.yarnpkg.com/yallist/-/yallist-4.0.0.tgz#9bb92790d9c0effec63be73519e11a35019a3a72" | ||
integrity sha512-3wdGidZyq5PB084XLES5TpOSRA3wjXAlIWMhum2kRcv/41Sn2emQ0dycQW4uZXLejwKvg6EsvbdlVL+FYEct7A== | ||
|
||
yallist@^3.0.0, yallist@^3.0.3: | ||
version "3.1.1" | ||
resolved "https://registry.yarnpkg.com/yallist/-/yallist-3.1.1.tgz#dbb7daf9bfd8bac9ab45ebf602b8cbad0d5d08fd" | ||
integrity sha512-a4UGQaWPH59mOXUYnAG2ewncQS4i4F43Tv3JoAM+s2VDAmS9NsK8GpDMLrCHPksFT7h3K6TOoUNn2pb7RoXx4g== | ||
|
||
yallist@^4.0.0: | ||
version "4.0.0" | ||
resolved "https://registry.yarnpkg.com/yallist/-/yallist-4.0.0.tgz#9bb92790d9c0effec63be73519e11a35019a3a72" | ||
integrity sha512-3wdGidZyq5PB084XLES5TpOSRA3wjXAlIWMhum2kRcv/41Sn2emQ0dycQW4uZXLejwKvg6EsvbdlVL+FYEct7A== | ||
|
||
yargs-parser@^18.1.2: | ||
version "18.1.3" | ||
resolved "https://registry.yarnpkg.com/yargs-parser/-/yargs-parser-18.1.3.tgz#be68c4975c6b2abf469236b0c870362fab09a7b0" | ||
|