From fb0aab8691c1f4165df42795644ba6c11ad4f085 Mon Sep 17 00:00:00 2001 From: D0ugins Date: Tue, 22 Mar 2022 21:57:02 -0500 Subject: [PATCH 1/2] Parser improvements Switch to using the htmlparser2 library cheerio uses under the hood for parsing. Around 3 times faster, handles links properly, and the code is probably simpler Add full cite field to database --- package.json | 7 +- prisma/schema.prisma | 1 + src/actions/generateFile.ts | 8 +- src/constants/index.ts | 1 + src/lib/debate-tools/document.ts | 4 +- src/lib/debate-tools/markup.ts | 134 ++++++++++----------- src/lib/debate-tools/parse.ts | 6 +- src/modules/parser/index.ts | 12 +- yarn.lock | 194 ++++++++++++++++++++++++++----- 9 files changed, 247 insertions(+), 120 deletions(-) diff --git a/package.json b/package.json index 18e5d88..3effa8d 100644 --- a/package.json +++ b/package.json @@ -14,14 +14,15 @@ "cheerio": "1.0.0-rc.3", "docx": "^6.0.3", "dotenv": "^6.2.0", + "htmlparser2": "^7.2.0", "lodash": "^4.17.15", - "unzipper": "^0.10.11" - "sqlite3": "^5.0.0", "mammoth": "^1.4.19", "node-pandoc-promise": "^0.0.6", "p-ratelimit": "^1.0.1", + "sqlite3": "^5.0.0", "tmp-promise": "^3.0.2", - "typescript-collections": "^1.3.3" + "typescript-collections": "^1.3.3", + "unzipper": "^0.10.11" }, "devDependencies": { "@types/cheerio": "^0.22.21", diff --git a/prisma/schema.prisma b/prisma/schema.prisma index 4f0ee35..0013664 100644 --- a/prisma/schema.prisma +++ b/prisma/schema.prisma @@ -15,6 +15,7 @@ model Evidence { tag String cite String? + fullcite String? summary String? spoken String? fulltext String? 
diff --git a/src/actions/generateFile.ts b/src/actions/generateFile.ts index b3c8d51..7e584aa 100644 --- a/src/actions/generateFile.ts +++ b/src/actions/generateFile.ts @@ -37,13 +37,9 @@ const flattenLevel = (data: Evidence[], level: number): TextBlock[] => { }; export default async (ids: number[], keepHeadings: boolean): Promise => { - let evidence = await db.evidence.findMany({ - where: { - id: { in: ids }, - }, - }); + const evidence = await db.evidence.findMany({ where: { id: { in: ids } } }); - let tokens: TextBlock[] = flattenLevel(evidence, keepHeadings ? 1 : 4); + const tokens: TextBlock[] = flattenLevel(evidence, keepHeadings ? 1 : 4); onGenerateFile.emit({ ids }); return await tokensToDocument(tokens); }; diff --git a/src/constants/index.ts b/src/constants/index.ts index e69de29..bc97409 100644 --- a/src/constants/index.ts +++ b/src/constants/index.ts @@ -0,0 +1 @@ +export const CONCURRENT_PARSERS = 10; diff --git a/src/lib/debate-tools/document.ts b/src/lib/debate-tools/document.ts index 3a09c50..ed55ddc 100644 --- a/src/lib/debate-tools/document.ts +++ b/src/lib/debate-tools/document.ts @@ -10,7 +10,7 @@ import fs from 'fs'; export const documentToTokens = async (filepath: string): Promise => { const document = await loadXml(filepath, /document\.xml$/); const styles = await loadXml(filepath, /styles\.xml$/); - const tokens = markupToTokens(document, styles, { simplified: true }); + const tokens = await markupToTokens(document, styles, { simplified: true }); return tokens; }; @@ -31,7 +31,9 @@ const loadXml = (path: string, file: RegExp): Promise => { let data = ''; const stream = fs.createReadStream(path); stream + .on('error', console.error) .pipe(ParseOne(file)) + .on('error', console.error) .on('data', (chunk) => (data += chunk)) .on('end', () => resolve(data)); }); diff --git a/src/lib/debate-tools/markup.ts b/src/lib/debate-tools/markup.ts index 69e111e..b6eb834 100644 --- a/src/lib/debate-tools/markup.ts +++ b/src/lib/debate-tools/markup.ts @@ 
-1,17 +1,9 @@ -import ch from 'cheerio'; -import { - TextBlock, - getStyleNameByXml, - TokenStyle, - SectionStyleName, - simplifyTokens, - tokensToDocument, - getOutlineLvlName, - getStyleNameByOutlineLvl, -} from './'; +import { TextBlock, TokenStyle, TextToken, simplifyTokens, tokensToDocument } from './tokens'; +import { getStyleNameByXml, getOutlineLvlName } from './styles'; +import { Parser as XmlParser } from 'htmlparser2'; export const markupToDocument = async (xml: string, styles: string): Promise => { - const tokens = markupToTokens(xml, styles, { simplified: true }); + const tokens = await markupToTokens(xml, styles, { simplified: true }); const buffer = await tokensToDocument(tokens); return buffer; }; @@ -20,71 +12,69 @@ interface TokensOption { simplified: boolean; } -export const markupToTokens = (document: string, styles: string, options?: TokensOption): TextBlock[] => { - const blocks = tokenize(document, styles); +export async function markupToTokens(document: string, styles: string, options?: TokensOption): Promise { + const blocks = await tokenize(document, styles); if (options?.simplified) { - const simplifiedBlocks = blocks.map((block) => simplifyTokens(block)); + const simplifiedBlocks = blocks.map(simplifyTokens); return simplifiedBlocks; } return blocks; -}; - -const getChild = (el, names: string[]) => - names.reduce((acc, name) => { - return acc?.children?.find((child) => child.name === name); - }, el); - -// Extract what formatting applies to block of text -const updateElFormating = (styleEl, current?: TokenStyle): TokenStyle => { - const formatting: TokenStyle = current ? 
{ ...current } : { underline: false, strong: false, mark: false }; - const styles = getChild(styleEl, ['w:rPr']); - if (!styles) return formatting; - - const highlight = getChild(styles, ['w:highlight']); - const bold = getChild(styles, ['w:b']); - const underline = getChild(styles, ['w:u'])?.attribs['w:val']; - - if (highlight) formatting.mark = true; - if (bold) formatting.strong = bold.attribs['w:val'] !== '0'; - if (underline) formatting.underline = underline !== 'none'; - - return formatting; -}; - -const getBlockFormat = (block): SectionStyleName => { - const stlyeNameFormat = getStyleNameByXml(getChild(block, ['w:pPr', 'w:pStyle'])?.attribs['w:val']); - if (stlyeNameFormat !== 'text') return stlyeNameFormat; +} - // Sometimes uses outline level instead of header - const outlineLvl = getChild(block, ['w:pPr', 'w:outlineLvl'])?.attribs['w:val']; - return getOutlineLvlName(parseInt(outlineLvl) + 1); +const handleStyleTag = (name: string, attribs: Record, styles: TokenStyle) => { + if (name === 'w:u') styles.underline = attribs['w:val'] !== 'none'; + else if (name === 'w:highlight') styles.mark = true; + else if (name === 'w:b') styles.strong = attribs['w:val'] !== '0'; }; -const tokenize = (xml: string, styles: string): TextBlock[] => { - const s = ch.load(styles, { xmlMode: true }); - const d = ch.load(xml, { xmlMode: true }); +const parseStyles = (styles: string): Promise> => + new Promise((resolve, reject) => { + const parsedStyles: Record = {}; + let styleName = ''; + new XmlParser( + { + onopentag(name, attribs) { + if (name === 'w:style') { + styleName = attribs['w:styleId']; + parsedStyles[styleName] = { underline: false, strong: false, mark: false }; + } else if (styleName) handleStyleTag(name, attribs, parsedStyles[styleName]); + }, + onend: () => resolve(parsedStyles), + onerror: reject, + }, + { xmlMode: true }, + ).parseComplete(styles); + }); - // Generate map of style names to formatting from styles.xml - const xmlStyles: Record = s('w\\:style') - 
.get() - .reduce((acc, node) => { - acc[node.attribs['w:styleId']] = updateElFormating(node); - return acc; - }, {}); - - const tokens: TextBlock[] = d('w\\:p') - .get() - .map((block) => ({ - format: getBlockFormat(block), - tokens: ch(block) - .children('w\\:r') - .get() - .map((node) => ({ - text: ch(node).text(), - // combine formatting defined in text block and formatting from style name - format: updateElFormating(node, xmlStyles[getChild(node, ['w:rPr', 'w:rStyle'])?.attribs['w:val']]), - })), - })); - - return tokens; -}; +const tokenize = (xml: string, styles: string): Promise => + new Promise((resolve, reject) => { + parseStyles(styles).then((styleData) => { + const blocks: TextBlock[] = []; + let block: TextBlock; + let token: TextToken; + new XmlParser( + { + onopentag(name, attribs) { + if (name === 'w:p') block = { format: 'text', tokens: [] }; + else if (name === 'w:pStyle') block.format = getStyleNameByXml(attribs['w:val']); + else if (name === 'w:outlineLvl') block.format = getOutlineLvlName(+attribs['w:val'] + 1); + else if (name === 'w:r') token = { text: '', format: { underline: false, strong: false, mark: false } }; + else if (token) { + if (name === 'w:rStyle') token.format = { ...styleData[attribs['w:val']] }; + else handleStyleTag(name, attribs, token.format); + } + }, + ontext(data) { + if (token) token.text += data; + }, + onclosetag(name) { + if (name === 'w:p' && block.tokens.length) blocks.push(block); + else if (name === 'w:r' && token.text) block.tokens.push(token); + }, + onend: () => resolve(blocks), + onerror: reject, + }, + { xmlMode: true }, + ).parseComplete(xml); + }); + }); diff --git a/src/lib/debate-tools/parse.ts b/src/lib/debate-tools/parse.ts index a1174ce..e51dc6a 100644 --- a/src/lib/debate-tools/parse.ts +++ b/src/lib/debate-tools/parse.ts @@ -24,10 +24,7 @@ export const getIndexesWith = (blocks: TextBlock[], styles: StyleName[]): number }; const getLastBlockWith = (blocks: TextBlock[], anchor: number, styles: 
StyleName[]): TextBlock => { - let ret; - const range = [...Array(anchor).keys()]; - range.forEach((idx) => (ret = styles.includes(blocks[idx].format) ? blocks[idx] : ret)); - return ret; + for (let i = anchor; i >= 0; i--) if (styles.includes(blocks[i].format)) return blocks[i]; }; export const getBlocksUntil = (blocks: TextBlock[], anchor: number, styles: StyleName[]): TextBlock[] => { @@ -51,6 +48,7 @@ const parseCard = (doc: TextBlock[], anchor = 0, idx: number) => { return { tag: extractText([tag]), cite: extractText([cite], ['strong']), + fullcite: extractText([cite]), pocket: extractHeading('pocket'), hat: extractHeading('hat'), block: extractHeading('block'), diff --git a/src/modules/parser/index.ts b/src/modules/parser/index.ts index 0f7b2cd..e7d5d4c 100644 --- a/src/modules/parser/index.ts +++ b/src/modules/parser/index.ts @@ -1,6 +1,7 @@ import addEvidence from 'app/actions/addEvidence'; import { onAddFile } from 'app/actions/addFile'; import { db, pipe } from 'app/lib'; +import { CONCURRENT_PARSERS } from 'app/constants'; import { documentToTokens, extractCards, makeChildId } from 'app/lib/debate-tools'; import { Queue } from 'typescript-collections'; @@ -9,7 +10,7 @@ const fileQueue = new Queue(); (async () => { const pending = await db.file.findMany({ where: { status: { equals: 'PENDING' } } }); pending.forEach((file) => fileQueue.add(file.gid)); - drain(); + for (let i = 0; i < CONCURRENT_PARSERS; i++) drain(); })(); const parseFile = async (gid: string) => { @@ -21,15 +22,10 @@ const parseFile = async (gid: string) => { extractCards, )(gid); - for (const i in cards) - try { - await addEvidence({ ...cards[i], gid: makeChildId(gid, +i), file: { gid } }); - } catch (e) { - console.error(e); - } - + await Promise.all(cards.map((card, i) => addEvidence({ ...card, gid: makeChildId(gid, +i), file: { gid } }))); await db.file.update({ where: { gid }, data: { status: 'PROCESSED' } }); } catch (e) { + console.error(e); await db.file.update({ where: { gid }, 
data: { status: 'ERROR' } }); } }; diff --git a/yarn.lock b/yarn.lock index 4f33c98..cd63601 100644 --- a/yarn.lock +++ b/yarn.lock @@ -517,22 +517,22 @@ resolved "https://registry.yarnpkg.com/@nodelib/fs.stat/-/fs.stat-1.1.3.tgz#2b5a3ab3f918cca48a8c754c08168e3f03eba61b" integrity sha512-shAmDyaQC4H92APFoIaVDHCx5bStIocgvbwQyxPRrbUY20V1EYTbSDchWbuwlMG3V17cprZhA6+78JfB+3DTPw== -"@prisma/client@^2.22.1": - version "2.22.1" - resolved "https://registry.yarnpkg.com/@prisma/client/-/client-2.22.1.tgz#10fdcd1532a6baf46dd1c464cad9a54af0532bc8" - integrity sha512-JQjbsY6QSfFiovXHEp5WeJHa5p2CuR1ZFPAeYXmUsOAQOaMCrhgQmKAL6w2Q3SRA7ALqPjrKywN9/QfBc4Kp1A== +"@prisma/client@^3.8.1": + version "3.11.1" + resolved "https://registry.yarnpkg.com/@prisma/client/-/client-3.11.1.tgz#bde6dec71ae133d04ce1c6658e3d76627a3c6dc7" + integrity sha512-B3C7zQG4HbjJzUr2Zg9UVkBJutbqq9/uqkl1S138+keZCubJrwizx3RuIvGwI+s+pm3qbsyNqXiZgL3Ir0fSng== dependencies: - "@prisma/engines-version" "2.22.0-21.60cc71d884972ab4e897f0277c4b84383dddaf6c" + "@prisma/engines-version" "3.11.1-1.1a2506facaf1a4727b7c26850735e88ec779dee9" -"@prisma/engines-version@2.22.0-21.60cc71d884972ab4e897f0277c4b84383dddaf6c": - version "2.22.0-21.60cc71d884972ab4e897f0277c4b84383dddaf6c" - resolved "https://registry.yarnpkg.com/@prisma/engines-version/-/engines-version-2.22.0-21.60cc71d884972ab4e897f0277c4b84383dddaf6c.tgz#e98ee17217a0ebb54f2f9314fbbfd610b05e6e31" - integrity sha512-OkkVwk6iTzTbwwl8JIKAENyxmh4TFORal55QMKQzrHEY8UzbD0M90mQnmziz3PAopQUZgTFFMlaPAq1WNrLMtA== +"@prisma/engines-version@3.11.1-1.1a2506facaf1a4727b7c26850735e88ec779dee9": + version "3.11.1-1.1a2506facaf1a4727b7c26850735e88ec779dee9" + resolved "https://registry.yarnpkg.com/@prisma/engines-version/-/engines-version-3.11.1-1.1a2506facaf1a4727b7c26850735e88ec779dee9.tgz#81a1835b495ad287ad7824dbd62f74e9eee90fb9" + integrity sha512-HkcsDniA4iNb/gi0iuyOJNAM7nD/LwQ0uJm15v360O5dee3TM4lWdSQiTYBMK6FF68ACUItmzSur7oYuUZ2zkQ== 
-"@prisma/engines@2.22.0-21.60cc71d884972ab4e897f0277c4b84383dddaf6c": - version "2.22.0-21.60cc71d884972ab4e897f0277c4b84383dddaf6c" - resolved "https://registry.yarnpkg.com/@prisma/engines/-/engines-2.22.0-21.60cc71d884972ab4e897f0277c4b84383dddaf6c.tgz#4ccd255e0823605db3d8387a5195b6fdabe3b0c0" - integrity sha512-KmWdogrsfsSLYvfqY3cS3QcDGzaEFklE+T6dNJf+k/KPQum4A29IwDalafMwh5cMN8ivZobUbowNSwWJrMT08Q== +"@prisma/engines@3.11.1-1.1a2506facaf1a4727b7c26850735e88ec779dee9": + version "3.11.1-1.1a2506facaf1a4727b7c26850735e88ec779dee9" + resolved "https://registry.yarnpkg.com/@prisma/engines/-/engines-3.11.1-1.1a2506facaf1a4727b7c26850735e88ec779dee9.tgz#09ac23f8f615a8586d8d44538060ada199fe872c" + integrity sha512-MILbsGnvmnhCbFGa2/iSnsyGyazU3afzD7ldjCIeLIGKkNBMSZgA2IvpYsAXl+6qFHKGrS3B2otKfV31dwMSQw== "@sinonjs/commons@^1.7.0": version "1.8.3" @@ -937,7 +937,7 @@ arg@^4.1.0: resolved "https://registry.yarnpkg.com/arg/-/arg-4.1.3.tgz#269fc7ad5b8e42cb63c896d5666017261c144089" integrity sha512-58S9QDqG0Xx27YwPSt9fJxivjYl432YCwfDMfZ+71RAqUrZef7LrKQZ3LHLOwCS4FLNBplP533Zx895SeOCHvA== -argparse@^1.0.7: +argparse@^1.0.7, argparse@~1.0.3: version "1.0.10" resolved "https://registry.yarnpkg.com/argparse/-/argparse-1.0.10.tgz#bcd6791ea5ae09725e17e5ad988134cd40b3d911" integrity sha512-o5Roy6tNG4SL/FOkCAN6RzjiakZS25RLYFrcMttJqbdd8BWrnA+fGz57iN5Pb06pvBGvl5gQ0B48dJlslXvoTg== @@ -1613,7 +1613,7 @@ block-stream@*: dependencies: inherits "~2.0.0" -bluebird@~3.4.1: +bluebird@~3.4.0, bluebird@~3.4.1: version "3.4.7" resolved "https://registry.yarnpkg.com/bluebird/-/bluebird-3.4.7.tgz#f72d760be09b7f76d08ed8fae98b289a8d05fab3" integrity sha1-9y12C+Cbf3bQjtj66Ysomo0F+rM= @@ -2185,6 +2185,11 @@ diff@^4.0.1: resolved "https://registry.yarnpkg.com/diff/-/diff-4.0.2.tgz#60f3aecb89d5fae520c11aa19efc2bb982aade7d" integrity sha512-58lmxKSA4BNyLz+HHMUzlOEpg09FV+ev6ZMe3vJihgdxzgcwZ8VoEEPmALCZG9LmqfVoNMMKpttIYTVG6uDY7A== +dingbat-to-unicode@^1.0.1: + version "1.0.1" + resolved 
"https://registry.yarnpkg.com/dingbat-to-unicode/-/dingbat-to-unicode-1.0.1.tgz#5091dd673241453e6b5865e26e5a4452cdef5c83" + integrity sha512-98l0sW87ZT58pU4i61wa2OHwxbiYSbuxsCBozaVnYX2iCnr3bLM3fIes1/ej7h1YdOKuKt/MLs706TVnALA65w== + dir-glob@^2.2.2: version "2.2.2" resolved "https://registry.yarnpkg.com/dir-glob/-/dir-glob-2.2.2.tgz#fa09f0694153c8918b18ba0deafae94769fc50c4" @@ -2226,6 +2231,15 @@ dom-serializer@0: domelementtype "^2.0.1" entities "^2.0.0" +dom-serializer@^1.0.1: + version "1.3.2" + resolved "https://registry.yarnpkg.com/dom-serializer/-/dom-serializer-1.3.2.tgz#6206437d32ceefaec7161803230c7a20bc1b4d91" + integrity sha512-5c54Bk5Dw4qAxNOI1pFEizPSjVsx5+bpJKmL2kPn8JhBUq2q09tTCa3mjijun2NfK78NMouDYNMBkOrPZiS+ig== + dependencies: + domelementtype "^2.0.1" + domhandler "^4.2.0" + entities "^2.0.0" + dom-serializer@~0.1.1: version "0.1.1" resolved "https://registry.yarnpkg.com/dom-serializer/-/dom-serializer-0.1.1.tgz#1ec4059e284babed36eec2941d4a970a189ce7c0" @@ -2239,7 +2253,7 @@ domelementtype@1, domelementtype@^1.3.0, domelementtype@^1.3.1: resolved "https://registry.yarnpkg.com/domelementtype/-/domelementtype-1.3.1.tgz#d048c44b37b0d10a7f2a3d5fee3f4333d790481f" integrity sha512-BSKB+TSpMpFI/HOxCNr1O8aMOTZ8hT3pM3GQ0w/mWRmkhEDSFJkkyzz4XQsBV44BChwGkrDfMyjVD0eA2aFV3w== -domelementtype@^2.0.1: +domelementtype@^2.0.1, domelementtype@^2.2.0: version "2.2.0" resolved "https://registry.yarnpkg.com/domelementtype/-/domelementtype-2.2.0.tgz#9a0b6c2782ed6a1c7323d42267183df9bd8b1d57" integrity sha512-DtBMo82pv1dFtUmHyr48beiuq792Sxohr+8Hm9zoxklYPfa6n0Z3Byjj2IV7bmr2IyqClnqEQhfgHJJ5QF0R5A== @@ -2258,6 +2272,13 @@ domhandler@^2.3.0: dependencies: domelementtype "1" +domhandler@^4.2.0, domhandler@^4.2.2: + version "4.3.1" + resolved "https://registry.yarnpkg.com/domhandler/-/domhandler-4.3.1.tgz#8d792033416f59d68bc03a5aa7b018c1ca89279c" + integrity sha512-GrwoxYN+uWlzO8uhUXRl0P+kHE4GtVPfYzVLcUxPL7KNdHKj66vvlhiweIHqYYXWlw+T8iLMp42Lm67ghw4WMQ== + dependencies: + 
domelementtype "^2.2.0" + domutils@1.5.1: version "1.5.1" resolved "https://registry.yarnpkg.com/domutils/-/domutils-1.5.1.tgz#dcd8488a26f563d61079e48c9f7b7e32373682cf" @@ -2274,11 +2295,27 @@ domutils@^1.5.1: dom-serializer "0" domelementtype "1" +domutils@^2.8.0: + version "2.8.0" + resolved "https://registry.yarnpkg.com/domutils/-/domutils-2.8.0.tgz#4437def5db6e2d1f5d6ee859bd95ca7d02048135" + integrity sha512-w96Cjofp72M5IIhpjgobBimYEfoPjx1Vx0BSX9P30WBdZW2WIKU0T1Bd0kz2eNZ9ikjKgHbEyKx8BB6H1L3h3A== + dependencies: + dom-serializer "^1.0.1" + domelementtype "^2.2.0" + domhandler "^4.2.0" + dotenv@^6.2.0: version "6.2.0" resolved "https://registry.yarnpkg.com/dotenv/-/dotenv-6.2.0.tgz#941c0410535d942c8becf28d3f357dbd9d476064" integrity sha512-HygQCKUBSFl8wKQZBSemMywRWcEDNidvNbjGVyZu3nbZ8qq9ubiPoGLMdRDpfSrpkkm9BXYFkpKxxFX38o/76w== +duck@^0.1.12: + version "0.1.12" + resolved "https://registry.yarnpkg.com/duck/-/duck-0.1.12.tgz#de7adf758421230b6d7aee799ce42670586b9efa" + integrity sha512-wkctla1O6VfP89gQ+J/yDesM0S7B7XLXjKGzXxMDVFg7uEn706niAtyYovKbyq1oT9YwDcly721/iUWoc8MVRg== + dependencies: + underscore "^1.13.1" + duplexer2@~0.1.4: version "0.1.4" resolved "https://registry.yarnpkg.com/duplexer2/-/duplexer2-0.1.4.tgz#8b12dab878c0d69e3e7891051662a32fc6bddcc1" @@ -2340,6 +2377,11 @@ entities@^2.0.0: resolved "https://registry.yarnpkg.com/entities/-/entities-2.2.0.tgz#098dc90ebb83d8dffa089d55256b351d34c4da55" integrity sha512-p92if5Nz619I0w+akJrLZH0MX0Pb5DX39XOwQTtXSdQQOaYH03S1uIQp4mhOZtAXrxq4ViO67YTiLBo2638o9A== +entities@^3.0.1: + version "3.0.1" + resolved "https://registry.yarnpkg.com/entities/-/entities-3.0.1.tgz#2b887ca62585e96db3903482d336c1006c3001d4" + integrity sha512-WiyBqoomrwMdFG1e0kqvASYfnlb0lp8M5o5Fw2OFq1hNZxxcNk8Ik0Xm7LxzBhuidnZB/UtBqVCgUz3kBOP51Q== + error-ex@^1.2.0, error-ex@^1.3.1: version "1.3.2" resolved "https://registry.yarnpkg.com/error-ex/-/error-ex-1.3.2.tgz#b4ac40648107fdcdcfae242f428bea8a14d4f1bf" @@ -2405,7 +2447,7 @@ escodegen@^2.0.0: 
optionalDependencies: source-map "~0.6.1" -eslint-config-airbnb-base@^14.2.0: +eslint-config-airbnb-base@^14.2.0, eslint-config-airbnb-base@^14.2.1: version "14.2.1" resolved "https://registry.yarnpkg.com/eslint-config-airbnb-base/-/eslint-config-airbnb-base-14.2.1.tgz#8a2eb38455dc5a312550193b319cdaeef042cd1e" integrity sha512-GOrQyDtVEc1Xy20U7vsB2yAoB4nBlfH5HZJeatRXHleO+OS5Ot+MWij4Dpltw4/DyIkqUfqz1epfhVR5XWWQPA== @@ -2414,6 +2456,13 @@ eslint-config-airbnb-base@^14.2.0: object.assign "^4.1.2" object.entries "^1.1.2" +eslint-config-airbnb-typescript@^15.0.0: + version "15.0.0" + resolved "https://registry.yarnpkg.com/eslint-config-airbnb-typescript/-/eslint-config-airbnb-typescript-15.0.0.tgz#c88007b3cca5dd0f47125420ca5e8f6efac418fd" + integrity sha512-DTWGwqytbTnB8kSKtmkrGkRf3xwTs2l15shSH0w/3Img47AQwCCrIA/ON/Uj0XXBxP31LHyEItPXeuH3mqCNLA== + dependencies: + eslint-config-airbnb-base "^14.2.1" + eslint-config-airbnb@18.2.0: version "18.2.0" resolved "https://registry.yarnpkg.com/eslint-config-airbnb/-/eslint-config-airbnb-18.2.0.tgz#8a82168713effce8fc08e10896a63f1235499dcd" @@ -3154,6 +3203,16 @@ htmlparser2@^3.9.1: inherits "^2.0.1" readable-stream "^3.1.1" +htmlparser2@^7.2.0: + version "7.2.0" + resolved "https://registry.yarnpkg.com/htmlparser2/-/htmlparser2-7.2.0.tgz#8817cdea38bbc324392a90b1990908e81a65f5a5" + integrity sha512-H7MImA4MS6cw7nbyURtLPO1Tms7C5H602LRETv95z1MxO/7CP7rDVROehUYeYBUYEON94NXXDEPmZuq+hX4sog== + dependencies: + domelementtype "^2.0.1" + domhandler "^4.2.2" + domutils "^2.8.0" + entities "^3.0.1" + http-signature@~1.2.0: version "1.2.0" resolved "https://registry.yarnpkg.com/http-signature/-/http-signature-1.2.0.tgz#9aecd925114772f3d95b65a60abb8f7c18fbace1" @@ -4078,6 +4137,16 @@ jszip@*, jszip@^3.1.5: readable-stream "~2.3.6" set-immediate-shim "~1.0.1" +jszip@^3.7.1: + version "3.7.1" + resolved "https://registry.yarnpkg.com/jszip/-/jszip-3.7.1.tgz#bd63401221c15625a1228c556ca8a68da6fda3d9" + integrity 
sha512-ghL0tz1XG9ZEmRMcEN2vt7xabrDdqHHeykgARpmZ0BiIctWxM47Vt63ZO2dnp4QYt/xJVLLy5Zv1l/xRdh2byg== + dependencies: + lie "~3.3.0" + pako "~1.0.2" + readable-stream "~2.3.6" + set-immediate-shim "~1.0.1" + kind-of@^3.0.2, kind-of@^3.0.3, kind-of@^3.2.0: version "3.2.2" resolved "https://registry.yarnpkg.com/kind-of/-/kind-of-3.2.2.tgz#31ea21a734bab9bbb0f32466d893aea51e4a3c64" @@ -4211,6 +4280,15 @@ loose-envify@^1.0.0: dependencies: js-tokens "^3.0.0 || ^4.0.0" +lop@^0.4.1: + version "0.4.1" + resolved "https://registry.yarnpkg.com/lop/-/lop-0.4.1.tgz#744f1696ef480e68ce1947fe557b09db5af2a738" + integrity sha512-9xyho9why2A2tzm5aIcMWKvzqKsnxrf9B5I+8O30olh6lQU8PH978LqZoI4++37RBgS1Em5i54v1TFs/3wnmXQ== + dependencies: + duck "^0.1.12" + option "~0.2.1" + underscore "^1.13.1" + loud-rejection@^1.0.0: version "1.6.0" resolved "https://registry.yarnpkg.com/loud-rejection/-/loud-rejection-1.6.0.tgz#5b46f80147edee578870f086d04821cf998e551f" @@ -4245,6 +4323,21 @@ makeerror@1.0.x: dependencies: tmpl "1.0.x" +mammoth@^1.4.19: + version "1.4.21" + resolved "https://registry.yarnpkg.com/mammoth/-/mammoth-1.4.21.tgz#7eb89d80e396181cfa775b7fc34dcad9aa882c46" + integrity sha512-znix1/i7LtfVMAe6mmrxJSoN1HmTASJ48I+PofSHs4vPKCuPF1DDOCVeMgZTYpitzjWTWs9Xhxxi2LBbHpmovg== + dependencies: + argparse "~1.0.3" + bluebird "~3.4.0" + dingbat-to-unicode "^1.0.1" + jszip "^3.7.1" + lop "^0.4.1" + path-is-absolute "^1.0.0" + sax "~1.1.1" + underscore "^1.13.1" + xmlbuilder "^10.0.0" + map-cache@^0.2.2: version "0.2.2" resolved "https://registry.yarnpkg.com/map-cache/-/map-cache-0.2.2.tgz#c32abd0bd6525d9b051645bb4f26ac5dc98a0dbf" @@ -4480,6 +4573,11 @@ node-notifier@^8.0.0: uuid "^8.3.0" which "^2.0.2" +node-pandoc-promise@^0.0.6: + version "0.0.6" + resolved "https://registry.yarnpkg.com/node-pandoc-promise/-/node-pandoc-promise-0.0.6.tgz#8410de8e42ed966d570ea730a3c9f731a4dc7a1d" + integrity sha512-3+8hezyBOaWqUWypHtfv8sPa0Yo0sE/KUvjLN1gm48wrwqG2B45fhgIU73H1adF33FXiXkPeTW3lw5cjicemdg== + 
node-pre-gyp@^0.11.0: version "0.11.0" resolved "https://registry.yarnpkg.com/node-pre-gyp/-/node-pre-gyp-0.11.0.tgz#db1f33215272f692cd38f03238e3e9b47c5dd054" @@ -4687,6 +4785,11 @@ onetime@^5.1.0: dependencies: mimic-fn "^2.1.0" +option@~0.2.1: + version "0.2.4" + resolved "https://registry.yarnpkg.com/option/-/option-0.2.4.tgz#fd475cdf98dcabb3cb397a3ba5284feb45edbfe4" + integrity sha1-/Udc35jcq7PLOXo7pShP60Xtv+Q= + optionator@^0.8.1: version "0.8.3" resolved "https://registry.yarnpkg.com/optionator/-/optionator-0.8.3.tgz#84fa1d036fe9d3c7e21d99884b601167ec8fb495" @@ -4774,6 +4877,11 @@ p-locate@^4.1.0: dependencies: p-limit "^2.2.0" +p-ratelimit@^1.0.1: + version "1.0.1" + resolved "https://registry.yarnpkg.com/p-ratelimit/-/p-ratelimit-1.0.1.tgz#a07fb2419f9811feb99fb687f97e00aa0244ac3e" + integrity sha512-tKBGoow6aWRH68K2eQx+qc1gSegjd5VLirZYc1Yms9pPFsYQ9TFI6aMn0vJH2vmvzjNpjlWZOFft4aPUen2w0A== + p-try@^1.0.0: version "1.0.0" resolved "https://registry.yarnpkg.com/p-try/-/p-try-1.0.0.tgz#cbc79cdbaf8fd4228e13f621f2b1a237c1b207b3" @@ -5012,12 +5120,12 @@ pretty-format@^26.0.0, pretty-format@^26.6.2: ansi-styles "^4.0.0" react-is "^17.0.1" -prisma@2.22.1: - version "2.22.1" - resolved "https://registry.yarnpkg.com/prisma/-/prisma-2.22.1.tgz#884687a90c7b797b34c6110ea413049078c8da6e" - integrity sha512-hwvCM3zyxgSda/+/p+GW7nz93jRebtMU01wAG7YVVnl0OKU+dpw1wPvPFmQRldkZHk8fTCleYmjc24WaSdVPZQ== +prisma@^3.8.1: + version "3.11.1" + resolved "https://registry.yarnpkg.com/prisma/-/prisma-3.11.1.tgz#fff9c0bcf83cb30c2e1d650882d5eb3c5565e028" + integrity sha512-aYn8bQwt1xwR2oSsVNHT4PXU7EhsThIwmpNB/MNUaaMx5OPLTro6VdNJe/sJssXFLxhamfWeMjwmpXjljo6xkg== dependencies: - "@prisma/engines" "2.22.0-21.60cc71d884972ab4e897f0277c4b84383dddaf6c" + "@prisma/engines" "3.11.1-1.1a2506facaf1a4727b7c26850735e88ec779dee9" private@^0.1.6, private@^0.1.8: version "0.1.8" @@ -5403,6 +5511,11 @@ sax@^1.2.4: resolved 
"https://registry.yarnpkg.com/sax/-/sax-1.2.4.tgz#2816234e2378bddc4e5354fab5caa895df7100d9" integrity sha512-NqVDv9TpANUjFm0N8uM5GxL36UgKi9/atZw+x7YFnQ8ckwFGKrl4xX4yWtrey3UJm5nP1kUbnYgLopqWNSRhWw== +sax@~1.1.1: + version "1.1.6" + resolved "https://registry.yarnpkg.com/sax/-/sax-1.1.6.tgz#5d616be8a5e607d54e114afae55b7eaf2fcc3240" + integrity sha1-XWFr6KXmB9VOEUr65Vt+ry/MMkA= + saxes@^5.0.1: version "5.0.1" resolved "https://registry.yarnpkg.com/saxes/-/saxes-5.0.1.tgz#eebab953fa3b7608dbe94e5dadb15c888fa6696d" @@ -5901,6 +6014,20 @@ throat@^5.0.0: resolved "https://registry.yarnpkg.com/throat/-/throat-5.0.0.tgz#c5199235803aad18754a667d659b5e72ce16764b" integrity sha512-fcwX4mndzpLQKBS1DVYhGAcYaYt7vsHNIvQV+WXMvnow5cgjPphq5CaayLaGsjRdSCKZFNGt7/GYAuXaNOiYCA== +tmp-promise@^3.0.2: + version "3.0.3" + resolved "https://registry.yarnpkg.com/tmp-promise/-/tmp-promise-3.0.3.tgz#60a1a1cc98c988674fcbfd23b6e3367bdeac4ce7" + integrity sha512-RwM7MoPojPxsOBYnyd2hy0bxtIlVrihNs9pj5SUvY8Zz1sQcQG2tG1hSr8PDxfgEB8RNKDhqbIlroIarSNDNsQ== + dependencies: + tmp "^0.2.0" + +tmp@^0.2.0: + version "0.2.1" + resolved "https://registry.yarnpkg.com/tmp/-/tmp-0.2.1.tgz#8457fc3037dcf4719c251367a1af6500ee1ccf14" + integrity sha512-76SUhtfqR2Ijn+xllcI5P1oyannHNHByD80W1q447gU3mp9G9PSpGdWmjUOHRDPiHYacIk66W7ubDTuPF3BEtQ== + dependencies: + rimraf "^3.0.0" + tmpl@1.0.x: version "1.0.4" resolved "https://registry.yarnpkg.com/tmpl/-/tmpl-1.0.4.tgz#23640dd7b42d00433911140820e5cf440e521dd1" @@ -6119,10 +6246,15 @@ typedarray-to-buffer@^3.1.5: dependencies: is-typedarray "^1.0.0" -typescript@^4.2.4: - version "4.2.4" - resolved "https://registry.yarnpkg.com/typescript/-/typescript-4.2.4.tgz#8610b59747de028fda898a8aef0e103f156d0961" - integrity sha512-V+evlYHZnQkaz8TRBuxTA92yZBPotr5H+WhQ7bD3hZUndx5tGOa1fuCgeSjxAzM1RiN5IzvadIXTVefuuwZCRg== +typescript-collections@^1.3.3: + version "1.3.3" + resolved 
"https://registry.yarnpkg.com/typescript-collections/-/typescript-collections-1.3.3.tgz#62d50d93c018c094d425eabee649f00ec5cc0fea" + integrity sha512-7sI4e/bZijOzyURng88oOFZCISQPTHozfE2sUu5AviFYk5QV7fYGb6YiDl+vKjF/pICA354JImBImL9XJWUvdQ== + +typescript@^4.5.2: + version "4.6.3" + resolved "https://registry.yarnpkg.com/typescript/-/typescript-4.6.3.tgz#eefeafa6afdd31d725584c67a0eaba80f6fc6c6c" + integrity sha512-yNIatDa5iaofVozS/uQJEl3JRWLKKGJKh6Yaiv0GLGSuhpFJe7P3SbHZ8/yjAHRQwKRoA6YZqlfjXWmVzoVSMw== unbox-primitive@^1.0.0: version "1.0.1" @@ -6134,6 +6266,11 @@ unbox-primitive@^1.0.0: has-symbols "^1.0.2" which-boxed-primitive "^1.0.2" +underscore@^1.13.1: + version "1.13.2" + resolved "https://registry.yarnpkg.com/underscore/-/underscore-1.13.2.tgz#276cea1e8b9722a8dbed0100a407dda572125881" + integrity sha512-ekY1NhRzq0B08g4bGuX4wd2jZx5GnKz6mKSqFL4nqBlfyMGiG10gDFhDTMEfYmDL6Jy0FUIZp7wiRB+0BP7J2g== + union-value@^1.0.0: version "1.0.1" resolved "https://registry.yarnpkg.com/union-value/-/union-value-1.0.1.tgz#0b6fe7b835aecda61c6ea4d4f02c14221e109847" @@ -6376,6 +6513,11 @@ xml@^1.0.1: resolved "https://registry.yarnpkg.com/xml/-/xml-1.0.1.tgz#78ba72020029c5bc87b8a81a3cfcd74b4a2fc1e5" integrity sha1-eLpyAgApxbyHuKgaPPzXS0ovweU= +xmlbuilder@^10.0.0: + version "10.1.1" + resolved "https://registry.yarnpkg.com/xmlbuilder/-/xmlbuilder-10.1.1.tgz#8cae6688cc9b38d850b7c8d3c0a4161dcaf475b0" + integrity sha512-OyzrcFLL/nb6fMGHbiRDuPup9ljBycsdCypwuyg5AAHvyWzGfChJpCXMG88AGTIMFhGZ9RccFN1e6lhg3hkwKg== + xmlchars@^2.2.0: version "2.2.0" resolved "https://registry.yarnpkg.com/xmlchars/-/xmlchars-2.2.0.tgz#060fe1bcb7f9c76fe2a17db86a9bc3ab894210cb" From 23b23ee8ad75618c1959755b5ae4c2b3b3c718f4 Mon Sep 17 00:00:00 2001 From: D0ugins Date: Tue, 29 Mar 2022 11:41:53 -0500 Subject: [PATCH 2/2] Implement card deduplication --- package.json | 3 +- prisma/schema.prisma | 1 + src/actions/addEvidence.ts | 6 ++- src/constants/index.ts | 13 +++++ src/lib/db.ts | 14 ++++-- 
src/lib/debate-tools/duplicate.ts | 71 +++++++++++++++++++++++++++ src/lib/debate-tools/index.ts | 1 + src/modules/deduplicator/index.ts | 62 ++++++++++++++++++++++++ yarn.lock | 79 +++++++++++++++++++++++++++++-- 9 files changed, 239 insertions(+), 11 deletions(-) create mode 100644 src/lib/debate-tools/duplicate.ts create mode 100644 src/modules/deduplicator/index.ts diff --git a/package.json b/package.json index 3effa8d..c1339cf 100644 --- a/package.json +++ b/package.json @@ -19,6 +19,7 @@ "mammoth": "^1.4.19", "node-pandoc-promise": "^0.0.6", "p-ratelimit": "^1.0.1", + "redis": "^4.0.4", "sqlite3": "^5.0.0", "tmp-promise": "^3.0.2", "typescript-collections": "^1.3.3", @@ -52,4 +53,4 @@ "tscpaths": "^0.0.9", "typescript": "^4.5.2" } -} +} \ No newline at end of file diff --git a/prisma/schema.prisma b/prisma/schema.prisma index 0013664..bfcafb0 100644 --- a/prisma/schema.prisma +++ b/prisma/schema.prisma @@ -26,6 +26,7 @@ model Evidence { file File? @relation(fields: [fileId], references: [id]) fileId Int? + parent Int? 
// Null if unprocessed, same as id if no parent } model File { diff --git a/src/actions/addEvidence.ts b/src/actions/addEvidence.ts index d601076..211d420 100644 --- a/src/actions/addEvidence.ts +++ b/src/actions/addEvidence.ts @@ -1,11 +1,12 @@ import { Prisma } from '@prisma/client'; -import { db } from 'app/lib'; +import { db, DedupTask, TypedEvent } from 'app/lib'; import { omit } from 'lodash'; type EvidenceData = Omit & { file: Prisma.FileWhereUniqueInput }; +export const onAddEvidence = new TypedEvent(); export default async (data: EvidenceData): Promise => { - await db.evidence.upsert({ + const evidence = await db.evidence.upsert({ where: { gid: data.gid, }, @@ -22,5 +23,6 @@ export default async (data: EvidenceData): Promise => { }, }); + await new Promise((resolve) => onAddEvidence.emit({ text: evidence.fulltext, id: evidence.id, callback: resolve })); return; }; diff --git a/src/constants/index.ts b/src/constants/index.ts index bc97409..208a9b3 100644 --- a/src/constants/index.ts +++ b/src/constants/index.ts @@ -1 +1,14 @@ +// Max number of files being parsed concurrently, allows parsing to take place while waiting for database response export const CONCURRENT_PARSERS = 10; + +/* Allow small differences in matching cards to help with things like part of the cite being attached to the start of the card */ +// If a card has EDGE_TOLERANCE different sentences at start or end, will be treated as if they matched all the way to the start or end +export const EDGE_TOLERANCE = 1; +// If a card has almost an entire card within it, with at most INSIDE_TOLERANCE sentences missing from the start or end, it will be treated as if the entire card matched +export const INSIDE_TOLERANCE = 1; +/* + Regex used to split text into sentences + Matches punctuation followed by (whitespace + capital letter) and allows citation numbers (ex.
Sample text.123 Next sentence) + Will fail in some weird cases, but should be good enough +*/ +export const SENTENCE_REGEX = /([.?!])+(?=\d*\s+[A-Z])/; diff --git a/src/lib/db.ts b/src/lib/db.ts index a5f4df3..7ad19d6 100644 --- a/src/lib/db.ts +++ b/src/lib/db.ts @@ -1,13 +1,21 @@ import { PrismaClient } from '@prisma/client'; +import { createClient } from 'redis'; -// add prisma to the NodeJS global type +// add prisma and redis to the NodeJS global type interface CustomNodeJsGlobal extends NodeJS.Global { prisma: PrismaClient; + redis: ReturnType; } -// Prevent multiple instances of Prisma Client in development +// Prevent multiple instances of databases in development declare const global: CustomNodeJsGlobal; export const db = global.prisma || new PrismaClient(); +export const redis = global.redis || createClient(); -if (process.env.NODE_ENV === 'development') global.prisma = db; +redis.connect(); + +if (process.env.NODE_ENV === 'development') { + global.prisma = db; + global.redis = redis; +} diff --git a/src/lib/debate-tools/duplicate.ts b/src/lib/debate-tools/duplicate.ts new file mode 100644 index 0000000..fd1ae04 --- /dev/null +++ b/src/lib/debate-tools/duplicate.ts @@ -0,0 +1,71 @@ +import { toPairs } from 'lodash'; +import { redis } from 'app/lib/db'; +import { createHash } from 'crypto'; +import { EDGE_TOLERANCE, INSIDE_TOLERANCE, SENTENCE_REGEX } from 'app/constants'; + +type CardMatches = Record; +export interface DedupTask { + text: string; + id: number; + callback: (value: unknown) => void; +} + +export const getSentences = (text: string, cutoff = 20): string[] | undefined => { + return text + ?.split(SENTENCE_REGEX) + .map((el) => el.replace(/[^A-Z]/gi, '').toLowerCase()) + .filter((el: string) => el.length >= cutoff); +}; + +/* + Small hashes are stored in a memory efficient way in redis + Storing data in buckets using hashes drastically reduces the overhead of storing each value + https://redis.io/topics/memory-optimization +*/ +const 
getSentenceKey = (sentence: string): [string, string] => { + const hash = createHash('md5').update(sentence).digest('base64'); + // Uses top 18 bits as bucket, and next 36 as key + // Will create around 260k buckets, each containing a few hundred items with the full dataset + return ['s' + hash.slice(0, 3), hash.slice(3, 9)]; +}; +export const Sentence = { + get: (sentence: string): Promise => redis.hGet(...getSentenceKey(sentence)), + set: (sentence: string, card: number): Promise => redis.hSet(...getSentenceKey(sentence), card), +}; + +export const Info = { + get: (cardId: number, field: 'p' | 'l'): Promise => redis.hGet(`i${cardId >> 8}`, field + (cardId % 256)), + set: (cardId: number, field: 'p' | 'l', value: string | number): Promise => + redis.hSet(`i${cardId >> 8}`, field + (cardId % 256), value), +}; + +export const setRedisParents = (cardIds: string[], parentId: number): Promise[] => + cardIds + .map((id) => Info.set(+id, 'p', parentId.toString())) // Update card infos with new parent + .concat(redis.sAdd(`c${parentId}`, cardIds)); // Add cards to parent's child list +export const getChildren = (cardId: string): Promise => redis.sMembers(`c${cardId}`) ?? Promise.resolve([]); + +export const getMatching = async (matches: (string | null)[]): Promise<(string | false)[]> => { + // If no matches + if (!matches.find((el) => el !== null)) return null; + + // Calculates length of match in case there is a gap due to typo or collision + const cards: CardMatches = {}; + for (let i = 0; i < matches.length; i++) { + const id = matches[i]; + if (id === null) continue; + // If new match, set current index as start and end at end of card, otherwise update end index + cards[id] ? 
(cards[id].end = i) : (cards[id] = { start: i, end: matches.length - 1 }); + } + + // Filter out probably false matches + return Promise.all( + toPairs(cards).map(async ([key, value]) => { + const { start, end } = value; + // If match starts at start or ends at end it is probably a real match + if (start >= EDGE_TOLERANCE || end >= matches.length - (EDGE_TOLERANCE + 1)) return key; + // If doesn't reach start or end, it should be the entire card inside this one + return end - start - +(await Info.get(+key, 'l')) <= INSIDE_TOLERANCE && key; + }), + ); +}; diff --git a/src/lib/debate-tools/index.ts b/src/lib/debate-tools/index.ts index 965aeb9..2bfb923 100644 --- a/src/lib/debate-tools/index.ts +++ b/src/lib/debate-tools/index.ts @@ -4,3 +4,4 @@ export * from './styles'; export * from './tokens'; export * from './parse'; export * from './id'; +export * from './duplicate'; diff --git a/src/modules/deduplicator/index.ts b/src/modules/deduplicator/index.ts new file mode 100644 index 0000000..e0e988a --- /dev/null +++ b/src/modules/deduplicator/index.ts @@ -0,0 +1,62 @@ +import { db } from 'app/lib'; +import { getSentences, Sentence, Info, getChildren, getMatching, setRedisParents, DedupTask } from 'app/lib'; +import { onAddEvidence } from 'app/actions/addEvidence'; +import { filter, min, uniq } from 'lodash'; +import { Queue } from 'typescript-collections'; + +const evidenceQueue = new Queue(); + +// Update parents in database and redis, don't need to actually wait for database response +function updateParents(cardIds: string[], parentId: number) { + setRedisParents(cardIds, parentId); + return db.evidence.updateMany({ where: { id: { in: cardIds.map(Number) } }, data: { parent: parentId } }); +} + +async function setParent({ text, id }: DedupTask) { + let parent = id; + const updates = [id.toString()]; + + const sentences = getSentences(text); + if (!sentences?.length) return updateParents(updates, parent); + + const existing = await
Promise.all(sentences.map(Sentence.get)); + const matching = filter(await getMatching(existing)); + + Info.set(id, 'l', sentences.length); + if (matching.length) { + // Get the parents of all the matches, use set to make sure they are unique + const matchParents = uniq(await Promise.all(matching.map((card) => Info.get(+card, 'p')))); + + // If all matches have the same parent just set as parent + if (matchParents.length === 1) parent = +matchParents[0]; + else { + // In rare case multiple different parents were matched, merge cards and update parents + parent = +min(matchParents); + + await Promise.all( + matchParents + .filter((card) => +card !== parent) + .map((card) => getChildren(card).then((children) => updates.push(...children))), + ); + } + } + + // Commands will be sent in order so don't need to wait for responses + sentences.forEach((sentence) => Sentence.set(sentence, parent)); + return updateParents(uniq(updates), parent); +} + +onAddEvidence.on((data) => evidenceQueue.enqueue(data)); +const drain = () => { + // TODO: Add chunks of unduplicated cards from db if queue is empty + if (evidenceQueue.size() === 0) setTimeout(drain, 1000); + // Doesn't actually wait for parent to be set, just till commands are sent + else { + const task = evidenceQueue.dequeue(); + const promise = setParent(task); + promise.then(task.callback); + promise.then(drain); + } +}; + +drain(); diff --git a/yarn.lock b/yarn.lock index cd63601..9131d85 100644 --- a/yarn.lock +++ b/yarn.lock @@ -512,6 +512,41 @@ call-me-maybe "^1.0.1" glob-to-regexp "^0.3.0" +"@node-redis/bloom@1.0.1": + version "1.0.1" + resolved "https://registry.yarnpkg.com/@node-redis/bloom/-/bloom-1.0.1.tgz#144474a0b7dc4a4b91badea2cfa9538ce0a1854e" + integrity sha512-mXEBvEIgF4tUzdIN89LiYsbi6//EdpFA7L8M+DHCvePXg+bfHWi+ct5VI6nHUFQE5+ohm/9wmgihCH3HSkeKsw== + +"@node-redis/client@1.0.4": + version "1.0.4" + resolved
"https://registry.yarnpkg.com/@node-redis/client/-/client-1.0.4.tgz#fe185750df3bcc07524f63fe8dbc8d14d22d6cbb" + integrity sha512-IM/NRAqg7MvNC3bIRQipXGrEarunrdgvrbAzsd3ty93LSHi/M+ybQulOERQi8a3M+P5BL8HenwXjiIoKm6ml2g== + dependencies: + cluster-key-slot "1.1.0" + generic-pool "3.8.2" + redis-parser "3.0.0" + yallist "4.0.0" + +"@node-redis/graph@1.0.0": + version "1.0.0" + resolved "https://registry.yarnpkg.com/@node-redis/graph/-/graph-1.0.0.tgz#baf8eaac4a400f86ea04d65ec3d65715fd7951ab" + integrity sha512-mRSo8jEGC0cf+Rm7q8mWMKKKqkn6EAnA9IA2S3JvUv/gaWW/73vil7GLNwion2ihTptAm05I9LkepzfIXUKX5g== + +"@node-redis/json@1.0.2": + version "1.0.2" + resolved "https://registry.yarnpkg.com/@node-redis/json/-/json-1.0.2.tgz#8ad2d0f026698dc1a4238cc3d1eb099a3bee5ab8" + integrity sha512-qVRgn8WfG46QQ08CghSbY4VhHFgaTY71WjpwRBGEuqGPfWwfRcIf3OqSpR7Q/45X+v3xd8mvYjywqh0wqJ8T+g== + +"@node-redis/search@1.0.3": + version "1.0.3" + resolved "https://registry.yarnpkg.com/@node-redis/search/-/search-1.0.3.tgz#7c3d026bf994caf82019fd0c3924cfc09f041a29" + integrity sha512-rsrzkGWI84di/uYtEctS/4qLusWt0DESx/psjfB0TFpORDhe7JfC0h8ary+eHulTksumor244bXLRSqQXbFJmw== + +"@node-redis/time-series@1.0.2": + version "1.0.2" + resolved "https://registry.yarnpkg.com/@node-redis/time-series/-/time-series-1.0.2.tgz#5dd3638374edd85ebe0aa6b0e87addc88fb9df69" + integrity sha512-HGQ8YooJ8Mx7l28tD7XjtB3ImLEjlUxG1wC1PAjxu6hPJqjPshUZxAICzDqDjtIbhDTf48WXXUcx8TQJB1XTKA== + "@nodelib/fs.stat@^1.1.2": version "1.1.3" resolved "https://registry.yarnpkg.com/@nodelib/fs.stat/-/fs.stat-1.1.3.tgz#2b5a3ab3f918cca48a8c754c08168e3f03eba61b" @@ -1866,6 +1901,11 @@ cliui@^6.0.0: strip-ansi "^6.0.0" wrap-ansi "^6.2.0" +cluster-key-slot@1.1.0: + version "1.1.0" + resolved "https://registry.yarnpkg.com/cluster-key-slot/-/cluster-key-slot-1.1.0.tgz#30474b2a981fb12172695833052bc0d01336d10d" + integrity sha512-2Nii8p3RwAPiFwsnZvukotvow2rIHM+yQ6ZcBXGHdniadkYGZYiGmkHJIbZPIV9nfv7m/U1IPMVVcAhoWFeklw== + co@^4.6.0: version "4.6.0" resolved 
"https://registry.yarnpkg.com/co/-/co-4.6.0.tgz#6ea6bdf3d853ae54ccb8e47bfa0bf3f9031fb184" @@ -2938,6 +2978,11 @@ gauge@~2.7.3: strip-ansi "^3.0.1" wide-align "^1.1.0" +generic-pool@3.8.2: + version "3.8.2" + resolved "https://registry.yarnpkg.com/generic-pool/-/generic-pool-3.8.2.tgz#aab4f280adb522fdfbdc5e5b64d718d3683f04e9" + integrity sha512-nGToKy6p3PAbYQ7p1UlWl6vSPwfwU6TMSWK7TTu+WUY4ZjyZQGniGGt2oNVvyNSpyZYSB43zMXVLcBm08MTMkg== + gensync@^1.0.0-beta.2: version "1.0.0-beta.2" resolved "https://registry.yarnpkg.com/gensync/-/gensync-1.0.0-beta.2.tgz#32a6ee76c3d7f52d46b2b1ae5d93fea8580a25e0" @@ -5278,6 +5323,30 @@ redent@^1.0.0: indent-string "^2.1.0" strip-indent "^1.0.1" +redis-errors@^1.0.0: + version "1.2.0" + resolved "https://registry.yarnpkg.com/redis-errors/-/redis-errors-1.2.0.tgz#eb62d2adb15e4eaf4610c04afe1529384250abad" + integrity sha1-62LSrbFeTq9GEMBK/hUpOEJQq60= + +redis-parser@3.0.0: + version "3.0.0" + resolved "https://registry.yarnpkg.com/redis-parser/-/redis-parser-3.0.0.tgz#b66d828cdcafe6b4b8a428a7def4c6bcac31c8b4" + integrity sha1-tm2CjNyv5rS4pCin3vTGvKwxyLQ= + dependencies: + redis-errors "^1.0.0" + +redis@^4.0.4: + version "4.0.4" + resolved "https://registry.yarnpkg.com/redis/-/redis-4.0.4.tgz#b567f82f59086df38433982f7f424b48e924ec7a" + integrity sha512-KaM1OAj/nGrSeybmmOWSMY0LXTGT6FVWgUZZrd2MYzXKJ+VGtqVaciGQeNMfZiQX+kDM8Ke4uttb54m2rm6V0A== + dependencies: + "@node-redis/bloom" "1.0.1" + "@node-redis/client" "1.0.4" + "@node-redis/graph" "1.0.0" + "@node-redis/json" "1.0.2" + "@node-redis/search" "1.0.3" + "@node-redis/time-series" "1.0.2" + regenerate@^1.2.1: version "1.4.2" resolved "https://registry.yarnpkg.com/regenerate/-/regenerate-1.4.2.tgz#b9346d8827e8f5a32f7ba29637d398b69014848a" @@ -6533,16 +6602,16 @@ y18n@^4.0.0: resolved "https://registry.yarnpkg.com/y18n/-/y18n-4.0.3.tgz#b5f259c82cd6e336921efd7bfd8bf560de9eeedf" integrity sha512-JKhqTOwSrqNA1NY5lSztJ1GrBiUodLMmIZuLiDaMRJ+itFd+ABVE8XBjOvIWL+rSqNDC74LCSFmlb/U4UZ4hJQ== 
+yallist@4.0.0, yallist@^4.0.0: + version "4.0.0" + resolved "https://registry.yarnpkg.com/yallist/-/yallist-4.0.0.tgz#9bb92790d9c0effec63be73519e11a35019a3a72" + integrity sha512-3wdGidZyq5PB084XLES5TpOSRA3wjXAlIWMhum2kRcv/41Sn2emQ0dycQW4uZXLejwKvg6EsvbdlVL+FYEct7A== + yallist@^3.0.0, yallist@^3.0.3: version "3.1.1" resolved "https://registry.yarnpkg.com/yallist/-/yallist-3.1.1.tgz#dbb7daf9bfd8bac9ab45ebf602b8cbad0d5d08fd" integrity sha512-a4UGQaWPH59mOXUYnAG2ewncQS4i4F43Tv3JoAM+s2VDAmS9NsK8GpDMLrCHPksFT7h3K6TOoUNn2pb7RoXx4g== -yallist@^4.0.0: - version "4.0.0" - resolved "https://registry.yarnpkg.com/yallist/-/yallist-4.0.0.tgz#9bb92790d9c0effec63be73519e11a35019a3a72" - integrity sha512-3wdGidZyq5PB084XLES5TpOSRA3wjXAlIWMhum2kRcv/41Sn2emQ0dycQW4uZXLejwKvg6EsvbdlVL+FYEct7A== - yargs-parser@^18.1.2: version "18.1.3" resolved "https://registry.yarnpkg.com/yargs-parser/-/yargs-parser-18.1.3.tgz#be68c4975c6b2abf469236b0c870362fab09a7b0"