Skip to content

Commit

Permalink
Merge branch 'deduplication' into scraper
Browse files Browse the repository at this point in the history
  • Loading branch information
D0ugins committed Mar 30, 2022
2 parents 5d02ef2 + 23b23ee commit c6f7e14
Show file tree
Hide file tree
Showing 14 changed files with 484 additions and 130 deletions.
6 changes: 4 additions & 2 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,11 @@
"cheerio": "1.0.0-rc.3",
"docx": "^6.0.3",
"dotenv": "^6.2.0",
"htmlparser2": "^7.2.0",
"lodash": "^4.17.15",
"redis": "^4.0.4",
"typescript-collections": "^1.3.3",
"unzipper": "^0.10.11"
"typescript-collections": "^1.3.3"
},
"devDependencies": {
"@types/cheerio": "^0.22.21",
Expand Down Expand Up @@ -48,4 +50,4 @@
"tscpaths": "^0.0.9",
"typescript": "^4.5.2"
}
}
}
2 changes: 2 additions & 0 deletions prisma/schema.prisma
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ model Evidence {
tag String
cite String?
fullcite String?
summary String?
spoken String?
fulltext String?
Expand All @@ -25,6 +26,7 @@ model Evidence {
file File? @relation(fields: [fileId], references: [id])
fileId Int?
parent Int? // Null if unprocessed, same as id if no parent
}

model Round {
Expand Down
6 changes: 4 additions & 2 deletions src/actions/addEvidence.ts
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
import { Prisma } from '@prisma/client';
import { db } from 'app/lib';
import { db, DedupTask, TypedEvent } from 'app/lib';
import { omit } from 'lodash';

type EvidenceData = Omit<Prisma.EvidenceCreateInput, 'file'> & { file: Prisma.FileWhereUniqueInput };
export const onAddEvidence = new TypedEvent<DedupTask>();

export default async (data: EvidenceData): Promise<void> => {
await db.evidence.upsert({
const evidence = await db.evidence.upsert({
where: {
gid: data.gid,
},
Expand All @@ -22,5 +23,6 @@ export default async (data: EvidenceData): Promise<void> => {
},
});

await new Promise((resolve) => onAddEvidence.emit({ text: evidence.fulltext, id: evidence.id, callback: resolve }));
return;
};
8 changes: 2 additions & 6 deletions src/actions/generateFile.ts
Original file line number Diff line number Diff line change
Expand Up @@ -37,13 +37,9 @@ const flattenLevel = (data: Evidence[], level: number): TextBlock[] => {
};

export default async (ids: number[], keepHeadings: boolean): Promise<Buffer> => {
let evidence = await db.evidence.findMany({
where: {
id: { in: ids },
},
});
const evidence = await db.evidence.findMany({ where: { id: { in: ids } } });

let tokens: TextBlock[] = flattenLevel(evidence, keepHeadings ? 1 : 4);
const tokens: TextBlock[] = flattenLevel(evidence, keepHeadings ? 1 : 4);
onGenerateFile.emit({ ids });
return await tokensToDocument(tokens);
};
14 changes: 14 additions & 0 deletions src/constants/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
// Max number of files being parsed concurrently, allows parsing to take place while waiting for database response
export const CONCURRENT_PARSERS = 10;

/* Allow small differences in matching cards to help with things like part of the cite being attached to the start of the card */
// If a card has EDGE_TOLERANCE different sentences at start or end, will be treated as if they matched all the way to the start or end
export const EDGE_TOLERANCE = 1;
// If a card has almost an entire card within it, with at most INSIDE_TOLERANCE sentences missing from the start or end, it will be treated as if the entire card matched
export const INSIDE_TOLERANCE = 1;
/*
  Regex used to split text into sentences
  Matches punctuation followed by (whitespace + capital letter) and allows citation numbers (ex. Sample text.123 Next sentence)
  Will fail in some weird cases, but should be good enough
*/
export const SENTENCE_REGEX = /([.?!])+(?=\d*\s+[A-Z])/;
14 changes: 11 additions & 3 deletions src/lib/db.ts
Original file line number Diff line number Diff line change
@@ -1,13 +1,21 @@
import { PrismaClient } from '@prisma/client';
import { createClient } from 'redis';

// add prisma to the NodeJS global type
// add prisma and redis to the NodeJS global type
interface CustomNodeJsGlobal extends NodeJS.Global {
prisma: PrismaClient;
redis: ReturnType<typeof createClient>;
}

// Prevent multiple instances of Prisma Client in development
// Prevent multiple instances of databases in development
declare const global: CustomNodeJsGlobal;

export const db = global.prisma || new PrismaClient();
export const redis = global.redis || createClient();

if (process.env.NODE_ENV === 'development') global.prisma = db;
redis.connect();

if (process.env.NODE_ENV === 'development') {
global.prisma = db;
global.redis = redis;
}
6 changes: 3 additions & 3 deletions src/lib/debate-tools/document.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ import fs from 'fs';
export const documentToTokens = async (filepath: string): Promise<TextBlock[]> => {
const document = await loadXml(filepath, /document\.xml$/);
const styles = await loadXml(filepath, /styles\.xml$/);
const tokens = markupToTokens(document, styles, { simplified: true });
const tokens = await markupToTokens(document, styles, { simplified: true });
return tokens;
};

Expand All @@ -31,10 +31,10 @@ const loadXml = (path: string, file: RegExp): Promise<string> => {
let data = '';
const stream = fs.createReadStream(path);
stream
.on('error', (err) => reject(err))
.on('error', reject)
.pipe(ParseOne(file))
.on('data', (chunk) => (data += chunk))
.on('error', (err) => reject(err))
.on('error', reject)
.on('end', () => resolve(data));
});
};
71 changes: 71 additions & 0 deletions src/lib/debate-tools/duplicate.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
import { toPairs } from 'lodash';
import { redis } from 'app/lib/db';
import { createHash } from 'crypto';
import { EDGE_TOLERANCE, INSIDE_TOLERANCE, SENTENCE_REGEX } from 'app/constants';

// Map of card id -> span (first/last index) of its sentences matched while scanning a candidate card
type CardMatches = Record<number, { start: number; end: number }>;
// One unit of deduplication work queued per newly added piece of evidence
export interface DedupTask {
  text: string; // full text of the card to dedupe
  id: number; // evidence row id the text belongs to
  callback: (value: unknown) => void; // invoked when processing finishes (used as a Promise resolve by the emitter)
}

/*
  Normalize a card's text into comparable sentence fingerprints:
  split on SENTENCE_REGEX, strip every non-letter character, lowercase,
  and discard fragments with fewer than `cutoff` letters (too short to be distinctive).
  Returns undefined when `text` is null/undefined (e.g. evidence without fulltext).
*/
export const getSentences = (text: string, cutoff = 20): string[] | undefined => {
  if (text == null) return undefined;
  const fingerprints: string[] = [];
  for (const fragment of text.split(SENTENCE_REGEX)) {
    const letters = fragment.replace(/[^A-Z]/gi, '').toLowerCase();
    if (letters.length >= cutoff) fingerprints.push(letters);
  }
  return fingerprints;
};

/*
  Bucketed addressing for the sentence -> card lookup table.
  Small hashes are stored in a memory-efficient way in redis, so spreading
  entries across many small hash buckets drastically reduces per-entry overhead:
  https://redis.io/topics/memory-optimization
*/
const getSentenceKey = (sentence: string): [string, string] => {
  const digest = createHash('md5').update(sentence).digest('base64');
  // First 3 base64 chars (top 18 bits) pick the bucket — roughly 260k buckets,
  // each holding a few hundred items with the full dataset.
  const bucket = 's' + digest.substring(0, 3);
  // Next 6 chars (36 bits) are the field key inside the bucket.
  const field = digest.substring(3, 9);
  return [bucket, field];
};
// Lookup table from sentence fingerprint to owning card id, stored in bucketed redis hashes.
export const Sentence = {
  get: (sentence: string): Promise<string> => {
    const [bucket, field] = getSentenceKey(sentence);
    return redis.hGet(bucket, field);
  },
  set: (sentence: string, card: number): Promise<number> => {
    const [bucket, field] = getSentenceKey(sentence);
    return redis.hSet(bucket, field, card);
  },
};

// Per-card metadata, bucketed 256 cards per redis hash (bucket `i<id >> 8>`, field `<'p'|'l'><id % 256>`).
// Field 'p' and 'l' semantics are defined by callers — presumably parent id and length; verify against consumer.
export const Info = {
  get: (cardId: number, field: 'p' | 'l'): Promise<string> => {
    const bucket = `i${cardId >> 8}`;
    const hashField = field + (cardId % 256);
    return redis.hGet(bucket, hashField);
  },
  set: (cardId: number, field: 'p' | 'l', value: string | number): Promise<number> => {
    const bucket = `i${cardId >> 8}`;
    const hashField = field + (cardId % 256);
    return redis.hSet(bucket, hashField, value);
  },
};

// Reparent a set of cards: point each card's info at the new parent and
// register the cards in the parent's child set. Returns the pending redis writes.
export const setRedisParents = (cardIds: string[], parentId: number): Promise<number>[] => {
  const writes: Promise<number>[] = cardIds.map((id) => Info.set(+id, 'p', parentId.toString()));
  writes.push(redis.sAdd(`c${parentId}`, cardIds)); // add cards to parent's child list
  return writes;
};
// Child ids recorded for a card (empty when the card has no child set).
export const getChildren = (cardId: string): Promise<string[]> => {
  const members = redis.sMembers(`c${cardId}`);
  return members ?? Promise.resolve([]);
};

/*
  Given, per sentence of a candidate card, the id of the card that sentence matched
  (or null), decide which of those cards are likely real matches.
  Resolves to an array of card ids (string) or false entries for rejected matches,
  or null when nothing matched at all.
  NOTE(review): returning null is not covered by the declared
  Promise<(string | false)[]> return type — only compiles with strictNullChecks off; confirm intent.
*/
export const getMatching = async (matches: (string | null)[]): Promise<(string | false)[]> => {
  // If no matches
  if (!matches.find((el) => el !== null)) return null;

  // Calculates length of match in case there is a gap due to typo or collision
  const cards: CardMatches = {};
  for (let i = 0; i < matches.length; i++) {
    const id = matches[i];
    if (id === null) continue;
    // If new match, set current index as start and end at end of card, otherwise update end index
    // NOTE(review): end is initialized to matches.length - 1 (not i), so a card matching a
    // single sentence spans to the end of the candidate — confirm this is the intended default.
    cards[id] ? (cards[id].end = i) : (cards[id] = { start: i, end: matches.length - 1 });
  }

  // Filter out probably false matches
  return Promise.all(
    toPairs(cards).map(async ([key, value]) => {
      const { start, end } = value;
      // If match starts at start or ends at end it is probably a real match
      // NOTE(review): `start >= EDGE_TOLERANCE` contradicts this comment — a match that
      // "starts at the start" would have start <= EDGE_TOLERANCE; verify the comparison direction.
      if (start >= EDGE_TOLERANCE || end >= matches.length - (EDGE_TOLERANCE + 1)) return key;
      // If it doesn't reach start or end, it should be the entire card inside this one
      // ('l' field holds the stored card's length; allow INSIDE_TOLERANCE missing sentences)
      return end - start - +(await Info.get(+key, 'l')) <= INSIDE_TOLERANCE && key;
    }),
  );
};
1 change: 1 addition & 0 deletions src/lib/debate-tools/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@ export * from './styles';
export * from './tokens';
export * from './parse';
export * from './id';
export * from './duplicate';
133 changes: 62 additions & 71 deletions src/lib/debate-tools/markup.ts
Original file line number Diff line number Diff line change
@@ -1,16 +1,9 @@
import ch from 'cheerio';
import {
TextBlock,
getStyleNameByXml,
TokenStyle,
SectionStyleName,
simplifyTokens,
tokensToDocument,
getOutlineLvlName,
} from './';
import { TextBlock, TokenStyle, TextToken, simplifyTokens, tokensToDocument } from './tokens';
import { getStyleNameByXml, getOutlineLvlName } from './styles';
import { Parser as XmlParser } from 'htmlparser2';

export const markupToDocument = async (xml: string, styles: string): Promise<Buffer> => {
const tokens = markupToTokens(xml, styles, { simplified: true });
const tokens = await markupToTokens(xml, styles, { simplified: true });
const buffer = await tokensToDocument(tokens);
return buffer;
};
Expand All @@ -19,71 +12,69 @@ interface TokensOption {
simplified: boolean;
}

export const markupToTokens = (document: string, styles: string, options?: TokensOption): TextBlock[] => {
const blocks = tokenize(document, styles);
export async function markupToTokens(document: string, styles: string, options?: TokensOption): Promise<TextBlock[]> {
const blocks = await tokenize(document, styles);
if (options?.simplified) {
const simplifiedBlocks = blocks.map((block) => simplifyTokens(block));
const simplifiedBlocks = blocks.map(simplifyTokens);
return simplifiedBlocks;
}
return blocks;
};

const getChild = (el, names: string[]) =>
names.reduce((acc, name) => {
return acc?.children?.find((child) => child.name === name);
}, el);

// Extract what formatting applies to block of text
const updateElFormating = (styleEl, current?: TokenStyle): TokenStyle => {
const formatting: TokenStyle = current ? { ...current } : { underline: false, strong: false, mark: false };
const styles = getChild(styleEl, ['w:rPr']);
if (!styles) return formatting;

const highlight = getChild(styles, ['w:highlight']);
const bold = getChild(styles, ['w:b']);
const underline = getChild(styles, ['w:u'])?.attribs['w:val'];

if (highlight) formatting.mark = true;
if (bold) formatting.strong = bold.attribs['w:val'] !== '0';
if (underline) formatting.underline = underline !== 'none';

return formatting;
};

const getBlockFormat = (block): SectionStyleName => {
const stlyeNameFormat = getStyleNameByXml(getChild(block, ['w:pPr', 'w:pStyle'])?.attribs['w:val']);
if (stlyeNameFormat !== 'text') return stlyeNameFormat;
}

// Sometimes uses outline level instead of header
const outlineLvl = getChild(block, ['w:pPr', 'w:outlineLvl'])?.attribs['w:val'];
return getOutlineLvlName(parseInt(outlineLvl) + 1);
const handleStyleTag = (name: string, attribs: Record<string, string>, styles: TokenStyle) => {
if (name === 'w:u') styles.underline = attribs['w:val'] !== 'none';
else if (name === 'w:highlight') styles.mark = true;
else if (name === 'w:b') styles.strong = attribs['w:val'] !== '0';
};

const tokenize = (xml: string, styles: string): TextBlock[] => {
const s = ch.load(styles, { xmlMode: true });
const d = ch.load(xml, { xmlMode: true });
const parseStyles = (styles: string): Promise<Record<string, TokenStyle>> =>
new Promise((resolve, reject) => {
const parsedStyles: Record<string, TokenStyle> = {};
let styleName = '';
new XmlParser(
{
onopentag(name, attribs) {
if (name === 'w:style') {
styleName = attribs['w:styleId'];
parsedStyles[styleName] = { underline: false, strong: false, mark: false };
} else if (styleName) handleStyleTag(name, attribs, parsedStyles[styleName]);
},
onend: () => resolve(parsedStyles),
onerror: reject,
},
{ xmlMode: true },
).parseComplete(styles);
});

// Generate map of style names to formatting from styles.xml
const xmlStyles: Record<string, TokenStyle> = s('w\\:style')
.get()
.reduce((acc, node) => {
acc[node.attribs['w:styleId']] = updateElFormating(node);
return acc;
}, {});

const tokens: TextBlock[] = d('w\\:p')
.get()
.map((block) => ({
format: getBlockFormat(block),
tokens: ch(block)
.children('w\\:r')
.get()
.map((node) => ({
text: ch(node).text(),
// combine formatting defined in text block and formatting from style name
format: updateElFormating(node, xmlStyles[getChild(node, ['w:rPr', 'w:rStyle'])?.attribs['w:val']]),
})),
}));

return tokens;
};
const tokenize = (xml: string, styles: string): Promise<TextBlock[]> =>
new Promise((resolve, reject) => {
parseStyles(styles).then((styleData) => {
const blocks: TextBlock[] = [];
let block: TextBlock;
let token: TextToken;
new XmlParser(
{
onopentag(name, attribs) {
if (name === 'w:p') block = { format: 'text', tokens: [] };
else if (name === 'w:pStyle') block.format = getStyleNameByXml(attribs['w:val']);
else if (name === 'w:outlineLvl') block.format = getOutlineLvlName(+attribs['w:val'] + 1);
else if (name === 'w:r') token = { text: '', format: { underline: false, strong: false, mark: false } };
else if (token) {
if (name === 'w:rStyle') token.format = { ...styleData[attribs['w:val']] };
else handleStyleTag(name, attribs, token.format);
}
},
ontext(data) {
if (token) token.text += data;
},
onclosetag(name) {
if (name === 'w:p' && block.tokens.length) blocks.push(block);
else if (name === 'w:r' && token.text) block.tokens.push(token);
},
onend: () => resolve(blocks),
onerror: reject,
},
{ xmlMode: true },
).parseComplete(xml);
});
});
6 changes: 2 additions & 4 deletions src/lib/debate-tools/parse.ts
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,7 @@ export const getIndexesWith = (blocks: TextBlock[], styles: StyleName[]): number
};

const getLastBlockWith = (blocks: TextBlock[], anchor: number, styles: StyleName[]): TextBlock => {
let ret;
const range = [...Array(anchor).keys()];
range.forEach((idx) => (ret = styles.includes(blocks[idx].format) ? blocks[idx] : ret));
return ret;
for (let i = anchor; i >= 0; i--) if (styles.includes(blocks[i].format)) return blocks[i];
};

export const getBlocksUntil = (blocks: TextBlock[], anchor: number, styles: StyleName[]): TextBlock[] => {
Expand All @@ -51,6 +48,7 @@ const parseCard = (doc: TextBlock[], anchor = 0, idx: number) => {
return {
tag: extractText([tag]),
cite: extractText([cite], ['strong']),
fullcite: extractText([cite]),
pocket: extractHeading('pocket'),
hat: extractHeading('hat'),
block: extractHeading('block'),
Expand Down
Loading

0 comments on commit c6f7e14

Please sign in to comment.