Skip to content

Commit

Permalink
Merge branch 'deduplication' into scraper
Browse files Browse the repository at this point in the history
  • Loading branch information
D0ugins committed Mar 30, 2022
2 parents 5d02ef2 + 23b23ee commit c6f7e14
Show file tree
Hide file tree
Showing 14 changed files with 484 additions and 130 deletions.
6 changes: 4 additions & 2 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,11 @@
"cheerio": "1.0.0-rc.3",
"docx": "^6.0.3",
"dotenv": "^6.2.0",
"htmlparser2": "^7.2.0",
"lodash": "^4.17.15",
"redis": "^4.0.4",
"typescript-collections": "^1.3.3",
"unzipper": "^0.10.11"
"typescript-collections": "^1.3.3"
},
"devDependencies": {
"@types/cheerio": "^0.22.21",
Expand Down Expand Up @@ -48,4 +50,4 @@
"tscpaths": "^0.0.9",
"typescript": "^4.5.2"
}
}
}
2 changes: 2 additions & 0 deletions prisma/schema.prisma
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ model Evidence {
tag String
cite String?
fullcite String?
summary String?
spoken String?
fulltext String?
Expand All @@ -25,6 +26,7 @@ model Evidence {
file File? @relation(fields: [fileId], references: [id])
fileId Int?
parent Int? // Null if unprocessed, same as id if no parent
}

model Round {
Expand Down
6 changes: 4 additions & 2 deletions src/actions/addEvidence.ts
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
import { Prisma } from '@prisma/client';
import { db } from 'app/lib';
import { db, DedupTask, TypedEvent } from 'app/lib';
import { omit } from 'lodash';

type EvidenceData = Omit<Prisma.EvidenceCreateInput, 'file'> & { file: Prisma.FileWhereUniqueInput };
export const onAddEvidence = new TypedEvent<DedupTask>();

export default async (data: EvidenceData): Promise<void> => {
await db.evidence.upsert({
const evidence = await db.evidence.upsert({
where: {
gid: data.gid,
},
Expand All @@ -22,5 +23,6 @@ export default async (data: EvidenceData): Promise<void> => {
},
});

await new Promise((resolve) => onAddEvidence.emit({ text: evidence.fulltext, id: evidence.id, callback: resolve }));
return;
};
8 changes: 2 additions & 6 deletions src/actions/generateFile.ts
Original file line number Diff line number Diff line change
Expand Up @@ -37,13 +37,9 @@ const flattenLevel = (data: Evidence[], level: number): TextBlock[] => {
};

export default async (ids: number[], keepHeadings: boolean): Promise<Buffer> => {
let evidence = await db.evidence.findMany({
where: {
id: { in: ids },
},
});
const evidence = await db.evidence.findMany({ where: { id: { in: ids } } });

let tokens: TextBlock[] = flattenLevel(evidence, keepHeadings ? 1 : 4);
const tokens: TextBlock[] = flattenLevel(evidence, keepHeadings ? 1 : 4);
onGenerateFile.emit({ ids });
return await tokensToDocument(tokens);
};
14 changes: 14 additions & 0 deletions src/constants/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
// Max number of files being parsed concurrently, allows parsing to take place while waiting for database response
export const CONCURRENT_PARSERS = 10;

/* Allow small differences in matching cards to help with things like part of the cite being attached to the start of the card */
// If a card has EDGE_TOLERANCE different sentences at start or end, will be treated as if they matched all the way to the start or end
export const EDGE_TOLERANCE = 1;
// If a card has almost an entire card within it, with at most INSIDE_TOLERANCE sentences missing from the start or end, it will be treated as if the entire card matched
export const INSIDE_TOLERANCE = 1;
/*
  Regex used to split text into sentences
  Matches punctuation followed by (whitespace + capital letter) and allows citation numbers (ex. Sample text.123 Next sentence)
  Will fail in some weird cases, but should be good enough
*/
export const SENTENCE_REGEX = /([.?!])+(?=\d*\s+[A-Z])/;
14 changes: 11 additions & 3 deletions src/lib/db.ts
Original file line number Diff line number Diff line change
@@ -1,13 +1,21 @@
import { PrismaClient } from '@prisma/client';
import { createClient } from 'redis';

// add prisma to the NodeJS global type
// add prisma and redis to the NodeJS global type
interface CustomNodeJsGlobal extends NodeJS.Global {
prisma: PrismaClient;
redis: ReturnType<typeof createClient>;
}

// Prevent multiple instances of Prisma Client in development
// Prevent multiple instances of databases in development
declare const global: CustomNodeJsGlobal;

export const db = global.prisma || new PrismaClient();
export const redis = global.redis || createClient();

if (process.env.NODE_ENV === 'development') global.prisma = db;
redis.connect();

if (process.env.NODE_ENV === 'development') {
global.prisma = db;
global.redis = redis;
}
6 changes: 3 additions & 3 deletions src/lib/debate-tools/document.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ import fs from 'fs';
export const documentToTokens = async (filepath: string): Promise<TextBlock[]> => {
const document = await loadXml(filepath, /document\.xml$/);
const styles = await loadXml(filepath, /styles\.xml$/);
const tokens = markupToTokens(document, styles, { simplified: true });
const tokens = await markupToTokens(document, styles, { simplified: true });
return tokens;
};

Expand All @@ -31,10 +31,10 @@ const loadXml = (path: string, file: RegExp): Promise<string> => {
let data = '';
const stream = fs.createReadStream(path);
stream
.on('error', (err) => reject(err))
.on('error', reject)
.pipe(ParseOne(file))
.on('data', (chunk) => (data += chunk))
.on('error', (err) => reject(err))
.on('error', reject)
.on('end', () => resolve(data));
});
};
71 changes: 71 additions & 0 deletions src/lib/debate-tools/duplicate.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
import { toPairs } from 'lodash';
import { redis } from 'app/lib/db';
import { createHash } from 'crypto';
import { EDGE_TOLERANCE, INSIDE_TOLERANCE, SENTENCE_REGEX } from 'app/constants';

// Map of card id -> span (first/last index) of its sentences matched while scanning a candidate card
type CardMatches = Record<number, { start: number; end: number }>;
// One unit of deduplication work queued per newly added piece of evidence
export interface DedupTask {
  text: string; // full text of the card to dedupe
  id: number; // evidence row id the text belongs to
  callback: (value: unknown) => void; // invoked when processing finishes (used as a Promise resolve by the emitter)
}

/*
  Normalize a card's text into comparable sentence fingerprints:
  split on SENTENCE_REGEX, strip every non-letter character, lowercase,
  and discard fragments with fewer than `cutoff` letters (too short to be distinctive).
  Returns undefined when `text` is null/undefined (e.g. evidence without fulltext).
*/
export const getSentences = (text: string, cutoff = 20): string[] | undefined => {
  if (text == null) return undefined;
  const fingerprints: string[] = [];
  for (const fragment of text.split(SENTENCE_REGEX)) {
    const letters = fragment.replace(/[^A-Z]/gi, '').toLowerCase();
    if (letters.length >= cutoff) fingerprints.push(letters);
  }
  return fingerprints;
};

/*
  Bucketed addressing for the sentence -> card lookup table.
  Small hashes are stored in a memory-efficient way in redis, so spreading
  entries across many small hash buckets drastically reduces per-entry overhead:
  https://redis.io/topics/memory-optimization
*/
const getSentenceKey = (sentence: string): [string, string] => {
  const digest = createHash('md5').update(sentence).digest('base64');
  // First 3 base64 chars (top 18 bits) pick the bucket — roughly 260k buckets,
  // each holding a few hundred items with the full dataset.
  const bucket = 's' + digest.substring(0, 3);
  // Next 6 chars (36 bits) are the field key inside the bucket.
  const field = digest.substring(3, 9);
  return [bucket, field];
};
// Lookup table from sentence fingerprint to owning card id, stored in bucketed redis hashes.
export const Sentence = {
  get: (sentence: string): Promise<string> => {
    const [bucket, field] = getSentenceKey(sentence);
    return redis.hGet(bucket, field);
  },
  set: (sentence: string, card: number): Promise<number> => {
    const [bucket, field] = getSentenceKey(sentence);
    return redis.hSet(bucket, field, card);
  },
};

// Per-card metadata, bucketed 256 cards per redis hash (bucket `i<id >> 8>`, field `<'p'|'l'><id % 256>`).
// Field 'p' and 'l' semantics are defined by callers — presumably parent id and length; verify against consumer.
export const Info = {
  get: (cardId: number, field: 'p' | 'l'): Promise<string> => {
    const bucket = `i${cardId >> 8}`;
    const hashField = field + (cardId % 256);
    return redis.hGet(bucket, hashField);
  },
  set: (cardId: number, field: 'p' | 'l', value: string | number): Promise<number> => {
    const bucket = `i${cardId >> 8}`;
    const hashField = field + (cardId % 256);
    return redis.hSet(bucket, hashField, value);
  },
};

// Reparent a set of cards: point each card's info at the new parent and
// register the cards in the parent's child set. Returns the pending redis writes.
export const setRedisParents = (cardIds: string[], parentId: number): Promise<number>[] => {
  const writes: Promise<number>[] = cardIds.map((id) => Info.set(+id, 'p', parentId.toString()));
  writes.push(redis.sAdd(`c${parentId}`, cardIds)); // add cards to parent's child list
  return writes;
};
// Child ids recorded for a card (empty when the card has no child set).
export const getChildren = (cardId: string): Promise<string[]> => {
  const members = redis.sMembers(`c${cardId}`);
  return members ?? Promise.resolve([]);
};

/*
  Given, per sentence of a candidate card, the id of the card that sentence matched
  (or null), decide which of those cards are likely real matches.
  Resolves to an array of card ids (string) or false entries for rejected matches,
  or null when nothing matched at all.
  NOTE(review): returning null is not covered by the declared
  Promise<(string | false)[]> return type — only compiles with strictNullChecks off; confirm intent.
*/
export const getMatching = async (matches: (string | null)[]): Promise<(string | false)[]> => {
  // If no matches
  if (!matches.find((el) => el !== null)) return null;

  // Calculates length of match in case there is a gap due to typo or collision
  const cards: CardMatches = {};
  for (let i = 0; i < matches.length; i++) {
    const id = matches[i];
    if (id === null) continue;
    // If new match, set current index as start and end at end of card, otherwise update end index
    // NOTE(review): end is initialized to matches.length - 1 (not i), so a card matching a
    // single sentence spans to the end of the candidate — confirm this is the intended default.
    cards[id] ? (cards[id].end = i) : (cards[id] = { start: i, end: matches.length - 1 });
  }

  // Filter out probably false matches
  return Promise.all(
    toPairs(cards).map(async ([key, value]) => {
      const { start, end } = value;
      // If match starts at start or ends at end it is probably a real match
      // NOTE(review): `start >= EDGE_TOLERANCE` contradicts this comment — a match that
      // "starts at the start" would have start <= EDGE_TOLERANCE; verify the comparison direction.
      if (start >= EDGE_TOLERANCE || end >= matches.length - (EDGE_TOLERANCE + 1)) return key;
      // If it doesn't reach start or end, it should be the entire card inside this one
      // ('l' field holds the stored card's length; allow INSIDE_TOLERANCE missing sentences)
      return end - start - +(await Info.get(+key, 'l')) <= INSIDE_TOLERANCE && key;
    }),
  );
};
1 change: 1 addition & 0 deletions src/lib/debate-tools/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@ export * from './styles';
export * from './tokens';
export * from './parse';
export * from './id';
export * from './duplicate';
133 changes: 62 additions & 71 deletions src/lib/debate-tools/markup.ts
Original file line number Diff line number Diff line change
@@ -1,16 +1,9 @@
import ch from 'cheerio';
import {
TextBlock,
getStyleNameByXml,
TokenStyle,
SectionStyleName,
simplifyTokens,
tokensToDocument,
getOutlineLvlName,
} from './';
import { TextBlock, TokenStyle, TextToken, simplifyTokens, tokensToDocument } from './tokens';
import { getStyleNameByXml, getOutlineLvlName } from './styles';
import { Parser as XmlParser } from 'htmlparser2';

export const markupToDocument = async (xml: string, styles: string): Promise<Buffer> => {
const tokens = markupToTokens(xml, styles, { simplified: true });
const tokens = await markupToTokens(xml, styles, { simplified: true });
const buffer = await tokensToDocument(tokens);
return buffer;
};
Expand All @@ -19,71 +12,69 @@ interface TokensOption {
simplified: boolean;
}

export const markupToTokens = (document: string, styles: string, options?: TokensOption): TextBlock[] => {
const blocks = tokenize(document, styles);
export async function markupToTokens(document: string, styles: string, options?: TokensOption): Promise<TextBlock[]> {
const blocks = await tokenize(document, styles);
if (options?.simplified) {
const simplifiedBlocks = blocks.map((block) => simplifyTokens(block));
const simplifiedBlocks = blocks.map(simplifyTokens);
return simplifiedBlocks;
}
return blocks;
};

const getChild = (el, names: string[]) =>
names.reduce((acc, name) => {
return acc?.children?.find((child) => child.name === name);
}, el);

// Extract what formatting applies to block of text
const updateElFormating = (styleEl, current?: TokenStyle): TokenStyle => {
const formatting: TokenStyle = current ? { ...current } : { underline: false, strong: false, mark: false };
const styles = getChild(styleEl, ['w:rPr']);
if (!styles) return formatting;

const highlight = getChild(styles, ['w:highlight']);
const bold = getChild(styles, ['w:b']);
const underline = getChild(styles, ['w:u'])?.attribs['w:val'];

if (highlight) formatting.mark = true;
if (bold) formatting.strong = bold.attribs['w:val'] !== '0';
if (underline) formatting.underline = underline !== 'none';

return formatting;
};

const getBlockFormat = (block): SectionStyleName => {
const stlyeNameFormat = getStyleNameByXml(getChild(block, ['w:pPr', 'w:pStyle'])?.attribs['w:val']);
if (stlyeNameFormat !== 'text') return stlyeNameFormat;
}

// Sometimes uses outline level instead of header
const outlineLvl = getChild(block, ['w:pPr', 'w:outlineLvl'])?.attribs['w:val'];
return getOutlineLvlName(parseInt(outlineLvl) + 1);
const handleStyleTag = (name: string, attribs: Record<string, string>, styles: TokenStyle) => {
if (name === 'w:u') styles.underline = attribs['w:val'] !== 'none';
else if (name === 'w:highlight') styles.mark = true;
else if (name === 'w:b') styles.strong = attribs['w:val'] !== '0';
};

const tokenize = (xml: string, styles: string): TextBlock[] => {
const s = ch.load(styles, { xmlMode: true });
const d = ch.load(xml, { xmlMode: true });
const parseStyles = (styles: string): Promise<Record<string, TokenStyle>> =>
new Promise((resolve, reject) => {
const parsedStyles: Record<string, TokenStyle> = {};
let styleName = '';
new XmlParser(
{
onopentag(name, attribs) {
if (name === 'w:style') {
styleName = attribs['w:styleId'];
parsedStyles[styleName] = { underline: false, strong: false, mark: false };
} else if (styleName) handleStyleTag(name, attribs, parsedStyles[styleName]);
},
onend: () => resolve(parsedStyles),
onerror: reject,
},
{ xmlMode: true },
).parseComplete(styles);
});

// Generate map of style names to formatting from styles.xml
const xmlStyles: Record<string, TokenStyle> = s('w\\:style')
.get()
.reduce((acc, node) => {
acc[node.attribs['w:styleId']] = updateElFormating(node);
return acc;
}, {});

const tokens: TextBlock[] = d('w\\:p')
.get()
.map((block) => ({
format: getBlockFormat(block),
tokens: ch(block)
.children('w\\:r')
.get()
.map((node) => ({
text: ch(node).text(),
// combine formatting defined in text block and formatting from style name
format: updateElFormating(node, xmlStyles[getChild(node, ['w:rPr', 'w:rStyle'])?.attribs['w:val']]),
})),
}));

return tokens;
};
const tokenize = (xml: string, styles: string): Promise<TextBlock[]> =>
new Promise((resolve, reject) => {
parseStyles(styles).then((styleData) => {
const blocks: TextBlock[] = [];
let block: TextBlock;
let token: TextToken;
new XmlParser(
{
onopentag(name, attribs) {
if (name === 'w:p') block = { format: 'text', tokens: [] };
else if (name === 'w:pStyle') block.format = getStyleNameByXml(attribs['w:val']);
else if (name === 'w:outlineLvl') block.format = getOutlineLvlName(+attribs['w:val'] + 1);
else if (name === 'w:r') token = { text: '', format: { underline: false, strong: false, mark: false } };
else if (token) {
if (name === 'w:rStyle') token.format = { ...styleData[attribs['w:val']] };
else handleStyleTag(name, attribs, token.format);
}
},
ontext(data) {
if (token) token.text += data;
},
onclosetag(name) {
if (name === 'w:p' && block.tokens.length) blocks.push(block);
else if (name === 'w:r' && token.text) block.tokens.push(token);
},
onend: () => resolve(blocks),
onerror: reject,
},
{ xmlMode: true },
).parseComplete(xml);
});
});
6 changes: 2 additions & 4 deletions src/lib/debate-tools/parse.ts
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,7 @@ export const getIndexesWith = (blocks: TextBlock[], styles: StyleName[]): number
};

const getLastBlockWith = (blocks: TextBlock[], anchor: number, styles: StyleName[]): TextBlock => {
let ret;
const range = [...Array(anchor).keys()];
range.forEach((idx) => (ret = styles.includes(blocks[idx].format) ? blocks[idx] : ret));
return ret;
for (let i = anchor; i >= 0; i--) if (styles.includes(blocks[i].format)) return blocks[i];
};

export const getBlocksUntil = (blocks: TextBlock[], anchor: number, styles: StyleName[]): TextBlock[] => {
Expand All @@ -51,6 +48,7 @@ const parseCard = (doc: TextBlock[], anchor = 0, idx: number) => {
return {
tag: extractText([tag]),
cite: extractText([cite], ['strong']),
fullcite: extractText([cite]),
pocket: extractHeading('pocket'),
hat: extractHeading('hat'),
block: extractHeading('block'),
Expand Down
Loading

0 comments on commit c6f7e14

Please sign in to comment.