-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'deduplication' into scraper
- Loading branch information
Showing
14 changed files
with
484 additions
and
130 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
// Max number of files being parsed concurrently, allows parsing to take place while waiting for database response | ||
export const CONCURRENT_PARSERS = 10; | ||
|
||
/* Allow small differences in matching cards to help with things like part of the cite being attached to the start of the card */ | ||
// If a card has EDGE_TOLERANCE different sentences at start or end, will be treated as if they matched all the way to the start or end | ||
export const EDGE_TOLERANCE = 1; | ||
// If a card has almost an entire card within it, with at most INSIDE_TOLERANCE sentences missing from the start or end, it will be treated as if the entire card matched | ||
export const INSIDE_TOLERANCE = 1; | ||
/* | ||
Regex used to split text into sentences | ||
Matches punctuation followed by (whitespace + capital letter) and allows citation numbers (ex. Sample text.123 Next sentence) | ||
Will fail in some weird cases, but should be good enough | ||
*/ | ||
export const SENTENCE_REGEX = /([.?!])+(?=\d*\s+[A-Z])/; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,13 +1,21 @@ | ||
import { PrismaClient } from '@prisma/client'; | ||
import { createClient } from 'redis'; | ||
|
||
// add prisma to the NodeJS global type | ||
// add prisma and redis to the NodeJS global type | ||
interface CustomNodeJsGlobal extends NodeJS.Global { | ||
prisma: PrismaClient; | ||
redis: ReturnType<typeof createClient>; | ||
} | ||
|
||
// Prevent multiple instances of Prisma Client in development | ||
// Prevent multiple instances of databases in development | ||
declare const global: CustomNodeJsGlobal; | ||
|
||
export const db = global.prisma || new PrismaClient(); | ||
export const redis = global.redis || createClient(); | ||
|
||
if (process.env.NODE_ENV === 'development') global.prisma = db; | ||
// In development the client may be the one cached on `global` by an earlier load of this module.
// node-redis rejects connect() on a client whose socket is already open, so only connect a
// client that has not been opened yet.
if (!redis.isOpen) {
  void redis.connect();
}
|
||
if (process.env.NODE_ENV === 'development') { | ||
global.prisma = db; | ||
global.redis = redis; | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
import { toPairs } from 'lodash'; | ||
import { redis } from 'app/lib/db'; | ||
import { createHash } from 'crypto'; | ||
import { EDGE_TOLERANCE, INSIDE_TOLERANCE, SENTENCE_REGEX } from 'app/constants'; | ||
|
||
type CardMatches = Record<number, { start: number; end: number }>; | ||
export interface DedupTask { | ||
text: string; | ||
id: number; | ||
callback: (value: unknown) => void; | ||
} | ||
|
||
/**
 * Breaks text into normalized sentences used for matching.
 *
 * The text is split on SENTENCE_REGEX; every resulting piece is reduced to its letters only
 * (A-Z, case-insensitive) and lowercased, and pieces with fewer than `cutoff` letters are dropped.
 *
 * @param text Text to split
 * @param cutoff Minimum number of letters a piece needs to be kept
 * @returns The normalized sentences, or undefined when `text` is null or undefined
 */
export const getSentences = (text: string, cutoff = 20): string[] | undefined => {
  if (text == null) return undefined;

  const sentences: string[] = [];
  for (const piece of text.split(SENTENCE_REGEX)) {
    const letters = piece.replace(/[^A-Z]/gi, '').toLowerCase();
    if (letters.length >= cutoff) sentences.push(letters);
  }
  return sentences;
};
|
||
/*
  Redis stores small hashes in a memory efficient way
  Grouping values into hash buckets drastically reduces the per-value overhead compared to one key per value
  https://redis.io/topics/memory-optimization
*/
const getSentenceKey = (sentence: string): [string, string] => {
  const digest = createHash('md5').update(sentence).digest('base64');
  // Every base64 character carries 6 bits:
  // the first 3 characters (18 bits, around 260k possible buckets) name the bucket,
  // the following 6 characters (36 bits) are the field inside that bucket
  const bucket = `s${digest.substring(0, 3)}`;
  const field = digest.substring(3, 9);
  return [bucket, field];
};
/** Redis accessors for the value stored under a sentence's hashed bucket/field (see getSentenceKey). */
export const Sentence = {
  // Reads the value stored for this sentence
  get: (sentence: string): Promise<string> => {
    const [bucket, field] = getSentenceKey(sentence);
    return redis.hGet(bucket, field);
  },
  // Stores `card` as the value for this sentence
  set: (sentence: string, card: number): Promise<number> => {
    const [bucket, field] = getSentenceKey(sentence);
    return redis.hSet(bucket, field, card);
  },
};
|
||
// Maps a card id and info field to its redis [hash key, hash field]: 256 consecutive ids share one hash.
// Math.floor(cardId / 256) gives the same bucket as `cardId >> 8` for every 32-bit id, but unlike the
// bitwise shift it does not wrap around for ids of 2^31 and above.
const getInfoKey = (cardId: number, field: 'p' | 'l'): [string, string] => [
  `i${Math.floor(cardId / 256)}`,
  field + (cardId % 256),
];

/**
 * Redis accessors for per-card info fields, stored in bucketed hashes.
 * Field 'p' holds the id of the card's parent; field 'l' is presumably the card's length in
 * sentences (it is compared against match spans in getMatching) - TODO confirm against the writer.
 */
export const Info = {
  get: (cardId: number, field: 'p' | 'l'): Promise<string> => redis.hGet(...getInfoKey(cardId, field)),
  set: (cardId: number, field: 'p' | 'l', value: string | number): Promise<number> =>
    redis.hSet(...getInfoKey(cardId, field), value),
};
|
||
/**
 * Records `parentId` as the parent of every card in `cardIds` and adds those cards to the
 * parent's child set (`c<parentId>`).
 *
 * @param cardIds Ids of the child cards, as numeric strings
 * @param parentId Id of the parent card
 * @returns The pending redis writes (one per card info update, then the set update); empty when
 *          there are no cards
 */
export const setRedisParents = (cardIds: string[], parentId: number): Promise<number>[] => {
  // SADD needs at least one member, so issuing it for an empty list would only produce a rejected write
  if (cardIds.length === 0) return [];

  // Update card infos with new parent
  const infoUpdates = cardIds.map((id) => Info.set(+id, 'p', parentId.toString()));
  // Add cards to parent's child list
  return [...infoUpdates, redis.sAdd(`c${parentId}`, cardIds)];
};
/**
 * Fetches the ids of the cards stored in the child set (`c<cardId>`) of the given card.
 * No fallback is needed for a missing set: sMembers always returns a promise, and Redis reports a
 * non-existent key as an empty set.
 */
export const getChildren = (cardId: string): Promise<string[]> => redis.sMembers(`c${cardId}`);
|
||
/**
 * Decides which of the cards matched by a text's sentences are probably real matches.
 *
 * @param matches For each sentence of the text, the id of the card that sentence matched, or null
 * @returns null when no sentence matched anything; otherwise one entry per distinct matched card:
 *          the card id when the match is accepted, false when it is rejected as a probable false match
 */
export const getMatching = async (matches: (string | null)[]): Promise<(string | false)[] | null> => {
  // If no matches
  if (!matches.some((el) => el !== null)) return null;

  // Calculates length of match in case there is a gap due to typo or collision
  const cards: CardMatches = {};
  for (let i = 0; i < matches.length; i++) {
    const id = matches[i];
    if (id === null) continue;
    if (cards[id]) {
      // Card seen before, extend its match to the current sentence
      cards[id].end = i;
    } else {
      // New match, starts at the current index; end defaults to the last sentence until updated
      cards[id] = { start: i, end: matches.length - 1 };
    }
  }

  // Filter out probably false matches
  return Promise.all(
    toPairs(cards).map(async ([key, { start, end }]) => {
      // If match starts within EDGE_TOLERANCE sentences of the start, or ends within
      // EDGE_TOLERANCE sentences of the end, it is probably a real match
      if (start <= EDGE_TOLERANCE || end >= matches.length - (EDGE_TOLERANCE + 1)) return key;
      // If it doesn't reach start or end, it should be the entire card inside this one
      // NOTE(review): this compares (matched span - stored 'l' value) against INSIDE_TOLERANCE, which
      // also accepts spans much shorter than 'l'; confirm whether the operands should be swapped
      return end - start - +(await Info.get(+key, 'l')) <= INSIDE_TOLERANCE && key;
    }),
  );
};
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.