Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open evidence and Wiki scraper #12

Draft
wants to merge 53 commits into
base: v3
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
53 commits
Select commit Hold shift + click to select a range
6f214b5
Add project info to Readme
arvind-balaji Jan 4, 2021
42cf220
Rewrite documentToTokens to use document.xml
D0ugins Oct 17, 2021
c2be8f4
Rewrite tokensToMarkup to use string
D0ugins Oct 18, 2021
68b84aa
Fix formatting
D0ugins Oct 26, 2021
7ab51bc
Fix incorrect formatting in some documents
D0ugins Dec 20, 2021
d5ca2bf
Fix detection of headings on some documents
D0ugins Dec 20, 2021
b5fa204
Fix debugging
D0ugins Jan 21, 2022
d70f4c0
Implement generateFile action
D0ugins Jan 21, 2022
b32188d
Fix prisma output location
D0ugins Jan 22, 2022
da8afb2
Merge branch 'parser-rewrite' into generateFile
D0ugins Jan 22, 2022
9eb51c1
Add dev script
D0ugins Jan 23, 2022
b975145
Add downloading round data
D0ugins Feb 15, 2022
525246d
Add open source downloading
D0ugins Feb 17, 2022
9925a48
Switch to downloading through api
D0ugins Feb 17, 2022
873fcdd
Add openev downloading
D0ugins Feb 18, 2022
acfc4d7
Fix crash on invalid document
D0ugins Feb 19, 2022
ff11639
Fix cite downloading
D0ugins Feb 20, 2022
00bbc15
Fix closing of tags at the end of cards
D0ugins Feb 20, 2022
ffc663c
Switch to 64 bit file ids
D0ugins Feb 20, 2022
88a3545
Fix out of memory error
D0ugins Mar 8, 2022
b40f02f
Merge branch 'v3' into scraper
D0ugins Mar 21, 2022
2eaeabd
Merge branch 'scraper' of github.com:D0ugins/debate-cards into scraper
D0ugins Mar 21, 2022
5d02ef2
Fix out of memory error
D0ugins Mar 21, 2022
fb0aab8
Parser improvements
D0ugins Mar 23, 2022
23b23ee
Implement card deduplication
D0ugins Mar 29, 2022
13a9f38
Merge branch 'deduplication' into scraper
D0ugins Mar 30, 2022
589b0d0
Decouple parser and dedup module
arvind-balaji May 13, 2022
2fcc715
Merge branch 'deduplication' of https://github.com/d0ugins/debate-car…
arvind-balaji May 13, 2022
13a1324
Add EvidenceBucket entity
arvind-balaji May 13, 2022
7dc1ec5
Clean up deduplication code
D0ugins May 13, 2022
a6a0d37
Switch to loading text during drain
D0ugins May 14, 2022
6d3f295
Fix issues with detecting false matches in deduplication
D0ugins May 15, 2022
11ff1ab
Implement concurrent deduplication
D0ugins May 16, 2022
1acf284
Speed up simplify tokens
D0ugins May 16, 2022
ee66830
Improve cite detection
D0ugins May 16, 2022
f2f668f
Merge branch 'citeFix' into deduplication
D0ugins May 16, 2022
98ee8bd
Merge branch 'deduplication' into scraper
D0ugins May 16, 2022
e533326
Merge branch 'scraper' of github.com:D0ugins/debate-cards into scraper
D0ugins May 16, 2022
aa130db
Change sentence storage method
D0ugins May 22, 2022
49d9e6c
change locations of deduplication functions
D0ugins May 23, 2022
505b5c2
Improve match filtering
D0ugins May 24, 2022
cc78244
Only return needed fields from db queries
D0ugins May 31, 2022
ed763da
Fix failed upserts on evidenceBuckets
D0ugins May 31, 2022
626c6ef
Factor out action queue logic
D0ugins Jun 1, 2022
cbcc6a7
Merge branch 'deduplication' into scraper
D0ugins Jun 1, 2022
0ca3011
Restructure wiki modules
D0ugins Jun 1, 2022
daedad6
Add zod types for wiki api
D0ugins Jun 1, 2022
482d806
Improve error messages
D0ugins Jun 1, 2022
b3955a5
Add strongly typed api responses
D0ugins Jun 1, 2022
77d56c0
Fix wiki downloading
D0ugins Jun 2, 2022
d5b89f8
Merge branch 'scraper' of github.com:D0ugins/debate-cards into scraper
D0ugins Jun 13, 2022
e401ede
Merge branch 'v3' into scraper
D0ugins Jun 25, 2022
b0ecca8
Fix round-file relation
D0ugins Jul 17, 2022
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 2 additions & 9 deletions .env.sample
Original file line number Diff line number Diff line change
@@ -1,9 +1,2 @@
API_PREFIX=/v1

SOLR_PORT=<MY_SOLR_PORT>
SOLR_CORE=<MY_SOLR_CORE>
SOLR_HOST=<MY_SOLR_HOST>

MONGO_DEV_CONN_URL=mongodb://<MY_MONGO_INSTANCE>
MONGO_LOCAL_CONN_URL=mongodb://<MY_MONGO_INSTANCE>
MONGO_DB_NAME=<MY_DB_NAME>
DATABASE_URL=postgresql://username:password@localhost:5432/debate-cards
DOCUMENT_PATH=./documents
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,5 @@ prisma/migrations
!.yarn/sdks
!.yarn/versions
*.docx
*.html
*.html
tmp
12 changes: 5 additions & 7 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -11,19 +11,16 @@
},
"dependencies": {
"@prisma/client": "^3.8.1",
"axios": "^0.25.0",
"cheerio": "1.0.0-rc.3",
"docx": "^6.0.3",
"dotenv": "^6.2.0",
"htmlparser2": "^7.2.0",
"lodash": "^4.17.15",
"mammoth": "^1.4.19",
"node-pandoc-promise": "^0.0.6",
"p-ratelimit": "^1.0.1",
"redis": "^4.0.4",
"sqlite3": "^5.0.0",
"tmp-promise": "^3.0.2",
"typescript-collections": "^1.3.3",
"unzipper": "^0.10.11"
"unzipper": "^0.10.11",
"zod": "^3.17.3"
},
"devDependencies": {
"@types/cheerio": "^0.22.21",
Expand All @@ -50,7 +47,8 @@
"prettier": "^2.0.5",
"prisma": "^3.8.1",
"ts-node-dev": "^1.0.0-pre.50",
"tsconfig-paths": "^3.12.0",
"tscpaths": "^0.0.9",
"typescript": "^4.5.2"
}
}
}
34 changes: 33 additions & 1 deletion prisma/schema.prisma
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,32 @@ model Evidence {
root EvidenceBucket? @relation("root")
}

// A single round entry loaded from a wiki, optionally linked to its open source File.
model Round {
id Int @id @default(autoincrement())
createdAt DateTime @default(now())
updatedAt DateTime @updatedAt
// Unique identifier used to look up and upsert rounds
gid String @unique
// Processing state of the round; new rows start as PENDING
status FileStatus @default(PENDING)

// Where the round comes from
wiki String
school String
team String
side Side

// Details of the round itself
entryDate DateTime
tournament String
roundNum String
opponent String
judge String

roundReport String?
cites String[]

// Optional open source document: the url it is found at and the File row it is stored as.
// The relation is keyed on File.gid (through openSourceId), not on File.id.
openSourceUrl String?
openSource File? @relation(fields: [openSourceId], references: [gid])
openSourceId String?
}

model File {
id Int @id @default(autoincrement())
createdAt DateTime @default(now())
Expand All @@ -63,19 +89,25 @@ model File {

evidence Evidence[]
tags Tags[]
Rounds Round[]
}

model EvidenceSet {
id Int @id @default(autoincrement())
createdAt DateTime @default(now())
updatedAt DateTime @updatedAt

name String
name String @unique
label String

files File[]
}

// Side of a round, used by Round.side
enum Side {
AFF
NEG
}

enum FileStatus {
PENDING
PROCESSING
Expand Down
5 changes: 2 additions & 3 deletions src/actions/addFile.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,10 @@ export type FileData = Omit<Prisma.FileCreateInput, ExcludedFileFields>;

export const onAddFile = new TypedEvent<{ gid: string }>();

export default async (data: FileData): Promise<any> => {
export default async (data: FileData): Promise<string> => {
const buffer = await readFile(data.path);

const { fileId: gid } = await makeId(buffer);
console.log(data.name, Date.now());
const doc = await db.file.upsert({
where: {
gid,
Expand All @@ -32,5 +31,5 @@ export default async (data: FileData): Promise<any> => {

onAddFile.emit({ gid: doc.gid });

return;
return doc.gid;
};
28 changes: 28 additions & 0 deletions src/actions/addRound.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
import { db, TypedEvent } from 'app/lib';
import { loadRound, RoundInfo } from 'app/lib/debate-tools/wiki';
import path from 'path';
import { DownloadInfo } from './downloadFile';

/** Emitted with the download details of a newly stored round whose status is still PENDING. */
export const onAddRound = new TypedEvent<DownloadInfo>();

/**
 * Loads a round from the wiki and stores it, unless a round with the same gid already exists.
 *
 * @param url - Passed to `loadRound` to fetch the round data
 * @param roundId - Passed to `loadRound` together with `url`
 * @param gid - Unique id of the round; used to look up an existing row and as the upsert key
 * @returns The existing round if one is found, otherwise the `gid` of the upserted round
 */
export default async ({ url, roundId, gid }: RoundInfo): Promise<{ gid: string }> => {
  const existing = await db.round.findUnique({ where: { gid } });
  if (existing) return existing;

  const data = { gid, ...(await loadRound(url, roundId)) };

  // Persist the round BEFORE announcing it. Consumers of the event such as downloadFile
  // update the round row by `roundGid`, which fails if the row has not been written yet.
  // This also avoids emitting an event for a round that failed to save.
  const round = await db.round.upsert({
    where: { gid },
    create: data,
    update: data,
    select: { gid: true },
  });

  if (data.status === 'PENDING') {
    const { wiki, school, team, side, tournament, roundNum } = data;
    onAddRound.emit({
      url: data.openSourceUrl,
      // Files are stored under DOCUMENT_PATH/<wiki>/<school>/<team>/<side>/
      filePath: path.join(process.env.DOCUMENT_PATH, wiki, school, team, side, `${tournament}-Round${roundNum}.docx`),
      evidenceSet: wiki,
      roundGid: gid,
    });
  }

  return round;
};
38 changes: 17 additions & 21 deletions src/actions/dedupeFile.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,28 +3,24 @@ import { findParent } from 'app/lib/debate-tools';
const updateLock: Record<number, Lock> = {};

export default async ({ gid }: { gid: string }): Promise<any> => {
try {
const { id, fulltext } = await db.evidence.findUnique({ where: { gid }, select: { id: true, fulltext: true } });
const { updates, parent } = await findParent(id, fulltext);
const { id, fulltext } = await db.evidence.findUnique({ where: { gid }, select: { id: true, fulltext: true } });
const { updates, parent } = await findParent(id, fulltext);

// Only wait if it actually exists, otherwise it won't get set in time
if (updateLock[parent]) await updateLock[parent].promise;
const lock = (updateLock[parent] = new Lock());
// Only wait if it actually exists, otherwise it won't get set in time
if (updateLock[parent]) await updateLock[parent].promise;
const lock = (updateLock[parent] = new Lock());

Children.set(parent, updates);
const bucket = await db.evidenceBucket.upsert({
where: { rootId: parent },
create: { rootId: parent },
update: { count: { increment: updates.length } },
});
await db.evidence.updateMany({
where: { id: { in: updates.map(Number) } },
data: { bucketId: bucket.id },
});
Children.set(parent, updates);
const bucket = await db.evidenceBucket.upsert({
where: { rootId: parent },
create: { rootId: parent },
update: { count: { increment: updates.length } },
});
await db.evidence.updateMany({
where: { id: { in: updates.map(Number) } },
data: { bucketId: bucket.id },
});

lock.unlock();
if (updateLock[parent] === lock) delete updateLock[parent];
} catch (e) {
console.error(e);
}
lock.unlock();
if (updateLock[parent] === lock) delete updateLock[parent];
};
32 changes: 32 additions & 0 deletions src/actions/downloadFile.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
import { db } from 'app/lib/db';
import { wikiDownload } from 'app/lib/request';
import addFile from 'app/actions/addFile';
import path from 'path';

/** Describes a file to download from the wiki and how to register it afterwards. */
export interface DownloadInfo {
/** Location of the file to download; the download is skipped when this is empty */
url: string;
/** Path the file is downloaded to; its basename becomes the name of the File record */
filePath: string;
/** Name of the EvidenceSet the new File record is connected to */
evidenceSet: string;
/** gid of the round the file belongs to; when set, the round is linked to the file and its status updated */
roundGid?: string;
}

/**
 * Downloads a file from the wiki, registers it through `addFile` and, when `roundGid` is given,
 * connects the resulting file to that round and marks the round as PROCESSED.
 *
 * Does nothing when `url` is empty.
 *
 * @throws Error if the download or any following step fails. When `roundGid` is given the round
 * is marked as ERROR before the error is rethrown.
 */
export default async ({ url, filePath, evidenceSet, roundGid }: DownloadInfo): Promise<void> => {
  // Nothing to download for rounds without an open source document
  if (!url) return;

  try {
    const result = await wikiDownload(url, filePath);
    if ('err' in result) throw new Error(`Failed to download file ${url}: ${result.err.message}`);

    const fileId = await addFile({
      name: path.basename(filePath),
      path: filePath,
      evidenceSet: { connect: { name: evidenceSet } },
    });

    if (roundGid)
      await db.round.update({
        where: { gid: roundGid },
        data: { openSource: { connect: { gid: fileId } }, status: 'PROCESSED' },
      });
  } catch (e: unknown) {
    // Anything can be thrown, so only read `.message` from real Error objects
    const message = e instanceof Error ? e.message : String(e);

    if (roundGid) {
      try {
        await db.round.update({ where: { gid: roundGid }, data: { status: 'ERROR' } });
      } catch (updateError: unknown) {
        // A failed status write must not hide the original failure, which is rethrown below
        console.error(`Failed to mark round ${roundGid} as ERROR:`, updateError);
      }
    }

    throw new Error(`Error downloading ${url}: ${message}`);
  }
};
4 changes: 2 additions & 2 deletions src/actions/parseFile.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ import { db, pipe } from 'app/lib';

import { documentToTokens, extractCards, makeChildId } from 'app/lib/debate-tools';

export default async ({ gid }: { gid: string }) => {
export default async ({ gid }: { gid: string }): Promise<void> => {
try {
const cards = await pipe(
(gid: string) => db.file.findUnique({ where: { gid }, select: { path: true } }),
Expand All @@ -16,7 +16,7 @@ export default async ({ gid }: { gid: string }) => {
for (const card of cards) await addEvidence({ ...card, gid: makeChildId(gid, card.index), file: { gid } });
await db.file.update({ where: { gid }, data: { status: 'PROCESSED' } });
} catch (e) {
console.error(e);
await db.file.update({ where: { gid }, data: { status: 'ERROR' } });
throw new Error(`Error parsing ${gid}: ${e.message}`);
}
};
2 changes: 2 additions & 0 deletions src/constants/index.ts
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
export * from './wiki';

// Max number of files being parsed concurrently, allows parsing to take place while waiting for database response
export const CONCURRENT_PARSERS = 10;
// Max number of cards being deduplicated concurrently
Expand Down
126 changes: 126 additions & 0 deletions src/constants/wiki.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
import { z } from 'zod';

// Splits a wiki name like hspolicy21 into the named groups `type` (the lowercase letters)
// and `year` (the trailing digits, absent when the name has none)
export const WIKI_NAME_REGEX = /^(?<type>[a-z]+)(?<year>\d+)?$/;
// Human readable labels keyed by wiki type
// NOTE(review): keys presumably correspond to the `type` group of WIKI_NAME_REGEX - confirm against callers
export const WIKITYPES = {
hspolicy: 'High School Policy',
hsld: 'High School LD',
hspf: 'High School PF',
opencaselist: 'College Policy',
openev: 'Open Evidence',
nfald: 'College LD',
} as const;

/*
Types for wiki api
Base Xwiki schema defined here https://github.com/xwiki/xwiki-platform/blob/master/xwiki-platform-core/xwiki-platform-rest/xwiki-platform-rest-model/src/main/resources/xwiki.rest.model.xsd
*/
/**
 * Schema for the `links` array attached to the wiki api responses below.
 * `href` must be a valid url; `type` and `hrefLang` may be null.
 */
const links = z.array(
  z.object({
    href: z.string().url(),
    rel: z.string(),
    type: z.string().nullable(),
    // Previously `z.null().nullable()`, which only ever accepted null and rejected any link that
    // carries a language. hrefLang is a string attribute in the XWiki model.
    hrefLang: z.string().nullable(),
  }),
);
// Field schemas shared by every object summary.
// Kept as a plain object (not a z.object) so it can be wrapped for OBJECT_SUMMARIES and spread into ROUND.
const summary = {
links,
id: z.string(),
guid: z.string(),
pageId: z.string(),
pageVersion: z.string(),
wiki: z.string(),
space: z.string(),
pageName: z.string(),
pageAuthor: z.string(),
pageAuthorName: z.string().nullable(),
className: z.string(),
number: z.number(),
headline: z.string().nullable(),
};

// Response containing a list of object summaries
export const OBJECT_SUMMARIES = z.object({
links,
objectSummaries: z.array(z.object(summary)),
});

// Response containing a list of spaces; xwikiAbsoluteUrl must be a valid url
export const SPACES = z.object({
links,
spaces: z.array(
z.object({
links,
id: z.string(),
wiki: z.string(),
name: z.string(),
home: z.string(),
xwikiRelativeUrl: z.string(),
xwikiAbsoluteUrl: z.string().url(),
}),
),
});

// Response containing a list of wikis; description and owner may be null
export const WIKIS = z.object({
links,
wikis: z.array(
z.object({
links,
id: z.string(),
name: z.string(),
description: z.string().nullable(),
owner: z.string().nullable(),
}),
),
});

// Response containing a list of attachments
export const ATTACHMENTS = z.object({
links,
attachments: z.array(
z.object({
links,
id: z.string(),
name: z.string(),
size: z.number(),
longSize: z.number(),
version: z.string(),
pageId: z.string(),
pageVersion: z.string(),
mimeType: z.string(),
author: z.string(),
authorName: z.string().nullable(),
// NOTE(review): numeric date, presumably a unix timestamp - confirm unit against the api
date: z.number(),
xwikiRelativeUrl: z.string(),
xwikiAbsoluteUrl: z.string().url(),
// Each hierarchy item needs a label, name, type and a valid url
hierarchy: z.object({
items: z.array(z.object({ label: z.string(), name: z.string(), type: z.string(), url: z.string().url() })),
}),
}),
),
});

/**
 * Builds the schema for a single property entry of a wiki object.
 * The name is matched as a literal, so the schema only accepts a property called exactly `name`.
 */
const property = <T extends string>(name: T) => {
  const propertyName = z.literal(name);
  return z.object({ links, name: propertyName, value: z.string(), type: z.string() });
};

// Info about specific objects found by querying https://openev.debatecoaches.org/rest/wikis/hspolicy21/classes/classname
// A round object: the shared summary fields plus its list of properties
export const ROUND = z.object({
...summary,
// Properties are listed explicitly as a tuple (fixed names in a fixed order) because
// building the list with .map messes up the inferred types
properties: z.tuple([
property('Cites'),
property('EntryDate'),
property('Judge'),
property('OpenSource'),
property('Opponent'),
property('Round'),
property('RoundID'),
property('RoundReport'),
property('Tags'),
property('Tournament'),
property('Video'),
]),
});
// Schema for a standalone property named 'Cites'
export const CITE = property('Cites');
Loading