This repository has been archived by the owner on Sep 8, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrelation.py
71 lines (57 loc) · 1.97 KB
/
relation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
"""
The basic idea is to take two groups of papers (a domain and a codomain)
and, for each paper in the domain, find any references to papers in the codomain.
"""
import pdf_feeder
import pdf_parser
def testload_domain(path='/data/dtriac/dtra-covid/Papers'):
    """Load and parse the test domain papers served by a ``LocalFeeder``.

    Args:
        path: Location passed to ``pdf_feeder.LocalFeeder`` together with the
            ``'pdf'`` type argument. It defaults to the location that used to
            be hard-coded, so zero-argument calls behave exactly as before.

    Returns:
        A list with the result of ``pdf_parser.parse_pdf`` for every item the
        feeder yields, in feed order. Items for which ``parse_pdf`` returns
        ``None`` are skipped.
    """
    local_feeder = pdf_feeder.LocalFeeder(path, 'pdf')
    # map() is lazy, so feeding and parsing stay interleaved one paper at a time.
    parsed_papers = map(pdf_parser.parse_pdf, local_feeder.feed())
    return [parsed for parsed in parsed_papers if parsed is not None]
def testload_codomain(path='./test'):
    """Load and parse the test codomain papers served by a ``LocalFeeder``.

    Args:
        path: Location passed to ``pdf_feeder.LocalFeeder`` together with the
            ``'pdf'`` type argument. It defaults to ``'./test'``, the value
            that used to be hard-coded, so zero-argument calls behave exactly
            as before.

    Returns:
        A list with the result of ``pdf_parser.parse_pdf`` for every item the
        feeder yields, in feed order. Items for which ``parse_pdf`` returns
        ``None`` are skipped.
    """
    local_feeder = pdf_feeder.LocalFeeder(path, 'pdf')
    # map() is lazy, so feeding and parsing stay interleaved one paper at a time.
    parsed_papers = map(pdf_parser.parse_pdf, local_feeder.feed())
    return [parsed for parsed in parsed_papers if parsed is not None]
def load_domain():
    """Load and parse the domain papers served by an ``ArxivFeeder``.

    Returns:
        A list with the result of ``pdf_parser.parse_pdf`` for every item that
        ``pdf_feeder.ArxivFeeder().feed()`` yields, in feed order. Items for
        which ``parse_pdf`` returns ``None`` are skipped.
    """
    feeder = pdf_feeder.ArxivFeeder()
    # map() is lazy, so each paper is parsed as soon as the feeder yields it.
    parsed_papers = map(pdf_parser.parse_pdf, feeder.feed())
    return [parsed for parsed in parsed_papers if parsed is not None]
def load_codomain(path='/data/dtriac/dtriac-534/spv1-results'):
    """Load and parse the codomain documents served by a ``LocalFeeder``.

    Args:
        path: Location passed to ``pdf_feeder.LocalFeeder`` together with the
            ``'json'`` type argument. It defaults to the location that used to
            be hard-coded, so zero-argument calls behave exactly as before.

    Returns:
        A list with the result of ``pdf_parser.parse_json`` for every item the
        feeder yields, in feed order. Items for which ``parse_json`` returns
        ``None`` are skipped.
    """
    local_feeder = pdf_feeder.LocalFeeder(path, 'json')
    # map() is lazy, so feeding and parsing stay interleaved one item at a time.
    parsed_papers = map(pdf_parser.parse_json, local_feeder.feed())
    return [parsed for parsed in parsed_papers if parsed is not None]
def find_citation(domain, codomain):
    """Map each domain paper to the codomain papers it cites.

    Args:
        domain: Iterable of paper objects. Each must provide a
            ``cites(other)`` method and a meaningful ``str()``.
        codomain: Iterable of paper objects to look for. It may be a one-shot
            iterator; it is materialized once up front.

    Returns:
        A dict keyed by ``str(new_paper)``. Each value is the list of
        ``str(old_paper)`` for every codomain paper the domain paper cites,
        in codomain order and without duplicates. Domain papers that cite
        nothing get no entry. Domain papers sharing the same ``str()`` share
        one merged entry.
    """
    # codomain is scanned once per domain paper; without this copy a generator
    # would be exhausted after the first domain paper and later papers would
    # silently report no citations.
    candidates = list(codomain)
    citations = {}
    for new_paper in domain:
        cited_here = [str(old_paper) for old_paper in candidates
                      if new_paper.cites(old_paper)]
        if not cited_here:
            continue
        # setdefault keeps any list already stored under the same key, so
        # papers with identical str() values accumulate into one entry.
        cited = citations.setdefault(str(new_paper), [])
        for old_key in cited_here:
            if old_key not in cited:
                cited.append(old_key)
    return citations
if __name__ == '__main__':
    # Script entry point: load both paper groups, report how many documents
    # each one produced, then print the citation mapping as indented JSON.
    import json

    dom = load_domain()
    print(f'done reading domain, {len(dom)} documents loaded')

    cod = load_codomain()
    print(f'done reading codomain, {len(cod)} documents loaded')

    citation_map = find_citation(dom, cod)
    print(json.dumps(citation_map, indent=2))