get_hypothesis_notes.py
import json
import os
import string

import numpy as np
import pandas as pd
from dateutil.parser import parse
from dotenv import load_dotenv, find_dotenv, set_key

# load environment variables from the local .env file
load_dotenv()

ROOT = os.path.dirname(os.path.abspath(__file__))

# read the exported Hypothesis annotations (expected in the current working directory)
with open('annotations.json', 'r', encoding='utf-8') as j:
    contents = json.loads(j.read())
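# For reference, a minimal sketch of the structure this script expects in
# annotations.json. Only the fields read below are shown; values are
# illustrative, and the real Hypothesis export carries more keys:
#
# {
#   "annotations": [
#     {
#       "updated": "2021-05-04T12:00:00.000000+00:00",
#       "uri": "https://example.com/article",
#       "text": "my note about this passage",
#       "tags": ["tag1", "tag2"],
#       "document": {"title": ["Example Article"]},
#       "links": {"incontext": "https://hyp.is/..."},
#       "target": [{"selector": [{"exact": "highlighted passage"}]}]
#     }
#   ]
# }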
# pull last import date from .env file
hypothesis_last_pull = os.getenv('hypothesis_last_pull')
print("hypothesis - Last date pulled: ", hypothesis_last_pull)

# if None, default to 1990 so every annotation is included
if hypothesis_last_pull is None:
    hypothesis_last_pull = "1990-01-01"

# filter annotations based on the last pulled date
contents['annotations'] = [a for a in contents['annotations']
                           if parse(a['updated'][:10]) >= parse(hypothesis_last_pull)]
print("new notes: ", len(contents['annotations']))
all_notes = []
# extract annotation info from each hypothesis document
for i in range(len(contents['annotations'])):
    anno = contents['annotations'][i]
    created = anno['updated']
    # fall back to a date-based title when the annotation has no document metadata
    if len(anno['document']) == 0:
        title = created[:10] + "-" + "no-title"
    else:
        title = anno['document']['title'][0]
        title = title.translate(str.maketrans('', '', string.punctuation)).lower()
        title = (created[:10] + "-" + title).replace(" ", "-")
    context_href = anno['links']['incontext']
    uri = anno['uri']
    tags = anno['tags']
    # the highlighted passage sits in the selector entry that carries an 'exact' key
    try:
        highlights = [s['exact'] for s in anno['target'][0]['selector'] if 'exact' in s.keys()][0]
    except Exception:
        print(i)
        print(anno['target'])
        highlights = ""
    note = anno['text']
    n = {}
    date = created[:10]
    tags = "#" + ' #'.join(tags)
    url = context_href
    n['title'] = title
    n['tags'] = tags
    n['date'] = date
    n['url'] = url
    n['highlights'] = "> " + highlights + "\n" + note
    n['uri'] = uri
    all_notes.append(n)
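# At this point each entry of all_notes is a flat dict, roughly (illustrative values):
#
# {'title': '2021-05-04-example-article',
#  'tags': '#tag1 #tag2',
#  'date': '2021-05-04',
#  'url': 'https://hyp.is/...',
#  'highlights': '> highlighted passage\nmy note about this passage',
#  'uri': 'https://example.com/article'}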
# create a dataframe and merge notes that share the same title / date / uri
grouped = pd.DataFrame(all_notes).groupby(["title", "date", "uri"])
df = grouped['highlights'].apply(list).reset_index(name='highlights')
df['tags'] = grouped['tags'].apply(lambda x: list(np.unique(x))).reset_index(name='tags')['tags'].values
df['url'] = grouped['url'].apply(lambda x: list(np.unique(x))).reset_index(name='url')['url'].values
# the most recent note date becomes the new "last pull" marker (written back to .env at the end)
last_pull = max(df['date'])
print(last_pull)
# bundle new notes to unique folder
base_path = os.path.join(ROOT, "out", "hypothesis", last_pull)
os.makedirs(base_path, exist_ok=True)
# create markdown files for each document with highlights and notes
for i, note_file in df.iterrows():
    title = note_file['title']
    tags = note_file['tags']
    date = note_file['date']
    uri = note_file['uri']
    with open(os.path.join(base_path, title + '.md'), 'w', encoding='utf-8') as out:
        # drop the YYYY-MM-DD- prefix from the title inside the note itself
        title_line = "# " + title[11:] + "\n\n"
        tag_line = "tags: " + " ".join([t for t in " ".join(tags).split(" ") if len(t) > 1]) + "\n"
        uri_line = "uri: [" + title[11:] + "](" + uri + ")\n"
        date_line = "date: " + date + "\n"
        high_line = ""
        for index, high in enumerate(note_file['highlights']):
            high_line += high + "\n"
            high_line += "[hypothesis ref](" + note_file['url'][0] + ")\n\n ----\n"
        out.writelines([title_line,
                        tag_line,
                        uri_line,
                        date_line,
                        "### highlight:\n", high_line])
# record the new last-pull date in the .env file for the next run
dotenv_file = find_dotenv()
set_key(dotenv_file, "hypothesis_last_pull", last_pull)