-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathosf_files.py
199 lines (166 loc) · 6.68 KB
/
osf_files.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
#!/usr/bin/env python3
import click
from datetime import datetime
from tqdm import tqdm
import re
import math
import requests
import os
import shutil
import os.path as op
import subprocess
import tarfile
import json
import pathlib
# URL template for downloading the file stored under a given OSF id.
OSF_DOWNLOAD = "https://osf.io/{}/download"
# URL template for the DataCite-formatted JSON metadata of a given OSF id.
OSF_METADATA = "https://osf.io/{}/metadata?format=datacite-json"
# Maps each presentation directory name to the OSF id of its assets tarball.
OSF_URL = {
    '2023-10-02_Winawer-lab-mtg': 'spu5e',
    '2024-06-25_SAB': 'z8ryf',
    '2024-07-12_CSHL': 'nvk85',
    '2024-07-15_dana': 'qb9ec',
}
@click.group()
def cli():
    # Root command group: it does no work itself, the subcommands added via
    # cli.add_command() carry all the behavior.
    return None
@click.command()
@click.argument('dir')
def package_assets(dir: str):
    """Package assets directory into a tar ball.
    DIR is the path to the assets directory (e.g.,
    2023-10-02_Winawer-lab-mtg/assets/)
    """
    # A single trailing path separator is dropped so the archive is named
    # "<dir>.tar.gz" rather than "<dir>/.tar.gz".
    source = dir[:-1] if dir[-1] == op.sep else dir
    archive_path = source + '.tar.gz'
    print(f"Packaging {source} into {archive_path}")
    with tarfile.open(archive_path, 'w:gz') as archive:
        archive.add(source)
@click.command()
@click.argument('tar_path')
def upload(tar_path: str):
    """Upload assets tarball to OSF.

    TAR_PATH is the path to the assets tarball (e.g.,
    2023-10-02_Winawer-lab-mtg/assets.tar.gz)

    Note that this requires the osf command line tool to be set up (see README
    for details)
    """
    # The remote file name is the local path with every path separator
    # replaced by an underscore, e.g. "talk/assets.tar.gz" becomes
    # "talk_assets.tar.gz".
    tar_name = tar_path.replace(op.sep, '_')
    result = subprocess.run(
        ['osf', 'upload', '-f', tar_path, f"osfstorage/{tar_name}"]
    )
    # Surface a failed upload instead of exiting 0 as if it had succeeded.
    if result.returncode != 0:
        raise click.ClickException(
            f"osf upload of {tar_path} failed with exit code "
            f"{result.returncode}"
        )
def download_url(url: str, destination_path: str):
    """Helper function to download `url` to `destination_path`

    The response body is streamed to disk in blocks of 1024*1024 bytes while
    a tqdm progress bar counts the blocks.

    Parameters
    ----------
    url :
        Address to fetch.
    destination_path :
        Local file to write; it is opened in binary write mode.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status, so an error page is
        never written out as if it were the requested file.
    Exception
        If the server sent a content-length header and the number of bytes
        written differs from it.
    """
    block_size = 1024*1024
    wrote = 0
    # Streaming, so we can iterate over the response; the context manager
    # releases the connection even if writing fails part-way.
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        # Total size in bytes (0 when the server did not report one).
        total_size = int(r.headers.get('content-length', 0))
        # True division, so a final partial block is counted; leave the
        # total unknown (None) when the server gave no size.
        n_blocks = math.ceil(total_size / block_size) if total_size else None
        with open(destination_path, 'wb') as f:
            for data in tqdm(r.iter_content(block_size), unit='MB',
                             unit_scale=True, total=n_blocks):
                wrote += len(data)
                f.write(data)
    if total_size != 0 and wrote != total_size:
        raise Exception(f"Error downloading from {url}!")
def extract_tar(path: str):
    """Unpack the tarball at `path` into the directory holding it, then
    delete the tarball.
    """
    target_dir = op.dirname(path)
    with tarfile.open(path) as archive:
        archive.extractall(target_dir)
    # Only reached when extraction did not raise, so a failed extraction
    # leaves the tarball on disk.
    os.remove(path)
def _get_date_modified(dir_path: str):
    """Gets date modified for OSF object

    Parameters
    ----------
    dir_path :
        Key of ``OSF_URL`` identifying the object. Trailing path separators
        are ignored, so ``name/`` and ``name`` are equivalent.

    Returns
    -------
    The ``date`` value of the single ``Updated`` entry in the ``dates`` list
    of the object's DataCite JSON metadata.

    Raises
    ------
    KeyError
        If `dir_path` is not a key of ``OSF_URL``.
    requests.HTTPError
        If the metadata request returns an error status.
    Exception
        If the metadata does not contain exactly one ``Updated`` date.
    """
    # Tolerate "name/" (and "name\\" where that is the separator).
    url = OSF_URL[dir_path.rstrip('/' + op.sep)]
    r = requests.get(OSF_METADATA.format(url))
    # Fail on an HTTP error rather than on a confusing JSON parse error.
    r.raise_for_status()
    meta = json.loads(r.text)
    mod = [d for d in meta['dates'] if d['dateType'] == 'Updated']
    if len(mod) != 1:
        raise Exception(f"Unable to find date modified for {dir_path}!")
    return mod[0]['date']
@click.command()
@click.argument('dir_path')
def get_date_modified(dir_path: str):
    """Gets date modified for OSF object
    """
    # Thin CLI wrapper: the lookup lives in a plain function because a click
    # command can't be called from elsewhere within this script.
    click.echo(f"{dir_path} modified: {_get_date_modified(dir_path)}")
@click.command()
@click.argument('dir_path')
def download(dir_path: str):
    """Download assets tarball from OSF then extract and arrange files.

    DIR_PATH is the path of the directory whose associated files you want to
    download from the OSF (e.g., 2023-10-02_Winawer-lab-mtg/) or `all` (to
    download all of them)

    Note that this does NOT require the osf command line tool to be set up.
    """
    if dir_path != 'all':
        # Accept the trailing separator shown in the example above; the
        # OSF_URL keys are bare directory names.
        name = dir_path.rstrip('/' + op.sep)
        if name not in OSF_URL:
            raise click.BadParameter(
                f"{dir_path!r} is not a known presentation; expected `all` "
                f"or one of: {', '.join(OSF_URL)}",
                param_hint='DIR_PATH',
            )
        urls = {name: OSF_URL[name]}
    else:
        urls = OSF_URL
    for k, v in urls.items():
        print(f"Downloading {k}")
        download_url(OSF_DOWNLOAD.format(v), 'tmp.tar.gz')
        remote_modified = _get_date_modified(k)
        remote_modified = datetime.strptime(remote_modified, '%Y-%m-%d').timestamp()
        if op.exists(op.join(k, 'assets')):
            local_modified = op.getmtime(op.join(k, 'assets'))
            if remote_modified < local_modified:
                click.echo("You have local changes more recent than the remote tarball, do you wish to continue extracting?")
                choice = input('y/n: ')
                while choice not in ['y', 'n']:
                    click.echo("Please enter y or n")
                    # Ask again; without re-reading the answer this loop
                    # could never terminate.
                    choice = input('y/n: ')
                if choice == 'n':
                    # Keep the download next to the presentation directory
                    # so it can be inspected and extracted by hand.
                    shutil.move('tmp.tar.gz', k + '.tar.gz')
                    click.echo(f"Tarball is located at {op.abspath(k + '.tar.gz')}, check its contents with "
                               "tar tvf PATH to see modification times and extract it yourself")
                    continue
        extract_tar('tmp.tar.gz')
@click.command()
@click.argument('dir_path')
def check_assets(dir_path: str):
    """Check to make sure DIR_PATH contains all necessary assets
    DIR_PATH is the path of the directory whose associated files you want to
    check (e.g., 2023-10-02_Winawer-lab-mtg/) or `all` (to check all of them)
    """
    if dir_path == 'all':
        # Presentations are the directories beside this script that contain
        # a slides.md; their names must match the OSF_URL keys exactly.
        script_dir = pathlib.Path(__file__).parent
        found = [slides_file.parent.name
                 for slides_file in script_dir.glob('*/slides.md')]
        missing_from_dict = set(found).difference(OSF_URL.keys())
        missing_from_repo = set(OSF_URL.keys()).difference(found)
        if missing_from_dict:
            to_print = "\n\t".join(missing_from_dict)
            raise Exception("Following presentations aren't found in OSF_URL dict"
                            f", don't know how to download their assets!\n\t{to_print}")
        if missing_from_repo:
            to_print = "\n\t".join(missing_from_repo)
            raise Exception("Following presentations are found in OSF_URL dict"
                            f", but not in repo!\n\t{to_print}")
        targets = list(OSF_URL.keys())
    else:
        targets = [dir_path]
    for target in targets:
        with open(op.join(target, 'slides.md')) as slides_file:
            slides = slides_file.read()
        # Asset file names referenced by the slides, "assets/" prefix removed.
        referenced = [match.replace('assets/', '')
                      for match in re.findall('assets/[A-Za-z0-9_.-]+', slides)]
        on_disk = os.listdir(op.join(target, 'assets'))
        only_slides = [name for name in referenced if name not in on_disk]
        if only_slides:
            listing = '\n'.join(only_slides)
            raise Exception(f"The following assets (for {target}) are only in slides file (not in assets folder)!"
                            f"\n{listing}")
        only_local = [name for name in on_disk if name not in referenced]
        if only_local:
            listing = '\n'.join(only_local)
            raise Exception(f"The following assets (for {target}) are only in assets folder (not in slides file)!"
                            f"\n{listing}")
# Attach every subcommand to the root group, in the same order as before.
for _command in (package_assets, upload, download, get_date_modified,
                 check_assets):
    cli.add_command(_command)

if __name__ == '__main__':
    cli()