# d3m_downloader.py
import requests
import os
from bs4 import BeautifulSoup
import json
import subprocess
import shutil

# Index pages for the three D3M dataset cohorts.
COHORT_URLS = {
    'seed': "https://datadrivendiscovery.org/data/seed_datasets_current/",
    'll0': "https://datadrivendiscovery.org/data/training_datasets/LL0/",
    'll1': "https://datadrivendiscovery.org/data/training_datasets/LL1/"
}


def get_page(url):
    # Every request needs a D3M access token, read from the TOKEN environment variable.
    return requests.get(
        url, headers={'Authorization': os.environ['TOKEN']})


def make_dataset_doc_link(root, name):
    # Build the URL of a dataset's datasetDoc.json under the cohort root.
    return os.path.join(
        root, name, "{name}_dataset/datasetDoc.json".format(name=name))
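
# Illustrative call (the dataset name is a stand-in, not taken from this script):
# make_dataset_doc_link(COHORT_URLS['seed'], 'some_dataset') yields
# .../seed_datasets_current/some_dataset/some_dataset_dataset/datasetDoc.json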


def save_dataset_docs(ds_cohort, output_file):
    # Scrape the cohort index page and write every dataset's datasetDoc.json
    # (keyed by dataset name) to a single JSON file.
    root = COHORT_URLS[ds_cohort]
    response = get_page(root)
    soup = BeautifulSoup(response.text, "html.parser")
    links = soup.find_all('li')
    datasets = {}
    for l in links:
        if l.a is None:
            continue
        dataset_name = l.a.get('href').replace('/', '')
        dataset_doc_link = make_dataset_doc_link(root, dataset_name)
        print(dataset_doc_link)
        try:
            dataset_doc = get_page(dataset_doc_link).json()
        except (requests.RequestException, ValueError):
            # Skip entries whose datasetDoc.json is missing or not valid JSON.
            continue
        else:
            datasets[dataset_name] = dataset_doc
    with open(output_file, 'w') as d:
        json.dump(datasets, d)

# save_dataset_docs('ll0', 'll0_dataset_docs.json')
# save_dataset_docs('seed', 'seed_dataset_docs.json')


def download_dataset_from_name(name, output_dir):
    # Infer the cohort from the dataset name prefix; default to the seed datasets.
    ds_cohort = 'seed'
    if name.lower().startswith('ll0'):
        ds_cohort = 'll0'
    elif name.lower().startswith('ll1'):
        ds_cohort = 'll1'
    root = COHORT_URLS[ds_cohort]
    if os.path.exists(output_dir):
        replace = input("{} exists, remove and replace it? y/n ".format(output_dir))
        if replace.lower().startswith('y'):
            shutil.rmtree(output_dir)
        else:
            return
    url = os.path.join(root, name) + '/'
    token = os.environ['TOKEN']
    # Mirror the dataset directory with wget, skipping the auto-generated index pages.
    download_cmd = [
        'wget', '-q', '-r', '-np', '-R',
        'index.html*', '-nH', '--header',
        "Authorization:{}".format(token),
        url
    ]
    subprocess.run(download_cmd)
    # With -nH, wget recreates the remote path (e.g. data/seed_datasets_current/<name>)
    # under the current directory; move the dataset to output_dir and clean up the rest.
    root_on_disk = root.replace('https://datadrivendiscovery.org/', '')
    shutil.move(os.path.join(root_on_disk, name), output_dir)
    shutil.rmtree('data')
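
# A minimal usage sketch (the dataset name and output path are hypothetical;
# TOKEN must be set in the environment and wget available on the PATH):
# download_dataset_from_name('LL0_some_dataset', 'downloads/LL0_some_dataset')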


def check_type(dataset, dtype):
    # Return True if the dataset (a parsed datasetDoc) contains a resource of
    # the requested type; 'tabular' requires every resource to be a table.
    if dtype == 'tabular':
        return check_tabular(dataset)
    for res in dataset['dataResources']:
        if dtype == res['resType']:
            return True
    return False


def check_tabular(dataset):
    all_table = True
    for res in dataset['dataResources']:
        if res['resType'] != 'table':
            all_table = False
            break
    return all_table
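
# Illustrative checks (the inline dicts are minimal stand-ins for a real datasetDoc):
# check_type({'dataResources': [{'resType': 'table'}]}, 'tabular')  # -> True
# check_type({'dataResources': [{'resType': 'image'}]}, 'image')    # -> True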


def download_dataset_from_type(types_doc, ds_cohort, dtype, output_dir,
                               max_datasets=None):
    # Download every dataset listed in the saved docs file that matches dtype,
    # stopping once max_datasets have been downloaded (if a limit is given).
    root = COHORT_URLS[ds_cohort]
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)
    downloaded = 0
    with open(types_doc) as f:
        d = json.load(f)
    for name, dataset in d.items():
        if check_type(dataset, dtype):
            print("Downloading {}".format(name))
            download_dataset_from_name(name, os.path.join(output_dir, name))
            downloaded += 1
            if max_datasets is not None and downloaded >= max_datasets:
                return
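
# End-to-end sketch (file names, cohort, and the download limit below are
# illustrative assumptions, not part of the original script):
# save_dataset_docs('seed', 'seed_dataset_docs.json')
# download_dataset_from_type('seed_dataset_docs.json', 'seed', 'tabular',
#                            'seed_tabular', max_datasets=5)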