-
Notifications
You must be signed in to change notification settings - Fork 26
/
run.py
202 lines (165 loc) · 6.29 KB
/
run.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
import os
import click
from kg_covid_19 import download as kg_download
from kg_covid_19 import transform as kg_transform
from kg_covid_19.make_holdouts import make_holdouts
from kg_covid_19.merge_utils.merge_kg import load_and_merge
from kg_covid_19.query import parse_query_rq, result_dict_to_tsv, run_query
from kg_covid_19.transform import DATA_SOURCES
# Root command group. Subcommands (download, transform, merge, query,
# holdouts) register themselves below via @cli.command().
# NOTE: deliberately no docstring — click would surface it as the group's
# --help text, which would change CLI output.
@click.group()
def cli():
    pass
@cli.command()
@click.option("yaml_file", "-y", required=True, default="download.yaml",
              type=click.Path(exists=True))
@click.option("output_dir", "-o", required=True, default="data/raw")
@click.option("ignore_cache", "-i", is_flag=True, default=False,
              help="ignore cache and download files even if they exist [false]")
def download(*args, **kwargs) -> None:
    """Downloads data files from list of URLs (default: download.yaml) into data
    directory (default: data/raw).

    Args:
        yaml_file: Specify the YAML file containing a list of datasets to download.
        output_dir: A string pointing to the directory to download data to.
        ignore_cache: If specified, will ignore existing files and download again.

    Returns:
        None.
    """
    # click delivers every option as a keyword argument; this thin CLI shim
    # forwards them untouched to the library-level downloader.
    kg_download(*args, **kwargs)
@cli.command()
@click.option("input_dir", "-i", default="data/raw", type=click.Path(exists=True))
@click.option("output_dir", "-o", default="data/transformed")
@click.option("sources", "-s", default=None, multiple=True,
              type=click.Choice(DATA_SOURCES.keys()))
def transform(*args, **kwargs) -> None:
    """Calls scripts in kg_covid_19/transform/[source name]/ to transform each source
    into nodes and edges.

    Args:
        input_dir: A string pointing to the directory to import data from.
        output_dir: A string pointing to the directory to output data to.
        sources: A list of sources to transform.

    Returns:
        None.
    """
    # Forward the CLI options verbatim; per-source dispatch happens inside
    # the library-level transform (keyed on DATA_SOURCES).
    kg_transform(*args, **kwargs)
@cli.command()
@click.option("yaml", "-y", type=click.Path(exists=True), default="merge.yaml")
@click.option("processes", "-p", type=int, default=1)
def merge(yaml: str, processes: int) -> None:
    """Use KGX to load subgraphs to create a merged graph.

    Args:
        yaml: A string pointing to a KGX compatible config YAML.
        processes: Number of processes to use.

    Returns:
        None.
    """
    # The heavy lifting lives in merge_utils; this is just the CLI entry point.
    load_and_merge(yaml, processes)
@cli.command()
@click.option("yaml", "-y", required=True, default=None, multiple=False)
@click.option("output_dir", "-o", default="data/queries/")
def query(
    yaml: str,
    output_dir: str,
    query_key: str = "query",
    endpoint_key: str = "endpoint",
    outfile_ext: str = ".tsv",
) -> None:
    """Perform a query of knowledge graph using a class contained in query_utils

    Args:
        yaml: A rq file containing a SPARQL query in grlc format:
            https://github.com/CLARIAH/grlc/blob/master/README.md
        output_dir: Directory to output results of query
        query_key: the key in the yaml file containing the query string
        endpoint_key: the key in the yaml file containing the sparql endpoint URL
        outfile_ext: file extension for output file [.tsv]

    Returns:
        None.
    """
    # Local renamed from `query` to avoid shadowing this command function's
    # own name inside its body.
    parsed = parse_query_rq(yaml)
    result_dict = run_query(query=parsed[query_key], endpoint=parsed[endpoint_key])
    # exist_ok=True replaces the racy exists()/makedirs() pair: the original
    # could raise FileExistsError if the directory appeared between the
    # check and the create.
    os.makedirs(output_dir, exist_ok=True)
    # Output file is named after the query file, e.g. foo.rq -> foo.tsv.
    outfile = os.path.join(
        output_dir, os.path.splitext(os.path.basename(yaml))[0] + outfile_ext
    )
    result_dict_to_tsv(result_dict, outfile)
@cli.command()
@click.option(
    "nodes",
    "-n",
    help="nodes KGX TSV file",
    default="data/merged/nodes.tsv",
    type=click.Path(exists=True),
)
@click.option(
    "edges",
    "-e",
    help="edges KGX TSV file",
    default="data/merged/edges.tsv",
    type=click.Path(exists=True),
)
@click.option(
    "output_dir",
    "-o",
    help="output directory",
    default="data/holdouts/",
    type=click.Path(),
)
@click.option(
    "train_fraction",
    "-t",
    help="fraction of input graph to use in training graph [0.8]",
    default=0.8,
    type=float,
)
@click.option(
    "validation", "-v", help="make validation set", is_flag=True, default=False
)
def holdouts(*args, **kwargs) -> None:
    """Make holdouts for ML training

    Given a graph (from formatted node and edge TSVs), output positive edges and negative
    edges for use in machine learning.
    \f
    To generate positive edges: a set of test positive edges equal in number to
    [(1 - train_fraction) * number of edges in input graph] are randomly selected from
    the edges in the input graph that is not part of a minimal spanning tree, such that
    removing the edge does not create new components. These edges are emitted as
    positive test edges. (If -v == true, the test positive edges are divided equally to
    yield test and validation positive edges.) These edges are then removed from the
    edges of the input graph, and these are emitted as the training edges.

    Negative edges are selected by randomly selecting pairs of nodes that are not
    connected by an edge in the input graph. The number of negative edges emitted is
    equal to the number of positive edges emitted above.

    Outputs these files in [output_dir]:
        pos_train_edges.tsv - positive edges for training (this is the input graph with
                      test [and validation] positive edges removed)
        pos_test_edges.tsv - positive edges for testing
        pos_valid_edges.tsv (optional) - positive edges for validation
        neg_train.tsv - a set of edges not present in input graph for training
        neg_test.tsv - a set of edges not present in input graph for testing
        neg_valid.tsv (optional) - a set of edges not present in input graph for
                      validation

    Args:
        :param nodes: nodes for input graph, in KGX TSV format [data/merged/nodes.tsv]
        :param edges: edges for input graph, in KGX TSV format [data/merged/edges.tsv]
        :param output_dir: directory to output edges and new graph [data/holdouts/]
        :param train_fraction: fraction of edges to emit as training [0.8]
        :param validation: should we make validation edges? [False]
    """
    # Docstring fixes vs original: output_dir default was misdocumented as
    # [data/edges/] (the actual click default above is data/holdouts/), and
    # "emitting as" -> "emitted as". All options are forwarded as kwargs.
    make_holdouts(*args, **kwargs)
# Script entry point: hand control to the click group, which parses argv
# and dispatches to the subcommand.
if __name__ == "__main__":
    cli()