forked from RNAcentral/rnacentral-data-schema
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathvalidate.py
executable file
·139 lines (110 loc) · 4.44 KB
/
validate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
#!/usr/bin/env python
import os
import json
import logging
import click
import jsonschema as js
# import jsonref as jr
# Absolute path of the directory that contains this script.
HERE = os.path.abspath(os.path.dirname(__file__))
# Directory holding the schema section files; default for the --sections
# command-line option.
SECTIONS = os.path.join(HERE, 'sections')
# Filename of the top-level schema document; basis of the --schema default.
SCHEMA_NAME = 'rnacentral-schema.json'
# Module-level logger used by the validators for debug output.
LOGGER = logging.getLogger(__name__)
def validate_secondary_structure(data):
    """Check that each secondary structure is as long as its sequence.

    Entries of ``data['data']`` without a ``secondaryStructure`` key are
    skipped.

    Args:
        data: Parsed document; the entries are read from ``data['data']``.

    Raises:
        AssertionError: If an entry's ``secondaryStructure`` and ``sequence``
            differ in length.
    """
    for ncrna in data['data']:
        if 'secondaryStructure' not in ncrna:
            continue

        structure_length = len(ncrna['secondaryStructure'])
        sequence_length = len(ncrna['sequence'])
        if structure_length != sequence_length:
            # Raised explicitly (instead of using ``assert``) so the check is
            # not stripped when Python runs with -O.
            raise AssertionError(
                "Secondary structure length (%i) does not match sequence "
                "length (%i) for %s" % (
                    structure_length,
                    sequence_length,
                    ncrna.get('primaryId', '<unknown>'),
                )
            )
def validate_is_known_global_ids(data, sections_path=None):
    """Check that gene ids and cross references use a known database prefix.

    The set of known database names is the ``dataProvider`` enum read from
    ``data-provider.json``. The database name of an id is the text before
    its first ``:``.

    Args:
        data: Parsed document; the entries are read from ``data['data']``.
        sections_path: Directory containing ``data-provider.json``. Defaults
            to the module-level ``SECTIONS`` directory, so the lookup does
            not depend on the current working directory.

    Raises:
        AssertionError: If a gene id or cross reference id uses a database
            name that is not in the enum.
        ValueError: If an id contains no ``:`` separator.
    """
    if sections_path is None:
        sections_path = SECTIONS

    with open(os.path.join(sections_path, 'data-provider.json'), 'r') as raw:
        provider_schema = json.load(raw)
    known = set(provider_schema['properties']['dataProvider']['enum'])

    for ncrna in data['data']:
        gene_id = ncrna.get('gene', {}).get('geneId', None)
        if gene_id:
            name, _ = gene_id.split(':', 1)
            # Explicit raise rather than ``assert`` so the check survives -O.
            if name not in known:
                raise AssertionError("Unknown database: %s" % name)

        for global_id in ncrna.get('crossReferenceIds', []):
            name, _ = global_id.split(':', 1)
            if name not in known:
                raise AssertionError("Xref to unknown db: %s" % name)
# Not clear if this is actually needed.
def validate_trna_annotations(data):
    """Check that tRNA-style annotations only appear on SO:0000253 entries.

    An entry that has a truthy ``isoType`` under ``additionalAnnotations`` or
    a truthy ``anticodon`` under ``sequenceFeatures`` must have ``soTermId``
    equal to ``SO:0000253``. Entries with neither annotation are not checked.

    Args:
        data: Parsed document; the entries are read from ``data['data']``.

    Raises:
        AssertionError: If an annotated entry has a different ``soTermId``.
        KeyError: If an annotated entry has no ``soTermId`` at all.
    """
    required_so_term = 'SO:0000253'
    for ncrna in data['data']:
        iso_type = ncrna.get('additionalAnnotations', {}).get('isoType', None)
        anticodon = ncrna.get('sequenceFeatures', {}).get('anticodon', None)
        if not (iso_type or anticodon):
            continue

        so_term = ncrna['soTermId']
        if so_term != required_so_term:
            # Explicit raise rather than ``assert`` so the check survives -O.
            raise AssertionError(
                "Entry %s has an isoType or anticodon but its soTermId is "
                "%s, expected %s" % (
                    ncrna.get('primaryId', '<unknown>'),
                    so_term,
                    required_so_term,
                )
            )
# Unsure if we should require this, maybe make it an option? I will leave the
# code here for now.
def validate_id_format(data):
    """Check that primary and gene ids are prefixed with the data provider.

    The expected prefix is ``data['metaData']['dataProvider']``; the prefix
    of an id is the text before its first ``:``.

    Args:
        data: Parsed document with ``metaData`` and ``data`` keys.

    Raises:
        jsonschema.ValidationError: If a ``primaryId`` or a gene's ``geneId``
            does not start with the expected data provider name.
        ValueError: If an id contains no ``:`` separator.
    """
    expected = data['metaData']['dataProvider']
    for ncrna in data['data']:
        primary_id = ncrna['primaryId']
        db, _ = primary_id.split(':', 1)
        if db != expected:
            msg = "Expected %s to start with %s" % (primary_id, expected)
            raise js.ValidationError(msg)

        gene_id = ncrna.get('gene', {}).get('geneId', None)
        if gene_id:
            # Split the gene id itself (not the primary id) and unpack the
            # prefix, so a string is compared with the expected provider.
            gene_db, _ = gene_id.split(':', 1)
            if gene_db != expected:
                msg = "Expected %s to start with %s" % (gene_id, expected)
                raise js.ValidationError(msg)
def validate_can_produce_name(data):
    """Check that a non-empty name can be derived for every entry.

    The candidate sources are, from highest to lowest priority: the gene
    symbol, the gene name, the entry's own ``name`` and the entry's
    ``description``. Empty values are skipped, so an empty higher-priority
    field does not hide a usable lower-priority one.

    Args:
        data: Parsed document; the entries are read from ``data['data']``.

    Raises:
        ValueError: If none of the sources gives a non-empty name for an
            entry.
    """
    for ncrna in data['data']:
        gene = ncrna.get('gene') or {}
        candidates = (
            ('gene symbol', gene.get('symbol')),
            ('gene name', gene.get('name')),
            ('transcript name', ncrna.get('name')),
            ('transcript description', ncrna.get('description')),
        )
        for source, name in candidates:
            if name:
                LOGGER.debug("Using %s %s as the name of %s",
                             source, name, ncrna['primaryId'])
                break
        else:
            # Format the message here; passing the id as a second argument
            # to ValueError would leave the "%s" placeholder unfilled.
            raise ValueError("No name for %s" % ncrna['primaryId'])
def validate_coordinate_direction(data):
    """Check that every exon starts before it ends.

    Each exon of each entry's ``genomeLocations`` must have a ``strand`` of
    ``+``, ``.`` or ``-`` and a ``startPosition`` strictly smaller than its
    ``endPosition``. Entries without ``genomeLocations`` are skipped.

    Args:
        data: Parsed document; the entries are read from ``data['data']``.

    Raises:
        ValueError: If an exon has any other strand value.
        AssertionError: If an exon's start is not smaller than its end.
    """
    known_strands = ('+', '.', '-')
    for ncrna in data['data']:
        for location in ncrna.get('genomeLocations', []):
            for exon in location['exons']:
                if exon['strand'] not in known_strands:
                    raise ValueError("Shouldn't be here")

                start = exon['startPosition']
                end = exon['endPosition']
                # Explicit raise rather than ``assert`` so the check is not
                # stripped when Python runs with -O.
                if not (start < end):
                    raise AssertionError(
                        "Exon start %s is not smaller than end %s for %s" % (
                            start,
                            end,
                            ncrna.get('primaryId', '<unknown>'),
                        )
                    )
def validate(data, schema_path, sections_path):
    """Validate ``data`` against the JSON schema, then run the extra checks.

    The schema is loaded from ``schema_path`` and passed to jsonschema with
    a format checker and a resolver whose base URI is a ``file://`` URL
    built from ``sections_path``. After the schema validation the
    module's additional validators run in a fixed order;
    ``validate_id_format`` is not among them.

    Args:
        data: Parsed document to validate.
        schema_path: Path of the JSON schema file.
        sections_path: Directory used to build the resolver's base URI.
    """
    with open(schema_path, 'r') as handle:
        schema = json.load(handle)

    checker = js.FormatChecker()
    resolver = js.RefResolver('file://%s/' % sections_path, None)
    js.validate(data, schema, format_checker=checker, resolver=resolver)

    # Checks that the JSON schema itself does not express, in run order.
    extra_checks = (
        validate_secondary_structure,
        validate_is_known_global_ids,
        validate_trna_annotations,
        validate_can_produce_name,
        validate_coordinate_direction,
    )
    for check in extra_checks:
        check(data)
# The --schema default is anchored at this script's directory (like the
# --sections default) so the command also works when it is started from
# another working directory.
@click.command()
@click.argument('filename')
@click.option('--schema', default=os.path.join(HERE, SCHEMA_NAME),
              help='Filename of the schema to use')
@click.option('--sections', default=SECTIONS,
              help='Directory where schema parts are kept')
def main(filename, schema=None, sections=None):
    """Validate the JSON document FILENAME against the schema."""
    with open(filename, 'r') as raw:
        data = json.load(raw)

    validate(data, schema, os.path.abspath(sections))


if __name__ == '__main__':
    main()