forked from yarden/MISO
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathindex_gff.py
120 lines (95 loc) · 4.16 KB
/
index_gff.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
##
## Script to build an indexed representation of a GFF file for efficient
## retrieval of genes
##
import os
import time
import glob
import shelve
from collections import defaultdict
import GFF as gff_utils
import pickle_utils
import Gene as gene_utils
def serialize_genes(gff_genes, output_dir):
"""
Output genes into pickle files by chromosome, by gene.
"""
genes_by_chrom = defaultdict(dict)
# Split up genes by chromosome
for gene_id, gene_info in gff_genes.iteritems():
gene_obj = gene_info["gene_object"]
gene_hierarchy = gene_info["hierarchy"]
genes_by_chrom[gene_obj.chrom][gene_id] = {'gene_object': gene_obj,
'hierarchy': gene_hierarchy}
# Mapping from gene IDs to pickled filename
gene_id_to_filename = {}
# Serialize all the genes in each chromosome into their
# own directory
for chrom, chrom_genes in genes_by_chrom.iteritems():
if chrom.startswith("chr"):
chrom_dir_name = chrom
else:
chrom_dir_name = "chr%s" %(str(chrom))
# Make directory for chromosome if it doesn't already exist
chrom_dir = os.path.join(output_dir, chrom_dir_name)
if not os.path.isdir(chrom_dir):
print "Making directory: %s" %(chrom_dir)
os.makedirs(chrom_dir)
t1 = time.time()
# Serialize each gene into a separate file
num_genes = len(genes_by_chrom[chrom])
for gene_id, gene_info in genes_by_chrom[chrom].iteritems():
gene_filename = os.path.abspath(os.path.join(chrom_dir,
"%s.pickle" %(gene_id)))
pickle_utils.write_pickled_file({gene_id: genes_by_chrom[chrom][gene_id]},
gene_filename)
# Record what filename was associated with this gene ID
gene_id_to_filename[gene_id] = gene_filename
t2 = time.time()
print " - Chromosome serialization took %.2f seconds" %(t2 - t1)
# Shelve the mapping from gene ids to filenames
shelved_filename = os.path.join(output_dir, "genes_to_filenames.shelve")
shelved_data = shelve.open(shelved_filename)
for k, v in gene_id_to_filename.iteritems():
shelved_data[k] = v
shelved_data.close()
def index_gff(gff_filename, output_dir):
"""
Index the given GFF and placed the indexed representation
in the output directory.
"""
print "Indexing GFF..."
# First check that the GFF is not already indexed
indexed_files = glob.glob(os.path.join(output_dir, "chr*"))
if len(indexed_files) >= 1:
print "%s appears to already be indexed. Aborting." %(gff_filename)
return
print " - GFF: %s" %(gff_filename)
print " - Outputting to: %s" %(output_dir)
overall_t1 = time.time()
t1 = time.time()
gff_genes = gene_utils.load_genes_from_gff(gff_filename)
t2 = time.time()
print " - Loading of genes from GFF took %.2f seconds" %(t2 - t1)
t1 = time.time()
# pickle_filename = os.path.join(output_dir,
# "%s.pickle" %(os.path.basename(os.path.abspath(gff_filename))))
# pickle_utils.write_pickled_file(gff_genes, pickle_filename)
serialize_genes(gff_genes, output_dir)
t2 = time.time()
print " - Serialization of genes from GFF took %.2f seconds" %(t2 - t1)
overall_t2 = time.time()
print "Indexing of GFF took %.2f seconds." %(overall_t2 - overall_t1)
def main():
from optparse import OptionParser
parser = OptionParser()
parser.add_option("--index", dest="index_gff", nargs=2, default=None,
help="Index the given GFF. Takes as arguments as GFF filename "
"and an output directory.")
(options, args) = parser.parse_args()
if options.index_gff != None:
gff_filename = os.path.abspath(os.path.expanduser(options.index_gff[0]))
output_dir = os.path.abspath(os.path.expanduser(options.index_gff[1]))
index_gff(gff_filename, output_dir)
if __name__ == '__main__':
main()