-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutil.py
77 lines (63 loc) · 2.88 KB
/
util.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
# third-party packages
from Bio.SeqFeature import FeatureLocation
import pandas as pd
# =========================== FILE WRITING/READING ===========================
def write_fasta_lines(seq_tups, fpath):
"""
Given a list of 2-tuples (seq_id, seq), write to a FASTA file at fpath
"""
# make sure IDs are unique
assert len(seq_tups) == len(set([tup[0] for tup in seq_tups]))
with open(fpath, 'w') as f:
f.write('\n'.join([f'>{seq_id}\n{insert_line_breaks(seq)}' for seq_id, seq in seq_tups]))
def insert_line_breaks(s, n_char_per_line=80):
subs = []
for i_line in range(0, len(s), n_char_per_line):
subs.append(s[i_line:i_line+n_char_per_line])
return '\n'.join(subs)
GFF_COL_NAMES = [
'sequence_id', 'source', 'feature_type', 'left', 'right',
'score', 'strand', 'phase', 'attributes'
]
def load_gff(gff_path):
with open(gff_path, 'r') as f:
current_line = f.readline()
n_rows_to_skip = 0
# skip the header
while current_line[0] == '#' or current_line.strip() == '\n':
n_rows_to_skip += 1
current_line = f.readline()
# now we can use pandas directly
df = pd.read_csv(gff_path, skiprows=n_rows_to_skip, names=GFF_COL_NAMES, sep='\t')
return df
# ============================= SEQUENCES =============================
# BioPython's handy FeatureLocation object ONLY accepts Python integers; cast to these
# make sure to handle the fact that bitome/GFFs uses 1-indexing
def get_sequence(extract_seq, left, right, strand):
# handle the "wraparound" case - this would be a case where left is negative, or
# right is greater than seq length - the latter can easily be made the former
# the range has circled all the way back around
seq_len = len(extract_seq)
# case where it's actually just all at the right end
if left < 1 and right < 1:
return FeatureLocation(int(seq_len + left - 1), int(seq_len + right), int(strand)).extract(extract_seq)
# true wraparound cases
# no way both are true - that would be more than the whole sequence
elif left < 1 or right > seq_len:
# these cases can be interconverted - consolidate
if right > seq_len:
left, right = left - seq_len, right - seq_len
far_right_wraparound = FeatureLocation(
int(seq_len + left - 1), seq_len, int(strand)
).extract(extract_seq)
far_left_continued = FeatureLocation(
0, int(right), int(strand)
).extract(extract_seq)
# have to stitch the sequences together differently depending on the strand
if strand == 1:
return far_right_wraparound + far_left_continued
else:
return far_left_continued + far_right_wraparound
# normal case
else:
return FeatureLocation(int(left)-1, int(right), int(strand)).extract(extract_seq)