Skip to content

Commit

Permalink
Merge pull request #2 from dhakim87/hashset_to_ranges
Browse files Browse the repository at this point in the history
Hashset to ranges
  • Loading branch information
adswafford authored May 18, 2021
2 parents d4127b5 + d6d855b commit e5c793a
Show file tree
Hide file tree
Showing 3 changed files with 134 additions and 5 deletions.
14 changes: 9 additions & 5 deletions calculate_coverages.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from os import path
import click
from glob import glob
from cover import SortedRangeList

@click.command()
@click.option('-i',"--input", required=True, help="Input: Directory of sam files (files must end in .sam).")
Expand All @@ -15,7 +16,7 @@ def calculate_coverages(input, output, database):
###################################
#Calculate coverage of each contig#
###################################
gotu_dict = defaultdict(set)
gotu_dict = defaultdict(SortedRangeList)
file_list = glob(input + "/*.sam")
for samfile in file_list:
with open(samfile.strip(), 'r') as open_sam_file:
Expand All @@ -28,8 +29,7 @@ def calculate_coverages(input, output, database):
length_string = linesplit[5]
length = sum([int(x) for x in re.split("[a-zA-Z]",length_string) if x])
#Add range to contig_dict
for i in range(location,location+length):
gotu_dict[gotu].add(i)
gotu_dict[gotu].add_range(location, location + length - 1)


###################################
Expand All @@ -43,8 +43,12 @@ def calculate_coverages(input, output, database):
#Calculate coverages#
#####################
#Make dataframe from dicitonary of coverages of each contig
cov = pd.DataFrame({"gotu":list(gotu_dict.keys()),
"covered_length" : [len(x) for x in gotu_dict.values()] } )
cov = pd.DataFrame(
{
"gotu": list(gotu_dict.keys()),
"covered_length": [x.compute_length() for x in gotu_dict.values()]
}
)
cov= cov.set_index("gotu")
cov = cov.sort_values("covered_length", ascending=False)
#Add genome metadata
Expand Down
57 changes: 57 additions & 0 deletions cover.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
class SortedRangeList:
def __init__(self, autocompress=100000):
self.ranges = []
self.autocompress = autocompress
self.since_compressed = 0

def add_range(self, start, end):
self.ranges.append((start, end))
self.since_compressed += 1
if self.autocompress is not None and \
self.since_compressed > self.autocompress:
self.compress()

def compress(self):
if self.since_compressed == 0:
return
# Sort ranges by start index
self.ranges.sort(key=lambda r: r[0])

new_ranges = []
start_val = None
end_val = None

for r in self.ranges:
if end_val is None:
# case 1: no active range, start active range.
start_val = r[0]
end_val = r[1]
elif end_val >= r[0] - 1:
# case 2: active range continues through this range
# extend active range
end_val = max(end_val, r[1])
else: # if end_val < r[0] - 1:
# case 3: active range ends before this range begins
# write new range out, then start new active range
new_range = (start_val, end_val)
new_ranges.append(new_range)
start_val = r[0]
end_val = r[1]

if end_val is not None:
new_range = (start_val, end_val)
new_ranges.append(new_range)

self.ranges = new_ranges
self.since_compressed = 0

def compute_length(self):
if self.since_compressed > 0:
self.compress()
total = 0
for r in self.ranges:
total += r[1] - r[0] + 1
return total

def __str__(self):
return str(self.ranges)
68 changes: 68 additions & 0 deletions cover_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
from cover import SortedRangeList
import random
from time import perf_counter


def simple_test():
srl = SortedRangeList(autocompress=None)
for i in range(9, 0, -2):
srl.add_range(i, i)
srl.add_range(4, 4)
srl.compress()
assert(str(srl) == "[(1, 1), (3, 5), (7, 7), (9, 9)]")


def stress_test(seed, num_reads):
intset = set()
srl = SortedRangeList(autocompress=None)

start_set = perf_counter()
random.seed(seed)
for i in range(num_reads):
read_start = random.randint(0, 10000000)
read_len = random.randint(85, 150)
for j in range(read_start, read_start + read_len):
intset.add(j)
print("SET_ADD: ", perf_counter() - start_set)

start_srl = perf_counter()
random.seed(seed)
for i in range(num_reads):
read_start = random.randint(0, 10000000)
read_len = random.randint(85, 150)
srl.add_range(read_start, read_start+read_len-1)
print("SRL_ADD: ", perf_counter() - start_srl)
srl.compress()
print("SRL_ADD AND COMPRESS: ", perf_counter() - start_srl)
srl_len = srl.compute_length()
print("SRL_ADD COMPRESS AND LENGTH: ", perf_counter() - start_srl)

print(srl_len)
print(len(intset))
assert(len(intset) == srl_len)
print(len(srl.ranges))


def multi_compress_test(seed, num_reads):
srl = SortedRangeList(autocompress=None)

for compress_count in [1000, 10000, 100000]:
start_srl = perf_counter()
random.seed(seed)
for i in range(num_reads):
read_start = random.randint(0, 10000000)
read_len = random.randint(85, 150)
srl.add_range(read_start, read_start+read_len-1)
if i % compress_count == 0:
srl.compress()
print(srl.compute_length())
print("Compress Counter: ", compress_count,
" Performance: ", perf_counter() - start_srl)


simple_test()
seed = 127
for reads in [1, 10, 100, 1000, 10000, 100000, 1000000]:
multi_compress_test(seed, reads)
for reads in [1, 10, 100, 1000, 10000, 100000, 1000000]:
stress_test(seed, reads)

0 comments on commit e5c793a

Please sign in to comment.