diff --git a/calculate_coverages.py b/calculate_coverages.py index 4c9b869..44a0c6d 100644 --- a/calculate_coverages.py +++ b/calculate_coverages.py @@ -5,6 +5,7 @@ from os import path import click from glob import glob +from cover import SortedRangeList @click.command() @click.option('-i',"--input", required=True, help="Input: Directory of sam files (files must end in .sam).") @@ -15,7 +16,7 @@ def calculate_coverages(input, output, database): ################################### #Calculate coverage of each contig# ################################### - gotu_dict = defaultdict(set) + gotu_dict = defaultdict(SortedRangeList) file_list = glob(input + "/*.sam") for samfile in file_list: with open(samfile.strip(), 'r') as open_sam_file: @@ -28,8 +29,7 @@ def calculate_coverages(input, output, database): length_string = linesplit[5] length = sum([int(x) for x in re.split("[a-zA-Z]",length_string) if x]) #Add range to contig_dict - for i in range(location,location+length): - gotu_dict[gotu].add(i) + gotu_dict[gotu].add_range(location, location + length - 1) ################################### @@ -43,8 +43,12 @@ def calculate_coverages(input, output, database): #Calculate coverages# ##################### #Make dataframe from dicitonary of coverages of each contig - cov = pd.DataFrame({"gotu":list(gotu_dict.keys()), - "covered_length" : [len(x) for x in gotu_dict.values()] } ) + cov = pd.DataFrame( + { + "gotu": list(gotu_dict.keys()), + "covered_length": [x.compute_length() for x in gotu_dict.values()] + } + ) cov= cov.set_index("gotu") cov = cov.sort_values("covered_length", ascending=False) #Add genome metadata diff --git a/cover.py b/cover.py new file mode 100644 index 0000000..71f328e --- /dev/null +++ b/cover.py @@ -0,0 +1,57 @@ +class SortedRangeList: + def __init__(self, autocompress=100000): + self.ranges = [] + self.autocompress = autocompress + self.since_compressed = 0 + + def add_range(self, start, end): + self.ranges.append((start, end)) + self.since_compressed += 1 + if self.autocompress is not None and \ + self.since_compressed > self.autocompress: + self.compress() + + def compress(self): + if self.since_compressed == 0: + return + # Sort ranges by start index + self.ranges.sort(key=lambda r: r[0]) + + new_ranges = [] + start_val = None + end_val = None + + for r in self.ranges: + if end_val is None: + # case 1: no active range, start active range. + start_val = r[0] + end_val = r[1] + elif end_val >= r[0] - 1: + # case 2: active range continues through this range + # extend active range + end_val = max(end_val, r[1]) + else: # if end_val < r[0] - 1: + # case 3: active range ends before this range begins + # write new range out, then start new active range + new_range = (start_val, end_val) + new_ranges.append(new_range) + start_val = r[0] + end_val = r[1] + + if end_val is not None: + new_range = (start_val, end_val) + new_ranges.append(new_range) + + self.ranges = new_ranges + self.since_compressed = 0 + + def compute_length(self): + if self.since_compressed > 0: + self.compress() + total = 0 + for r in self.ranges: + total += r[1] - r[0] + 1 + return total + + def __str__(self): + return str(self.ranges) diff --git a/cover_test.py b/cover_test.py new file mode 100644 index 0000000..8de0678 --- /dev/null +++ b/cover_test.py @@ -0,0 +1,68 @@ +from cover import SortedRangeList +import random +from time import perf_counter + + +def simple_test(): + srl = SortedRangeList(autocompress=None) + for i in range(9, 0, -2): + srl.add_range(i, i) + srl.add_range(4, 4) + srl.compress() + assert(str(srl) == "[(1, 1), (3, 5), (7, 7), (9, 9)]") + + +def stress_test(seed, num_reads): + intset = set() + srl = SortedRangeList(autocompress=None) + + start_set = perf_counter() + random.seed(seed) + for i in range(num_reads): + read_start = random.randint(0, 10000000) + read_len = random.randint(85, 150) + for j in range(read_start, read_start + read_len): + intset.add(j) + print("SET_ADD: ", perf_counter() - start_set) + + start_srl = perf_counter() + random.seed(seed) + for i in range(num_reads): + read_start = random.randint(0, 10000000) + read_len = random.randint(85, 150) + srl.add_range(read_start, read_start+read_len-1) + print("SRL_ADD: ", perf_counter() - start_srl) + srl.compress() + print("SRL_ADD AND COMPRESS: ", perf_counter() - start_srl) + srl_len = srl.compute_length() + print("SRL_ADD COMPRESS AND LENGTH: ", perf_counter() - start_srl) + + print(srl_len) + print(len(intset)) + assert(len(intset) == srl_len) + print(len(srl.ranges)) + + +def multi_compress_test(seed, num_reads): + srl = SortedRangeList(autocompress=None) + + for compress_count in [1000, 10000, 100000]: + start_srl = perf_counter() + random.seed(seed) + for i in range(num_reads): + read_start = random.randint(0, 10000000) + read_len = random.randint(85, 150) + srl.add_range(read_start, read_start+read_len-1) + if i % compress_count == 0: + srl.compress() + print(srl.compute_length()) + print("Compress Counter: ", compress_count, + " Performance: ", perf_counter() - start_srl) + + +simple_test() +seed = 127 +for reads in [1, 10, 100, 1000, 10000, 100000, 1000000]: + multi_compress_test(seed, reads) +for reads in [1, 10, 100, 1000, 10000, 100000, 1000000]: + stress_test(seed, reads)