From 5b55e6b4efc96d5392fb494409af5571d9bd4db5 Mon Sep 17 00:00:00 2001 From: chilampoon Date: Tue, 10 Oct 2023 14:08:01 -0400 Subject: [PATCH 1/4] add query to ref coord mapping --- src/remora/data_chunks.py | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/src/remora/data_chunks.py b/src/remora/data_chunks.py index 4da9918..09b1936 100755 --- a/src/remora/data_chunks.py +++ b/src/remora/data_chunks.py @@ -26,6 +26,7 @@ ) QUERY_OPS = np.array([True, True, False, False, True, False, False, True, True]) REF_OPS = np.array([True, False, True, True, False, False, False, True, True]) +REFCOORD_OPS = np.array([True, False, True, False, False, False, False, True, True]) CIGAR_CODES = ["M", "I", "D", "N", "S", "H", "P", "=", "X"] CODE_TO_OP = { "M": 0, @@ -59,10 +60,16 @@ def map_ref_to_signal(*, query_to_signal, ref_to_query_knots): query_to_signal (np.array): Query to signal coordinate mapping ref_to_query_knots (np.array): Reference to query coordinate mapping """ + query_to_ref_coords = np.interp( + np.arange(query_to_signal.size), + np.arange(ref_to_query_knots.size), + ref_to_query_knots + ).astype(int) + return np.floor( np.interp( ref_to_query_knots, - np.arange(query_to_signal.size), + query_to_ref_coords, query_to_signal, ) ).astype(int) @@ -98,11 +105,28 @@ def make_sequence_coordinate_mapping(cigar): query_knots = np.concatenate( [[0], (query_knots[is_match] - offsets).T.flatten(), [query_knots[-1]]] ) - knots = np.interp(np.arange(ref_knots[-1] + 1), ref_knots, query_knots) + ref_coords = get_ref_coords(ops, lens) + knots = np.interp(np.concatenate([[0], ref_coords]), ref_knots, query_knots) return knots +def get_ref_coords(ops, lens): + ''' + Map query base to reference coordinates (1-based). + Returns + array shape (ref_len+1,); ref_len = len(aln.get_reference_sequences()) + ''' + starts = np.cumsum(np.where(REF_OPS[ops], lens, 0)) - np.where(REF_OPS[ops], lens, 0) + ranges = [ + np.arange(start, start + l) + for op, start, l in zip(ops, starts, lens) + if REFCOORD_OPS[op] + ] + ref_coords = np.concatenate(ranges) + 1 + return ref_coords + + def compute_ref_to_signal(query_to_signal, cigar): ref_to_read_knots = make_sequence_coordinate_mapping(cigar) assert query_to_signal.size == ref_to_read_knots[-1].astype(int) + 1, ( From 3c807bb7dff8a0ec581339f39448d0e2b9227ef6 Mon Sep 17 00:00:00 2001 From: chilampoon Date: Wed, 11 Oct 2023 16:53:18 -0400 Subject: [PATCH 2/4] rm redundant interpolation --- src/remora/data_chunks.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/src/remora/data_chunks.py b/src/remora/data_chunks.py index 09b1936..dce8c35 100755 --- a/src/remora/data_chunks.py +++ b/src/remora/data_chunks.py @@ -60,16 +60,10 @@ def map_ref_to_signal(*, query_to_signal, ref_to_query_knots): query_to_signal (np.array): Query to signal coordinate mapping ref_to_query_knots (np.array): Reference to query coordinate mapping """ - query_to_ref_coords = np.interp( - np.arange(query_to_signal.size), - np.arange(ref_to_query_knots.size), - ref_to_query_knots - ).astype(int) - return np.floor( np.interp( ref_to_query_knots, - query_to_ref_coords, + np.arange(query_to_signal.size), query_to_signal, ) ).astype(int) @@ -115,7 +109,7 @@ def get_ref_coords(ops, lens): ''' Map query base to reference coordinates (1-based). Returns - array shape (ref_len+1,); ref_len = len(aln.get_reference_sequences()) + array shape (ref_len,); ref_len = len(aln.get_reference_sequences()) ''' starts = np.cumsum(np.where(REF_OPS[ops], lens, 0)) - np.where(REF_OPS[ops], lens, 0) ranges = [ From 804fb9b7e82fa5636d0f39abb8e94f99ae7d2b7b Mon Sep 17 00:00:00 2001 From: chilampoon Date: Thu, 12 Oct 2023 11:34:15 -0400 Subject: [PATCH 3/4] change N ref ops to false --- src/remora/data_chunks.py | 22 ++-------------------- 1 file changed, 2 insertions(+), 20 deletions(-) diff --git a/src/remora/data_chunks.py b/src/remora/data_chunks.py index dce8c35..675ebb5 100755 --- a/src/remora/data_chunks.py +++ b/src/remora/data_chunks.py @@ -25,8 +25,7 @@ [True, False, False, False, False, False, False, True, True] ) QUERY_OPS = np.array([True, True, False, False, True, False, False, True, True]) -REF_OPS = np.array([True, False, True, True, False, False, False, True, True]) -REFCOORD_OPS = np.array([True, False, True, False, False, False, False, True, True]) +REF_OPS = np.array([True, False, True, False, False, False, False, True, True]) CIGAR_CODES = ["M", "I", "D", "N", "S", "H", "P", "=", "X"] CODE_TO_OP = { "M": 0, @@ -99,28 +98,11 @@ def make_sequence_coordinate_mapping(cigar): query_knots = np.concatenate( [[0], (query_knots[is_match] - offsets).T.flatten(), [query_knots[-1]]] ) - ref_coords = get_ref_coords(ops, lens) - knots = np.interp(np.concatenate([[0], ref_coords]), ref_knots, query_knots) + knots = np.interp(np.arange(ref_knots[-1] + 1), ref_knots, query_knots) return knots -def get_ref_coords(ops, lens): - ''' - Map query base to reference coordinates (1-based). - Returns - array shape (ref_len,); ref_len = len(aln.get_reference_sequences()) - ''' - starts = np.cumsum(np.where(REF_OPS[ops], lens, 0)) - np.where(REF_OPS[ops], lens, 0) - ranges = [ - np.arange(start, start + l) - for op, start, l in zip(ops, starts, lens) - if REFCOORD_OPS[op] - ] - ref_coords = np.concatenate(ranges) + 1 - return ref_coords - - def compute_ref_to_signal(query_to_signal, cigar): ref_to_read_knots = make_sequence_coordinate_mapping(cigar) assert query_to_signal.size == ref_to_read_knots[-1].astype(int) + 1, ( From a62f46d11b31b424ac41881f0c0bd5c2157f3091 Mon Sep 17 00:00:00 2001 From: chilampoon Date: Sun, 15 Oct 2023 16:44:10 -0400 Subject: [PATCH 4/4] intron aware extract_ref_reg --- src/remora/io.py | 36 ++++++++++++++++++++++++++++++------ 1 file changed, 30 insertions(+), 6 deletions(-) diff --git a/src/remora/io.py b/src/remora/io.py index b127902..1d326b2 100644 --- a/src/remora/io.py +++ b/src/remora/io.py @@ -492,6 +492,24 @@ def compute_base_space_sig_coords(seq_to_sig_map): np.arange(seq_to_sig_map.size), ) +REF_OPS = np.array([True, False, True, True, False, False, False, True, True]) +REFSEQ_OPS = np.array([True, False, True, False, False, False, False, True, True]) +def get_ref_seq_pos(cigar, ref_start): + ''' + Map query base to reference coordinates (0-based) + Returns + array shape (ref_len,); ref_len = len(aln.get_reference_sequences()) + ''' + ops, lens = map(np.array, zip(*cigar)) + starts = np.cumsum(np.where(REF_OPS[ops], lens, 0)) - np.where(REF_OPS[ops], lens, 0) + ranges = [ + np.arange(start, start + l) + for op, start, l in zip(ops, starts, lens) + if REFSEQ_OPS[op] + ] + ref_coords = ref_start + np.concatenate(ranges) + return ref_coords + @dataclass class ReadRefReg: @@ -1532,6 +1550,8 @@ def add_alignment( ) self.ref_seq = None self.cigar = alignment_record.cigartuples + self.ref_pos = get_ref_seq_pos(self.cigar, self.ref_reg.start) + self.ref_pos = np.concatenate([self.ref_pos, [self.ref_pos[-1]+1]]) if alignment_record.is_reverse: if self.ref_seq is not None: self.ref_seq = util.revcomp(self.ref_seq) @@ -1546,7 +1566,7 @@ def add_alignment( "discordant ref seq lengths: move+cigar:" f"{self.ref_to_signal.size} ref_seq:{len(self.ref_seq)}" ) - self.ref_reg.end = self.ref_reg.start + self.ref_to_signal.size - 1 + self.ref_reg.end = alignment_record.reference_end @classmethod def from_pod5_and_alignment( @@ -1757,17 +1777,21 @@ def extract_ref_reg(self, ref_reg): raise RemoraError( "Cannot extract reference region from unmapped read" ) - if ref_reg.start >= self.ref_reg.start + self.ref_seq_len: + if ref_reg.start >= self.ref_reg.end: raise RemoraError("Reference region starts after read ends") if ref_reg.end < self.ref_reg.start: raise RemoraError("Reference region ends before read starts") + reg_within_read = self.ref_pos[(self.ref_pos >= ref_reg.start) & (self.ref_pos <= ref_reg.end)] + st_idx, = np.where(self.ref_pos == min(reg_within_read)) + en_idx, = np.where(self.ref_pos == max(reg_within_read)) + if self.ref_reg.strand == "+": - reg_st_within_read = max(0, ref_reg.start - self.ref_reg.start) - reg_en_within_read = ref_reg.end - self.ref_reg.start + reg_st_within_read = st_idx[0] + reg_en_within_read = en_idx[0] else: - reg_st_within_read = max(0, self.ref_reg.end - ref_reg.end) - reg_en_within_read = self.ref_reg.end - ref_reg.start + reg_st_within_read = en_idx[0] + reg_en_within_read = st_idx[0] reg_seq_to_sig = self.ref_to_signal[ reg_st_within_read : reg_en_within_read + 1 ].copy()