From 439731a544dd6f62d6602ee9fbae8813a7e6372d Mon Sep 17 00:00:00 2001 From: Likhitha Surapaneni <10923198+likhitha-surapaneni@users.noreply.github.com> Date: Tue, 20 Aug 2024 11:46:14 +0100 Subject: [PATCH 01/14] Capture uploaded allele correctly for VCF input --- modules/Bio/EnsEMBL/VEP/OutputFactory.pm | 3 ++- modules/Bio/EnsEMBL/VEP/Parser.pm | 2 +- modules/Bio/EnsEMBL/VEP/Parser/VCF.pm | 9 +++++++++ 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/modules/Bio/EnsEMBL/VEP/OutputFactory.pm b/modules/Bio/EnsEMBL/VEP/OutputFactory.pm index 70cd5456e..a5d5da5ee 100755 --- a/modules/Bio/EnsEMBL/VEP/OutputFactory.pm +++ b/modules/Bio/EnsEMBL/VEP/OutputFactory.pm @@ -1296,7 +1296,8 @@ sub VariationFeatureOverlapAllele_to_output_hash { # reference allele $hash->{REF_ALLELE} = $vf->ref_allele_string if $self->{show_ref_allele}; - + + # Capture uploaded allele $hash->{UPLOADED_ALLELE} = ($vf->{original_allele_string} || $vf->{allele_string} || $vf->{class_SO_term} || "" ) if $self->param('uploaded_allele'); # picked? diff --git a/modules/Bio/EnsEMBL/VEP/Parser.pm b/modules/Bio/EnsEMBL/VEP/Parser.pm index 5270770f4..2a5c77f78 100755 --- a/modules/Bio/EnsEMBL/VEP/Parser.pm +++ b/modules/Bio/EnsEMBL/VEP/Parser.pm @@ -976,7 +976,7 @@ sub minimise_alleles { $new_vf->{end} = $end; $new_vf->{seq_region_start} = $start; $new_vf->{seq_region_end} = $end; - $new_vf->{original_allele_string} = $vf->{allele_string}; + $new_vf->{original_allele_string} = $vf->{original_allele_string} || $vf->{allele_string}; $new_vf->{original_start} = $vf->{start}; $new_vf->{original_end} = $vf->{end}; $new_vf->{minimised} = 1; diff --git a/modules/Bio/EnsEMBL/VEP/Parser/VCF.pm b/modules/Bio/EnsEMBL/VEP/Parser/VCF.pm index ee9a46fbe..8f1bd37dc 100644 --- a/modules/Bio/EnsEMBL/VEP/Parser/VCF.pm +++ b/modules/Bio/EnsEMBL/VEP/Parser/VCF.pm @@ -287,6 +287,9 @@ sub create_VariationFeatures { # record original alleles # if they get changed, we need to map from old to new in create_individual_VariationFeatures my @original_alleles = ($ref, @$alts); + use Data::Dumper; + print("Original alleles"."\n"); + print(Dumper(@original_alleles)); # adjust end coord # $end += (length($ref) - 1); @@ -336,10 +339,14 @@ sub create_VariationFeatures { } # create VF object + use Data::Dumper; + # print("VF:".$ref."\n"); + # print("VF:".Dumper(@$alts)."\n"); my $vf = Bio::EnsEMBL::Variation::VariationFeature->new_fast({ start => $start, end => $end, allele_string => $non_variant ? $ref : $ref.'/'.join('/', @$alts), + original_allele_string => $non_variant ? $original_alleles[0] : join("/",@original_alleles), strand => 1, map_weight => 1, adaptor => $self->get_adaptor('variation', 'VariationFeature'), @@ -348,6 +355,8 @@ sub create_VariationFeatures { _line => $record, }); + print($vf->{original_allele_string}); + # flag as non-variant $vf->{non_variant} = 1 if $non_variant; From 11adcb8c0bba93c8abd4623a0b0687aae0c98f00 Mon Sep 17 00:00:00 2001 From: Likhitha Surapaneni <10923198+likhitha-surapaneni@users.noreply.github.com> Date: Tue, 20 Aug 2024 11:48:12 +0100 Subject: [PATCH 02/14] Cleanup --- modules/Bio/EnsEMBL/VEP/Parser/VCF.pm | 7 ------- 1 file changed, 7 deletions(-) diff --git a/modules/Bio/EnsEMBL/VEP/Parser/VCF.pm b/modules/Bio/EnsEMBL/VEP/Parser/VCF.pm index 8f1bd37dc..5706fdbe1 100644 --- a/modules/Bio/EnsEMBL/VEP/Parser/VCF.pm +++ b/modules/Bio/EnsEMBL/VEP/Parser/VCF.pm @@ -287,9 +287,6 @@ sub create_VariationFeatures { # record original alleles # if they get changed, we need to map from old to new in create_individual_VariationFeatures my @original_alleles = ($ref, @$alts); - use Data::Dumper; - print("Original alleles"."\n"); - print(Dumper(@original_alleles)); # adjust end coord # $end += (length($ref) - 1); @@ -339,9 +336,6 @@ sub create_VariationFeatures { } # create VF object - use Data::Dumper; - # print("VF:".$ref."\n"); - # print("VF:".Dumper(@$alts)."\n"); my $vf = Bio::EnsEMBL::Variation::VariationFeature->new_fast({ start => $start, end => $end, @@ -355,7 +349,6 @@ sub create_VariationFeatures { _line => $record, }); - print($vf->{original_allele_string}); # flag as non-variant $vf->{non_variant} = 1 if $non_variant; From 3538ac4fa86d69dc780aa09c9ef1e25a90828cb6 Mon Sep 17 00:00:00 2001 From: Likhitha Surapaneni <10923198+likhitha-surapaneni@users.noreply.github.com> Date: Tue, 20 Aug 2024 11:49:06 +0100 Subject: [PATCH 03/14] Cleanup --- modules/Bio/EnsEMBL/VEP/Parser/VCF.pm | 1 - 1 file changed, 1 deletion(-) diff --git a/modules/Bio/EnsEMBL/VEP/Parser/VCF.pm b/modules/Bio/EnsEMBL/VEP/Parser/VCF.pm index 5706fdbe1..3c8d9277c 100644 --- a/modules/Bio/EnsEMBL/VEP/Parser/VCF.pm +++ b/modules/Bio/EnsEMBL/VEP/Parser/VCF.pm @@ -349,7 +349,6 @@ sub create_VariationFeatures { _line => $record, }); - # flag as non-variant $vf->{non_variant} = 1 if $non_variant; From 5f1d345115fa629c819ac8945cc60eec180b8e02 Mon Sep 17 00:00:00 2001 From: Likhitha Surapaneni <10923198+likhitha-surapaneni@users.noreply.github.com> Date: Tue, 20 Aug 2024 16:52:02 +0100 Subject: [PATCH 04/14] Supporting uploaded allele for multiple alternate alleles --- modules/Bio/EnsEMBL/VEP/InputBuffer.pm | 2 ++ modules/Bio/EnsEMBL/VEP/Parser.pm | 7 ++++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/modules/Bio/EnsEMBL/VEP/InputBuffer.pm b/modules/Bio/EnsEMBL/VEP/InputBuffer.pm index 617cc35b0..46458bcd8 100644 --- a/modules/Bio/EnsEMBL/VEP/InputBuffer.pm +++ b/modules/Bio/EnsEMBL/VEP/InputBuffer.pm @@ -209,6 +209,8 @@ sub next { if(my $parser = $self->parser) { while(@$buffer < $buffer_size && (my $vf = $parser->next)) { + ## Set minimal to 1 if indel + $self->{minimal} = 1 if (defined($vf->{minimised}) && $vf->{minimised}); # exit the program if the maximum number of variants not ordered in the input file is reached if (!$self->param('no_check_variants_order') && diff --git a/modules/Bio/EnsEMBL/VEP/Parser.pm b/modules/Bio/EnsEMBL/VEP/Parser.pm index 2a5c77f78..870c12619 100755 --- a/modules/Bio/EnsEMBL/VEP/Parser.pm +++ b/modules/Bio/EnsEMBL/VEP/Parser.pm @@ -950,10 +950,15 @@ sub minimise_alleles { # skip VFs with more than one alt # they get taken care of later by split_variants/rejoin_variants - if(!$vf->{allele_string} || $vf->{allele_string} =~ /.+\/.+\/.+/ || $vf->{allele_string} !~ /.+\/.+/) { + if(!$vf->{allele_string} || $vf->{allele_string} !~ /.+\/.+/) { push @return, $vf; } + elsif($vf->{allele_string} =~ /.+\/.+\/.+/) + { + $vf->{minimised} = 1; + } + else { my @alleles = split('/', $vf->{allele_string}); my $ref = shift @alleles; From 6794d2451f2643be6c4b9de35b45dd6791144b7f Mon Sep 17 00:00:00 2001 From: Likhitha Surapaneni <10923198+likhitha-surapaneni@users.noreply.github.com> Date: Wed, 21 Aug 2024 22:22:52 +0100 Subject: [PATCH 05/14] Return vf after enabling minimised --- modules/Bio/EnsEMBL/VEP/Parser.pm | 1 + 1 file changed, 1 insertion(+) diff --git a/modules/Bio/EnsEMBL/VEP/Parser.pm b/modules/Bio/EnsEMBL/VEP/Parser.pm index 870c12619..142432ac3 100755 --- a/modules/Bio/EnsEMBL/VEP/Parser.pm +++ b/modules/Bio/EnsEMBL/VEP/Parser.pm @@ -957,6 +957,7 @@ sub minimise_alleles { elsif($vf->{allele_string} =~ /.+\/.+\/.+/) { $vf->{minimised} = 1; + push @return, $vf; } else { From c039ac7da92f33498fc6d68583e692cb2bb88a7d Mon Sep 17 00:00:00 2001 From: Likhitha Surapaneni <10923198+likhitha-surapaneni@users.noreply.github.com> Date: Thu, 22 Aug 2024 12:54:35 +0100 Subject: [PATCH 06/14] Clearer variable name and comments --- modules/Bio/EnsEMBL/VEP/Parser.pm | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/modules/Bio/EnsEMBL/VEP/Parser.pm b/modules/Bio/EnsEMBL/VEP/Parser.pm index 142432ac3..8f3c1b038 100755 --- a/modules/Bio/EnsEMBL/VEP/Parser.pm +++ b/modules/Bio/EnsEMBL/VEP/Parser.pm @@ -851,13 +851,13 @@ sub post_process_vfs { $vf->seq_region_start($vf->{start}); $vf->seq_region_end($vf->{end}); - # Checks if the allele string is insertion or/and deletion + # Checks if the allele string is non-minimised insertion or/and deletion my $is_sv = ref($vf) eq 'Bio::EnsEMBL::Variation::StructuralVariationFeature'; if(!$is_sv && defined($vf->{allele_string}) && $vf->{allele_string} =~ /\//){ - my $is_indel = 0; + my $is_non_minimised_indel = 0; my ($ref_allele_string,$alt_allele_string) = split(/\//, $vf->{allele_string}); - $is_indel = 1 unless length($ref_allele_string) == length($alt_allele_string) or $vf->{allele_string} =~ /-/; - $vf = ${$self->minimise_alleles([$vf])}[0] if $is_indel; + $is_non_minimised_indel = 1 unless length($ref_allele_string) == length($alt_allele_string) or $vf->{allele_string} =~ /-/; + $vf = ${$self->minimise_alleles([$vf])}[0] if $is_non_minimised_indel; } } return $vfs; From b7b2013a8550cfc435157cb67b7d681334e744aa Mon Sep 17 00:00:00 2001 From: Likhitha Surapaneni <10923198+likhitha-surapaneni@users.noreply.github.com> Date: Fri, 30 Aug 2024 17:44:02 +0100 Subject: [PATCH 07/14] Renaming conflicting variable names --- modules/Bio/EnsEMBL/VEP/InputBuffer.pm | 3 +++ modules/Bio/EnsEMBL/VEP/Parser.pm | 10 +++++++--- modules/Bio/EnsEMBL/VEP/Parser/VCF.pm | 2 +- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/modules/Bio/EnsEMBL/VEP/InputBuffer.pm b/modules/Bio/EnsEMBL/VEP/InputBuffer.pm index 46458bcd8..1d202ffc8 100644 --- a/modules/Bio/EnsEMBL/VEP/InputBuffer.pm +++ b/modules/Bio/EnsEMBL/VEP/InputBuffer.pm @@ -191,6 +191,9 @@ sub next { while(@$pre_buffer && @$buffer < $buffer_size) { my $vf = $pre_buffer->[0]; + ## Set minimal to 1 if indel + $self->{minimal} = 1 if (defined($vf->{minimised}) && $vf->{minimised}); + # new chromosome if($prev_chr && $vf->{chr} ne $prev_chr) { $self->split_variants() if $self->{minimal}; diff --git a/modules/Bio/EnsEMBL/VEP/Parser.pm b/modules/Bio/EnsEMBL/VEP/Parser.pm index 8f3c1b038..f5156ee29 100755 --- a/modules/Bio/EnsEMBL/VEP/Parser.pm +++ b/modules/Bio/EnsEMBL/VEP/Parser.pm @@ -855,8 +855,11 @@ sub post_process_vfs { my $is_sv = ref($vf) eq 'Bio::EnsEMBL::Variation::StructuralVariationFeature'; if(!$is_sv && defined($vf->{allele_string}) && $vf->{allele_string} =~ /\//){ my $is_non_minimised_indel = 0; - my ($ref_allele_string,$alt_allele_string) = split(/\//, $vf->{allele_string}); - $is_non_minimised_indel = 1 unless length($ref_allele_string) == length($alt_allele_string) or $vf->{allele_string} =~ /-/; + # For VCF input, the allele_string is trimmed to remove anchor base, so we capture original allele + + my $original_allele_string = ($vf->{nontrimmed_allele_string} || $vf->{allele_string}); + my ($ref_allele_string,$alt_allele_string) = split(/\//, $original_allele_string); + $is_non_minimised_indel = 1 unless length($ref_allele_string) == length($alt_allele_string) or $original_allele_string =~ /-/; $vf = ${$self->minimise_alleles([$vf])}[0] if $is_non_minimised_indel; } } @@ -956,6 +959,7 @@ sub minimise_alleles { elsif($vf->{allele_string} =~ /.+\/.+\/.+/) { + # Updating a flag to minimise multi-allelic variants in split_variants/rejoin_variants $vf->{minimised} = 1; push @return, $vf; } @@ -982,7 +986,7 @@ sub minimise_alleles { $new_vf->{end} = $end; $new_vf->{seq_region_start} = $start; $new_vf->{seq_region_end} = $end; - $new_vf->{original_allele_string} = $vf->{original_allele_string} || $vf->{allele_string}; + $new_vf->{original_allele_string} = $vf->{nontrimmed_allele_string} || $vf->{allele_string}; $new_vf->{original_start} = $vf->{start}; $new_vf->{original_end} = $vf->{end}; $new_vf->{minimised} = 1; diff --git a/modules/Bio/EnsEMBL/VEP/Parser/VCF.pm b/modules/Bio/EnsEMBL/VEP/Parser/VCF.pm index 3c8d9277c..8e58bcf71 100644 --- a/modules/Bio/EnsEMBL/VEP/Parser/VCF.pm +++ b/modules/Bio/EnsEMBL/VEP/Parser/VCF.pm @@ -340,7 +340,7 @@ sub create_VariationFeatures { start => $start, end => $end, allele_string => $non_variant ? $ref : $ref.'/'.join('/', @$alts), - original_allele_string => $non_variant ? $original_alleles[0] : join("/",@original_alleles), + nontrimmed_allele_string => $non_variant ? $original_alleles[0] : join("/",@original_alleles), strand => 1, map_weight => 1, adaptor => $self->get_adaptor('variation', 'VariationFeature'), From 8ad71eb1c65006f33931767549fd026658ba1eac Mon Sep 17 00:00:00 2001 From: Likhitha Surapaneni <10923198+likhitha-surapaneni@users.noreply.github.com> Date: Tue, 5 Nov 2024 13:59:17 +0000 Subject: [PATCH 08/14] Fix buffer logic for multi-allelic variants --- modules/Bio/EnsEMBL/VEP/InputBuffer.pm | 8 +------- modules/Bio/EnsEMBL/VEP/Parser.pm | 13 +++++++++++-- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/modules/Bio/EnsEMBL/VEP/InputBuffer.pm b/modules/Bio/EnsEMBL/VEP/InputBuffer.pm index 1d202ffc8..27db09296 100644 --- a/modules/Bio/EnsEMBL/VEP/InputBuffer.pm +++ b/modules/Bio/EnsEMBL/VEP/InputBuffer.pm @@ -191,9 +191,6 @@ sub next { while(@$pre_buffer && @$buffer < $buffer_size) { my $vf = $pre_buffer->[0]; - ## Set minimal to 1 if indel - $self->{minimal} = 1 if (defined($vf->{minimised}) && $vf->{minimised}); - # new chromosome if($prev_chr && $vf->{chr} ne $prev_chr) { $self->split_variants() if $self->{minimal}; @@ -212,8 +209,6 @@ sub next { if(my $parser = $self->parser) { while(@$buffer < $buffer_size && (my $vf = $parser->next)) { - ## Set minimal to 1 if indel - $self->{minimal} = 1 if (defined($vf->{minimised}) && $vf->{minimised}); # exit the program if the maximum number of variants not ordered in the input file is reached if (!$self->param('no_check_variants_order') && @@ -262,8 +257,7 @@ sub next { die($error_msg); } - $self->split_variants() if $self->{minimal}; - + $self->split_variants() if $self->{minimal} ; return $buffer; } diff --git a/modules/Bio/EnsEMBL/VEP/Parser.pm b/modules/Bio/EnsEMBL/VEP/Parser.pm index f5156ee29..31c159e3b 100755 --- a/modules/Bio/EnsEMBL/VEP/Parser.pm +++ b/modules/Bio/EnsEMBL/VEP/Parser.pm @@ -858,8 +858,16 @@ sub post_process_vfs { # For VCF input, the allele_string is trimmed to remove anchor base, so we capture original allele my $original_allele_string = ($vf->{nontrimmed_allele_string} || $vf->{allele_string}); - my ($ref_allele_string,$alt_allele_string) = split(/\//, $original_allele_string); - $is_non_minimised_indel = 1 unless length($ref_allele_string) == length($alt_allele_string) or $original_allele_string =~ /-/; + my @alleles = split(/\//, $original_allele_string); + my $ref_allele_string = shift @alleles; + my $alt_allele_count; + + foreach my $alt(@alleles) { + if (length($ref_allele_string) != length($alt) or $original_allele_string =~ /^-/){ + $is_non_minimised_indel = 1; + last; + } + } $vf = ${$self->minimise_alleles([$vf])}[0] if $is_non_minimised_indel; } } @@ -961,6 +969,7 @@ sub minimise_alleles { { # Updating a flag to minimise multi-allelic variants in split_variants/rejoin_variants $vf->{minimised} = 1; + $vf->{original_allele_string} = $vf->{nontrimmed_allele_string} || $vf->{allele_string}; push @return, $vf; } From d3d655cd55e9fb7157393d6878d0822b01e0b802 Mon Sep 17 00:00:00 2001 From: Likhitha Surapaneni <10923198+likhitha-surapaneni@users.noreply.github.com> Date: Wed, 4 Dec 2024 13:08:49 +0000 Subject: [PATCH 09/14] Modify unittests --- t/Parser_VCF.t | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/t/Parser_VCF.t b/t/Parser_VCF.t index 6fa38488c..cc37d2671 100755 --- a/t/Parser_VCF.t +++ b/t/Parser_VCF.t @@ -72,6 +72,7 @@ is_deeply($vf, bless( { 'variation_name' => 'rs142513484', 'map_weight' => 1, 'allele_string' => 'C/T', + 'nontrimmed_allele_string' => 'C/T', 'end' => 25585733, 'start' => 25585733, 'seq_region_end' => 25585733, @@ -115,6 +116,7 @@ is_deeply($vf, bless( { 'variation_name' => 'test', 'map_weight' => 1, 'allele_string' => 'C/-', + 'minimised' => 1, 'end' => 25587760, 'start' => 25587760, 'seq_region_end' => 25587760, @@ -133,6 +135,7 @@ is_deeply($vf, bless( { 'strand' => 1, 'variation_name' => 'test', 'map_weight' => 1, + 'minimised' => 1, 'allele_string' => '-/C', 'end' => 25587759, 'start' => 25587760, @@ -152,6 +155,7 @@ is_deeply($vf, bless( { 'strand' => 1, 'variation_name' => 'test', 'map_weight' => 1, + 'minimised' => 1, 'allele_string' => 'C/-', 'end' => 25587760, 'start' => 25587760, @@ -171,6 +175,7 @@ is_deeply($vf, bless( { 'strand' => 1, 'variation_name' => 'test', 'map_weight' => 1, + 'minimised' => 1, 'allele_string' => '-/C', 'end' => 25587759, 'start' => 25587760, @@ -191,6 +196,7 @@ is_deeply($vf, bless( { 'variation_name' => 'test', 'map_weight' => 1, 'allele_string' => 'A/C/G', + 'nontrimmed_allele_string' => 'A/C/G', 'end' => 25587759, 'start' => 25587759, 'seq_region_end' => 25587759, @@ -210,6 +216,7 @@ is_deeply($vf, bless( { 'variation_name' => 'test', 'map_weight' => 1, 'allele_string' => 'A/C/GG', + 'original_allele_string' => 'A/C/GG', 'end' => 25587759, 'start' => 25587759, 'seq_region_end' => 25587759, @@ -229,6 +236,7 @@ is_deeply($vf, bless( { 'variation_name' => 'test', 'map_weight' => 1, 'allele_string' => '-/C/T', + 'original_allele_string' => 'A/C/GG', 'end' => 25587759, 'start' => 25587760, 'seq_region_end' => 25587759, @@ -248,6 +256,7 @@ is_deeply($vf, bless( { 'variation_name' => 'test', 'map_weight' => 1, 'allele_string' => 'G/C', + 'nontrimmed_allele_string' => 'G/C', 'end' => 25587759, 'start' => 25587759, 'seq_region_end' => 25587759, @@ -267,6 +276,7 @@ is_deeply($vf, bless( { 'variation_name' => 'test', 'map_weight' => 1, 'allele_string' => 'G', + 'nontrimmed_allele_string' => 'G', 'end' => 25587759, 'start' => 25587759, 'seq_region_end' => 25587759, @@ -294,6 +304,7 @@ is_deeply($vf, bless( { 'variation_name' => 'test', 'map_weight' => 1, 'allele_string' => 'G/C/*', + 'nontrimmed_allele_string' => 'G/C/*', 'end' => 25587759, 'start' => 25587759, 'seq_region_end' => 25587759, @@ -312,6 +323,7 @@ is_deeply($vf, bless( { 'variation_name' => 'test', 'map_weight' => 1, 'allele_string' => 'G/C/', + 'original_allele_string' => 'G/C/', 'end' => 25587759, 'start' => 25587759, 'seq_region_end' => 25587759, @@ -331,6 +343,7 @@ is_deeply($vf, bless( { 'variation_name' => 'test', 'map_weight' => 1, 'allele_string' => 'C/-/*', + 'original_allele_string' => 'GC/G/*', 'end' => 25587760, 'start' => 25587760, 'seq_region_end' => 25587760, @@ -350,6 +363,7 @@ is_deeply($vf, bless( { 'variation_name' => 'test', 'map_weight' => 1, 'allele_string' => '-/C/*', + 'original_allele_string' => 'G/GC/*', 'end' => 25587759, 'start' => 25587760, 'seq_region_end' => 25587759, @@ -393,6 +407,7 @@ is_deeply($vf, bless( { 'variation_name' => 'test', 'map_weight' => 1, 'allele_string' => 'C/T/CAA', + 'original_allele_string' => 'C/T/CAA', 'end' => 25587758, 'start' => 25587758, 'seq_region_end' => 25587758, @@ -557,6 +572,7 @@ is_deeply($snv, bless( { 'chr' => '21', 'strand' => 1, 'variation_name' => 'test', + 'nontrimmed_allele_string' => 'A/C', 'map_weight' => 1, 'allele_string' => 'A/C', 'end' => 25587759, @@ -923,6 +939,7 @@ is_deeply($tandem_RUC, bless( { 'strand' => '1', 'variation_name' => 'tr0', 'allele_string' => 'AGTAAATAGA/' . 'CAT' x 2 . 'GT' x 5 . '/' . 'CA' x 4, + 'original_allele_string' => 'AGTAAATAGA/CATCATGTGTGTGTGT/CACACACA', 'start' => 25587760, 'end' => 25587769, 'seq_region_start' => 25587760, @@ -941,6 +958,7 @@ is_deeply($tandem, bless( { 'strand' => '1', 'variation_name' => 'tr0', 'allele_string' => 'AGTAAATAGA/' . 'CAT' x 2 . 'N' x 5 . '/' . 'CA' x 4, + 'original_allele_string' => 'AGTAAATAGA/CATCATNNNNN/CACACACA', 'start' => 25587760, 'end' => 25587769, 'seq_region_start' => 25587760, @@ -964,6 +982,7 @@ is_deeply($tandem, bless( { 'strand' => '1', 'variation_name' => 'tr0', 'allele_string' => 'A/' . 'CAT' x 2 . 'N' x 5 . '/' . 'CA' x 4, + 'original_allele_string' => 'A/CATCATNNNNN/CACACACA', 'start' => 25587760, 'end' => 25587760, 'seq_region_start' => 25587760, @@ -1014,6 +1033,7 @@ is_deeply($vf, bless( { 'variation_name' => 'test', 'map_weight' => 1, 'allele_string' => 'A/G', + 'nontrimmed_allele_string' => 'A/G', 'end' => 25586000, 'start' => 25586000, 'seq_region_end' => 25586000, @@ -1056,6 +1076,7 @@ is_deeply($vf, bless( { 'variation_name' => 'indtest', 'map_weight' => 1, 'allele_string' => 'A/G', + 'nontrimmed_allele_string' => 'A/G', 'end' => 25587759, 'start' => 25587759, 'seq_region_end' => 25587759, @@ -1073,6 +1094,7 @@ is_deeply($vf, bless( { 'variation_name' => 'indtest', 'map_weight' => 1, 'allele_string' => 'A/G', + 'nontrimmed_allele_string' => 'A/G', 'end' => 25587759, 'start' => 25587759, 'seq_region_end' => 25587759, @@ -1090,6 +1112,7 @@ is_deeply($vf, bless( { 'variation_name' => 'indtest', 'map_weight' => 1, 'allele_string' => 'A', + 'nontrimmed_allele_string' => 'A/G', 'end' => 25587759, 'start' => 25587759, 'seq_region_end' => 25587759, @@ -1120,6 +1143,7 @@ is_deeply($vf, bless( { 'variation_name' => 'indtest', 'map_weight' => 1, 'allele_string' => 'A/A', + 'nontrimmed_allele_string' => 'A/G', 'end' => 25587759, 'start' => 25587759, 'seq_region_end' => 25587759, @@ -1186,6 +1210,7 @@ is_deeply($vf, bless( { 'variation_name' => 'indtest', 'map_weight' => 1, 'allele_string' => 'A/G', + 'nontrimmed_allele_string' => 'A/G', 'end' => 25587759, 'start' => 25587759, 'seq_region_end' => 25587759, From df56c6119056d6875ba9bbdbd62a13151c162eaf Mon Sep 17 00:00:00 2001 From: Likhitha Surapaneni <10923198+likhitha-surapaneni@users.noreply.github.com> Date: Wed, 4 Dec 2024 15:31:30 +0000 Subject: [PATCH 10/14] Changes in rejoin_vf; Fix unit tests --- modules/Bio/EnsEMBL/VEP/InputBuffer.pm | 3 ++- modules/Bio/EnsEMBL/VEP/OutputFactory.pm | 3 ++- t/InputBuffer.t | 10 +++++++++ t/OutputFactory.t | 4 +++- t/OutputFactory_JSON.t | 2 +- t/Parser_Region.t | 2 ++ t/Parser_VCF.t | 26 +++++++++++++++++++----- 7 files changed, 41 insertions(+), 9 deletions(-) diff --git a/modules/Bio/EnsEMBL/VEP/InputBuffer.pm b/modules/Bio/EnsEMBL/VEP/InputBuffer.pm index 27db09296..eb93f00ca 100644 --- a/modules/Bio/EnsEMBL/VEP/InputBuffer.pm +++ b/modules/Bio/EnsEMBL/VEP/InputBuffer.pm @@ -663,7 +663,8 @@ sub split_variants { $first->{original_allele_string} = $original_vf->{allele_string}; $first->{original_start} = $original_vf->{start}; $first->{original_end} = $original_vf->{end}; - $first->{minimised} = 1 + $first->{minimised} = 1; + $first->{first} = 1; } push @tmp, $new_vf; diff --git a/modules/Bio/EnsEMBL/VEP/OutputFactory.pm b/modules/Bio/EnsEMBL/VEP/OutputFactory.pm index a5d5da5ee..38332c179 100755 --- a/modules/Bio/EnsEMBL/VEP/OutputFactory.pm +++ b/modules/Bio/EnsEMBL/VEP/OutputFactory.pm @@ -2194,7 +2194,8 @@ sub rejoin_variants_in_InputBuffer { foreach my $vf(@{$buffer->buffer}) { # reset original one - if(defined($vf->{original_allele_string})) { + # check $vf->{first} to get first $vf in split_variants + if(defined($vf->{first})) { # do consequence stuff $self->get_all_output_hashes_by_VariationFeature($vf); diff --git a/t/InputBuffer.t b/t/InputBuffer.t index 8deb36850..47fba3220 100644 --- a/t/InputBuffer.t +++ b/t/InputBuffer.t @@ -72,6 +72,7 @@ is_deeply($vfs->[0], bless( { 'variation_name' => 'rs142513484', 'map_weight' => 1, 'allele_string' => 'C/T', + 'nontrimmed_allele_string' => 'C/T', 'end' => 25585733, 'start' => 25585733, 'seq_region_end' => 25585733, @@ -90,6 +91,7 @@ is_deeply($vfs->[0], bless( { 'variation_name' => 'rs148490508', 'map_weight' => 1, 'allele_string' => 'A/G', + 'nontrimmed_allele_string' => 'A/G', 'end' => 25592911, 'start' => 25592911, 'seq_region_end' => 25592911, @@ -418,7 +420,9 @@ is_deeply($ib->buffer, [ 'chr' => '1', 'minimised' => 1, 'original_allele_string' => 'CAGAAGAAAG/TAGAAGAAAG/C', + 'nontrimmed_allele_string' => 'CAGAAGAAAG/TAGAAGAAAG/C', 'original_end' => 10, + 'first' => 1, 'end' => 1, 'original_start' => 1, 'strand' => 1, @@ -439,7 +443,10 @@ is_deeply($ib->buffer, [ 'variation_name' => '.', 'alt_allele' => '-', 'map_weight' => 1, + 'minimised' => 1, 'allele_string' => 'AGAAGAAAG/-', + 'original_allele_string' => 'CAGAAGAAAG/TAGAAGAAAG/C', + 'nontrimmed_allele_string' => 'CAGAAGAAAG/TAGAAGAAAG/C', 'start' => 2, 'seq_region_start' => 2, 'seq_region_end' => 10, @@ -470,7 +477,10 @@ is_deeply( 'strand' => 1, 'variation_name' => '.', 'map_weight' => 1, + 'minimised' => 1, 'allele_string' => 'CAG/TAG/T', + 'original_allele_string' => 'CAG/TAG/T', + 'nontrimmed_allele_string' => 'CAG/TAG/T', 'end' => 3, 'start' => 1, 'seq_region_start' => 1, diff --git a/t/OutputFactory.t b/t/OutputFactory.t index 665151aea..31e750ab5 100644 --- a/t/OutputFactory.t +++ b/t/OutputFactory.t @@ -1678,7 +1678,9 @@ $ib = get_annotated_buffer({ is(scalar @{$ib->buffer}, 2, 'minimal - expanded count'); is($ib->buffer->[0]->allele_string, 'C/T', 'minimal - expanded first allele string'); - +# print("Before rejoin\n"); +# use Data::Dumper; +# print(Dumper($ib->buffer)); $of->rejoin_variants_in_InputBuffer($ib); is(scalar @{$ib->buffer}, 1, 'minimal - rejoined count'); diff --git a/t/OutputFactory_JSON.t b/t/OutputFactory_JSON.t index 6ee7efd8a..86ff4871d 100755 --- a/t/OutputFactory_JSON.t +++ b/t/OutputFactory_JSON.t @@ -532,7 +532,7 @@ SKIP: { 'cdna_start' => 2347, 'transcript_id' => 'NM_000484.3', 'gene_id' => '351', - 'uploaded_allele' => '-/A', + 'uploaded_allele' => 'G/GA', 'cds_start' => 2147, 'protein_start' => 716, 'refseq_match' => [ diff --git a/t/Parser_Region.t b/t/Parser_Region.t index 3efcdde05..4085ed712 100644 --- a/t/Parser_Region.t +++ b/t/Parser_Region.t @@ -127,7 +127,9 @@ is_deeply($vf, bless( { 'strand' => '1', 'variation_name' => '21:25587759-25587758:1/A', 'map_weight' => 1, + 'minimised' => 1, 'allele_string' => '-/A', + 'original_allele_string' => '-/A', 'end' => '25587758', 'start' => '25587759', 'seq_region_end' => '25587758', diff --git a/t/Parser_VCF.t b/t/Parser_VCF.t index cc37d2671..33586bfdd 100755 --- a/t/Parser_VCF.t +++ b/t/Parser_VCF.t @@ -215,7 +215,9 @@ is_deeply($vf, bless( { 'strand' => 1, 'variation_name' => 'test', 'map_weight' => 1, + 'minimised' => 1, 'allele_string' => 'A/C/GG', + 'nontrimmed_allele_string' => 'A/C/GG', 'original_allele_string' => 'A/C/GG', 'end' => 25587759, 'start' => 25587759, @@ -223,7 +225,7 @@ is_deeply($vf, bless( { 'seq_region_start' => 25587759 }, 'Bio::EnsEMBL::Variation::VariationFeature' ), 'mixed types - different first base'); -# mixed types - different first base +# mixed types - same first base $vf = Bio::EnsEMBL::VEP::Parser::VCF->new({ config => $cfg, file => $test_cfg->create_input_file([qw(21 25587759 test G GC,GT . . .)]), @@ -235,8 +237,10 @@ is_deeply($vf, bless( { 'strand' => 1, 'variation_name' => 'test', 'map_weight' => 1, + 'minimised' => 1, 'allele_string' => '-/C/T', - 'original_allele_string' => 'A/C/GG', + 'original_allele_string' => 'G/GC/GT', + 'nontrimmed_allele_string' => 'G/GC/GT', 'end' => 25587759, 'start' => 25587760, 'seq_region_end' => 25587759, @@ -322,8 +326,10 @@ is_deeply($vf, bless( { 'strand' => 1, 'variation_name' => 'test', 'map_weight' => 1, + 'minimised' => 1, 'allele_string' => 'G/C/', 'original_allele_string' => 'G/C/', + 'nontrimmed_allele_string' => 'G/C/', 'end' => 25587759, 'start' => 25587759, 'seq_region_end' => 25587759, @@ -342,8 +348,10 @@ is_deeply($vf, bless( { 'strand' => 1, 'variation_name' => 'test', 'map_weight' => 1, + 'minimised' => 1, 'allele_string' => 'C/-/*', 'original_allele_string' => 'GC/G/*', + 'nontrimmed_allele_string' => 'GC/G/*', 'end' => 25587760, 'start' => 25587760, 'seq_region_end' => 25587760, @@ -362,8 +370,10 @@ is_deeply($vf, bless( { 'strand' => 1, 'variation_name' => 'test', 'map_weight' => 1, + 'minimised' => 1, 'allele_string' => '-/C/*', 'original_allele_string' => 'G/GC/*', + 'nontrimmed_allele_string' => 'G/GC/*', 'end' => 25587759, 'start' => 25587760, 'seq_region_end' => 25587759, @@ -382,6 +392,7 @@ is_deeply($vf, bless( { 'chr' => '21', 'minimised' => 1, 'original_allele_string' => 'CAT/CCT', + 'nontrimmed_allele_string' => 'CAT/CCT', 'original_end' => 25587760, 'end' => 25587759, 'seq_region_end' => 25587759, @@ -406,8 +417,10 @@ is_deeply($vf, bless( { 'strand' => 1, 'variation_name' => 'test', 'map_weight' => 1, + 'minimised' => 1, 'allele_string' => 'C/T/CAA', 'original_allele_string' => 'C/T/CAA', + 'nontrimmed_allele_string' => 'C/T/CAA', 'end' => 25587758, 'start' => 25587758, 'seq_region_end' => 25587758, @@ -944,7 +957,8 @@ is_deeply($tandem_RUC, bless( { 'end' => 25587769, 'seq_region_start' => 25587760, 'seq_region_end' => 25587769, - 'map_weight' => 1 + 'map_weight' => 1, + 'minimised' => 1 }, 'Bio::EnsEMBL::Variation::VariationFeature' ) , 'VariationFeature - tandem repeat using RUC'); @@ -963,7 +977,8 @@ is_deeply($tandem, bless( { 'end' => 25587769, 'seq_region_start' => 25587760, 'seq_region_end' => 25587769, - 'map_weight' => 1 + 'map_weight' => 1, + 'minimised' => 1 }, 'Bio::EnsEMBL::Variation::VariationFeature' ) , 'VariationFeature - tandem repeat with missing sequence'); @@ -987,7 +1002,8 @@ is_deeply($tandem, bless( { 'end' => 25587760, 'seq_region_start' => 25587760, 'seq_region_end' => 25587760, - 'map_weight' => 1 + 'map_weight' => 1, + 'minimised' => 1 }, 'Bio::EnsEMBL::Variation::VariationFeature' ) , 'VariationFeature - tandem repeat with missing END and SVLEN'); From 56d965a6dd5bf23205c17ae10639a40434f9d322 Mon Sep 17 00:00:00 2001 From: Likhitha Surapaneni <10923198+likhitha-surapaneni@users.noreply.github.com> Date: Thu, 5 Dec 2024 11:03:07 +0000 Subject: [PATCH 11/14] Fixed unit tests --- t/Parser_Region.t | 2 ++ t/Parser_VCF.t | 16 ++++++++++++++++ 2 files changed, 18 insertions(+) diff --git a/t/Parser_Region.t b/t/Parser_Region.t index 4085ed712..04763f892 100644 --- a/t/Parser_Region.t +++ b/t/Parser_Region.t @@ -132,6 +132,8 @@ is_deeply($vf, bless( { 'original_allele_string' => '-/A', 'end' => '25587758', 'start' => '25587759', + 'original_start' => '25587759', + 'original_end' => '25587758', 'seq_region_end' => '25587758', 'seq_region_start' => '25587759', '_line' => ['21:25587759-25587758:1/A'] diff --git a/t/Parser_VCF.t b/t/Parser_VCF.t index 33586bfdd..36004c0fe 100755 --- a/t/Parser_VCF.t +++ b/t/Parser_VCF.t @@ -116,9 +116,13 @@ is_deeply($vf, bless( { 'variation_name' => 'test', 'map_weight' => 1, 'allele_string' => 'C/-', + 'original_allele_string' => 'AC/A', + 'nontrimmed_allele_string' => 'AC/A', 'minimised' => 1, 'end' => 25587760, 'start' => 25587760, + 'original_end' => 25587760, + 'original_start' => 25587760, 'seq_region_end' => 25587760, 'seq_region_start' => 25587760 }, 'Bio::EnsEMBL::Variation::VariationFeature' ), 'deletion'); @@ -137,8 +141,12 @@ is_deeply($vf, bless( { 'map_weight' => 1, 'minimised' => 1, 'allele_string' => '-/C', + 'original_allele_string' => 'A/AC', + 'nontrimmed_allele_string' => 'A/AC', 'end' => 25587759, 'start' => 25587760, + 'original_start' => 25587760, + 'original_end' => 25587759, 'seq_region_end' => 25587759, 'seq_region_start' => 25587760 }, 'Bio::EnsEMBL::Variation::VariationFeature' ), 'insertion'); @@ -157,8 +165,12 @@ is_deeply($vf, bless( { 'map_weight' => 1, 'minimised' => 1, 'allele_string' => 'C/-', + 'original_allele_string' => 'AC/A', + 'nontrimmed_allele_string' => 'AC/A', 'end' => 25587760, 'start' => 25587760, + 'original_end' => 25587760, + 'original_start' => 25587760, 'seq_region_end' => 25587760, 'seq_region_start' => 25587760 }, 'Bio::EnsEMBL::Variation::VariationFeature' ), 'deletion ignore SVTYPE'); @@ -177,8 +189,12 @@ is_deeply($vf, bless( { 'map_weight' => 1, 'minimised' => 1, 'allele_string' => '-/C', + 'original_allele_string' => 'A/AC', + 'nontrimmed_allele_string' => 'A/AC', 'end' => 25587759, 'start' => 25587760, + 'original_end' => 25587759, + 'original_start' => 25587760, 'seq_region_end' => 25587759, 'seq_region_start' => 25587760 }, 'Bio::EnsEMBL::Variation::VariationFeature' ), 'insertion ignore SVLEN'); From a8fbad34fd7d303e623fe75547ce37501fb736bb Mon Sep 17 00:00:00 2001 From: Likhitha Surapaneni <10923198+likhitha-surapaneni@users.noreply.github.com> Date: Tue, 17 Dec 2024 13:46:00 +0000 Subject: [PATCH 12/14] Modified non_minimised_indel logic; code cleanup --- modules/Bio/EnsEMBL/VEP/Parser.pm | 6 +++++- t/OutputFactory.t | 4 +--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/modules/Bio/EnsEMBL/VEP/Parser.pm b/modules/Bio/EnsEMBL/VEP/Parser.pm index 31c159e3b..af01711d9 100755 --- a/modules/Bio/EnsEMBL/VEP/Parser.pm +++ b/modules/Bio/EnsEMBL/VEP/Parser.pm @@ -863,7 +863,11 @@ sub post_process_vfs { my $alt_allele_count; foreach my $alt(@alleles) { - if (length($ref_allele_string) != length($alt) or $original_allele_string =~ /^-/){ + if ($original_allele_string =~ /-/) + { + last; + } + elsif (length($ref_allele_string) != length($alt)) { $is_non_minimised_indel = 1; last; } diff --git a/t/OutputFactory.t b/t/OutputFactory.t index 31e750ab5..665151aea 100644 --- a/t/OutputFactory.t +++ b/t/OutputFactory.t @@ -1678,9 +1678,7 @@ $ib = get_annotated_buffer({ is(scalar @{$ib->buffer}, 2, 'minimal - expanded count'); is($ib->buffer->[0]->allele_string, 'C/T', 'minimal - expanded first allele string'); -# print("Before rejoin\n"); -# use Data::Dumper; -# print(Dumper($ib->buffer)); + $of->rejoin_variants_in_InputBuffer($ib); is(scalar @{$ib->buffer}, 1, 'minimal - rejoined count'); From 89c3a7d731c39c9e8567b89ab228259b7589768e Mon Sep 17 00:00:00 2001 From: Likhitha Surapaneni <10923198+likhitha-surapaneni@users.noreply.github.com> Date: Wed, 18 Dec 2024 08:55:23 +0000 Subject: [PATCH 13/14] Fix tests --- t/Parser_Region.t | 4 ---- 1 file changed, 4 deletions(-) diff --git a/t/Parser_Region.t b/t/Parser_Region.t index 04763f892..3efcdde05 100644 --- a/t/Parser_Region.t +++ b/t/Parser_Region.t @@ -127,13 +127,9 @@ is_deeply($vf, bless( { 'strand' => '1', 'variation_name' => '21:25587759-25587758:1/A', 'map_weight' => 1, - 'minimised' => 1, 'allele_string' => '-/A', - 'original_allele_string' => '-/A', 'end' => '25587758', 'start' => '25587759', - 'original_start' => '25587759', - 'original_end' => '25587758', 'seq_region_end' => '25587758', 'seq_region_start' => '25587759', '_line' => ['21:25587759-25587758:1/A'] From 93c1ce9cf45695012af023f96b980292390def46 Mon Sep 17 00:00:00 2001 From: Likhitha Surapaneni <10923198+likhitha-surapaneni@users.noreply.github.com> Date: Thu, 30 Jan 2025 09:34:00 +0000 Subject: [PATCH 14/14] Fixed minimisation for default input --- modules/Bio/EnsEMBL/VEP/InputBuffer.pm | 6 ++-- t/AnnotationSource_Cache_Variation.t | 21 ++++-------- t/AnnotationSource_Cache_VariationTabix.t | 39 ++++++++--------------- t/InputBuffer.t | 10 ++++-- 4 files changed, 31 insertions(+), 45 deletions(-) diff --git a/modules/Bio/EnsEMBL/VEP/InputBuffer.pm b/modules/Bio/EnsEMBL/VEP/InputBuffer.pm index eb93f00ca..64d5ba767 100644 --- a/modules/Bio/EnsEMBL/VEP/InputBuffer.pm +++ b/modules/Bio/EnsEMBL/VEP/InputBuffer.pm @@ -193,6 +193,7 @@ sub next { # new chromosome if($prev_chr && $vf->{chr} ne $prev_chr) { + $self->{minimal}=1 if $vf->{minimised}; $self->split_variants() if $self->{minimal}; return $buffer; } @@ -224,7 +225,7 @@ sub next { # we can't push the VF back onto the parser, so add it to $pre_buffer # and it will get picked up on the following next() call push @$pre_buffer, $vf; - + $self->{minimal}=1 if $vf->{minimised}; $self->split_variants() if $self->{minimal}; $prev_start = 0; return $buffer; @@ -245,6 +246,7 @@ sub next { } } $prev_start = $vf->{start}; + $self->{minimal}=1 if $vf->{minimised}; } } } @@ -257,7 +259,7 @@ sub next { die($error_msg); } - $self->split_variants() if $self->{minimal} ; + $self->split_variants() if $self->{minimal}; return $buffer; } diff --git a/t/AnnotationSource_Cache_Variation.t b/t/AnnotationSource_Cache_Variation.t index a9bd639cf..ed5a32122 100644 --- a/t/AnnotationSource_Cache_Variation.t +++ b/t/AnnotationSource_Cache_Variation.t @@ -414,11 +414,11 @@ is_deeply( $ib = get_ib([qw(21 8987004 . TA C,TAGCG . . .)]); $c->annotate_InputBuffer($ib); is_deeply( - $ib->buffer->[0]->{existing}->[0]->{matched_alleles}, + $ib->buffer->[1]->{existing}->[0]->{matched_alleles}, [ { - 'a_index' => 1, - 'a_allele' => 'TAGCG', + 'a_index' => 0, + 'a_allele' => 'GCG', 'b_allele' => 'GCG', 'b_index' => 0 } @@ -444,24 +444,15 @@ is_deeply( $ib = get_ib([qw(21 8987004 . TAT TAGCGT,TAGTGT . . .)]); $c->annotate_InputBuffer($ib); is_deeply( - $ib->buffer->[0]->{existing}->[0]->{matched_alleles}, + $ib->buffer->[1]->{existing}->[0]->{matched_alleles}, [ { 'a_index' => 0, - 'a_allele' => 'AGCGT', - 'b_allele' => 'GCG', - 'b_index' => 0 - }, - { - 'a_index' => 1, - 'a_allele' => 'AGTGT', + 'a_allele' => 'GTG', 'b_allele' => 'GTG', 'b_index' => 1 } - ], - 'nastiness 4' -); - + ],'nastiness 4' ); # test old_maf setting $p = Bio::EnsEMBL::VEP::Parser::VCF->new({config => $cfg, file => $test_cfg->{test_vcf}, valid_chromosomes => [21]}); diff --git a/t/AnnotationSource_Cache_VariationTabix.t b/t/AnnotationSource_Cache_VariationTabix.t index 8c04263d7..16f10f16f 100644 --- a/t/AnnotationSource_Cache_VariationTabix.t +++ b/t/AnnotationSource_Cache_VariationTabix.t @@ -445,11 +445,11 @@ SKIP: { $ib = get_ib([qw(21 8987004 . TA C,TAGCG . . .)]); $c->annotate_InputBuffer($ib); is_deeply( - $ib->buffer->[0]->{existing}->[0]->{matched_alleles}, + $ib->buffer->[1]->{existing}->[0]->{matched_alleles}, [ { - 'a_index' => 1, - 'a_allele' => 'TAGCG', + 'a_index' => 0, + 'a_allele' => 'GCG', 'b_allele' => 'GCG', 'b_index' => 0 } @@ -475,17 +475,11 @@ SKIP: { $ib = get_ib([qw(21 8987004 . TAT TAGCGT,TAGTGT . . .)]); $c->annotate_InputBuffer($ib); is_deeply( - $ib->buffer->[0]->{existing}->[0]->{matched_alleles}, + $ib->buffer->[1]->{existing}->[0]->{matched_alleles}, [ { 'a_index' => 0, - 'a_allele' => 'AGCGT', - 'b_allele' => 'GCG', - 'b_index' => 0 - }, - { - 'a_index' => 1, - 'a_allele' => 'AGTGT', + 'a_allele' => 'GTG', 'b_allele' => 'GTG', 'b_index' => 1 } @@ -494,16 +488,17 @@ SKIP: { ); # Test frequency match for 'matched alleles' - my $ib_freq = get_ib([qw(21 25005812 . CA CAAA,CAAA . . .)]); + my $ib_freq = get_ib([qw(21 25005811 . CA CAAA,CAAA . . .)]); $c->{freq_pop} = '1KG_AMR'; $c->annotate_InputBuffer($ib_freq); $vf = $ib_freq->buffer->[0]; $c->get_frequency_data($vf); + is_deeply( $vf->{_freq_check_freqs}, { '1KG_AMR' => { - 'AAA' => '0.4107' + 'AA' => '0.4107' }, }, 'get_frequency_data - matched alleles' @@ -606,11 +601,11 @@ SKIP: { $ib = get_ib([qw(21 8987004 . TA C,TAGCG . . .)]); $c->annotate_InputBuffer($ib); is_deeply( - $ib->buffer->[0]->{existing}->[0]->{matched_alleles}, + $ib->buffer->[1]->{existing}->[0]->{matched_alleles}, [ { - 'a_index' => 1, - 'a_allele' => 'TAGCG', + 'a_index' => 0, + 'a_allele' => 'GCG', 'b_allele' => 'GCG', 'b_index' => 0 } @@ -635,18 +630,12 @@ SKIP: { $ib = get_ib([qw(21 8987004 . TAT TAGCGT,TAGTGT . . .)]); $c->annotate_InputBuffer($ib); - is_deeply( - $ib->buffer->[0]->{existing}->[0]->{matched_alleles}, + is_deeply( + $ib->buffer->[1]->{existing}->[0]->{matched_alleles}, [ { 'a_index' => 0, - 'a_allele' => 'AGCGT', - 'b_allele' => 'GCG', - 'b_index' => 0 - }, - { - 'a_index' => 1, - 'a_allele' => 'AGTGT', + 'a_allele' => 'GTG', 'b_allele' => 'GTG', 'b_index' => 1 } diff --git a/t/InputBuffer.t b/t/InputBuffer.t index 47fba3220..4d5c1a9d4 100644 --- a/t/InputBuffer.t +++ b/t/InputBuffer.t @@ -478,13 +478,17 @@ is_deeply( 'variation_name' => '.', 'map_weight' => 1, 'minimised' => 1, - 'allele_string' => 'CAG/TAG/T', + 'allele_string' => 'C/T', 'original_allele_string' => 'CAG/TAG/T', 'nontrimmed_allele_string' => 'CAG/TAG/T', - 'end' => 3, + 'alt_allele' => 'T', + 'end' => 1, 'start' => 1, 'seq_region_start' => 1, - 'seq_region_end' => 3, + 'seq_region_end' => 1, + 'original_start' => 1, + 'original_end' => 3, + 'first' => 1 }, 'Bio::EnsEMBL::Variation::VariationFeature' ), 'minimal - doesnt affect non-minimisable' );