-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathSNPtable2classifyhyballeles.pl
145 lines (143 loc) · 3.74 KB
/
SNPtable2classifyhyballeles.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
#!/bin/perl
use warnings;
use strict;
use List::Util 'shuffle';
#This script divides sites into three categories.
#-Variable in the parents (hybrids have no new alelles)
#-Fixed differences in the parents
#-f
NOT DONE!
#Requires parents to have 5 individuals sampled.
my $samplefile = $ARGV[0];
my %printlist;
my %specieslist;
open SAMPLEFILE, $samplefile;
while(<SAMPLEFILE>){
chomp;
my @a = split(/\t/,$_);
$printlist{$a[0]}++;
$specieslist{$a[0]} = $a[1]; #should have P1, P2 and any other name for hybrids
}
close SAMPLEFILE;
my $counter;
my %samplelist;
my @good_number_list;
my %good_number_hash;
my %species;
my $max_count = 20;
my @hybrid_species= qw(anomalus deserticola paradoxus);
while(<STDIN>){
$counter++;
chomp;
my $line = "$_";
my @a = split(/\t/,$line);
if ($. == 1){
foreach my $i (2..$#a){
$samplelist{$i} = $a[$i];
if ($specieslist{$a[$i]}){
$species{$i} = $specieslist{$a[$i]};
push(@good_number_list, $i);
$good_number_hash{$i}++;
}
}
print "chrom\tpos\tvalue";
next;
}
unless ($a[3]){
next;
}
my $pos = $a[1];
my $chrom = $a[0];
my %P1alleles;
my %P2alleles;
my $P1count = 0;
my $P2count = 0;
my %species_count;
if (($counter % 100000)== 0){
print STDERR "Processing $chrom $pos...\n";
}
foreach my $i (keys %good_number_hash){ #Load up parental alleles
if ($a[$i] ne "NN"){
if ($species{$i} eq "P1"){
$P1count++;
}elsif($species{$i} eq "P2"){
$P2count++;
}else{
$species_count{$species{$i}}++;
}
}
}
unless(($P1count >=5) and ($P2count >= 5)){
next;
}
foreach my $key (@hybrid_species){ #Make sure each hybrid species has atleast two genotypes
unless($species_count{$key}){
goto SKIP;
}elsif($species_count{$key} < 2){
goto SKIP;
}
}
my $min_count; #Samples are subset down to the lowest sample size of either parent
if ($P1count < $P2count){
$min_count = $P1count;
}else{
$min_count = $P2count;
}
if ($min_count > $max_count){
$min_count = $max_count;
}
my $P1count2 = 0;
my $P2count2 = 0;
my %parental_alleles;
foreach my $j ( shuffle keys %good_number_hash){ #Load up parental alleles
if ($a[$j] ne "NN"){
if ((($species{$j} eq "P1") and ($P1count2 < $min_count)) or (($species{$j} eq "P2") and ($P2count2 < $min_count))){
if ($species{$j} eq "P1"){
$P1count2++;
}else{
$P2count2++;
}
my @parentalbases = split(//,$a[$j]);
$parental_alleles{$parentalbases[0]}++;
$parental_alleles{$parentalbases[1]}++;
}
}
}
#
my %hybrid_alleles;
my %hybrid_count;
foreach my $i ( shuffle keys %good_number_hash){
if ($a[$i] ne "NN"){
if (($species{$i} ne "P1") and ($species{$i} ne "P2")){
if ($hybrid_count{$species{$i}}){ #This only selects two per hybrid species.
if($hybrid_count{$species{$i}} == 2){ next;}
}
$hybrid_count{$species{$i}}++;
my @bases = split(//,$a[$i]);
foreach my $n(0..1){
unless ($parental_alleles{$bases[$n]}){
$hybrid_alleles{$bases[$n]}{$species{$i}}++;
}
}
}
}
}
unless(%hybrid_alleles){ #Go to next line if there are no new alleles in any of the hybrids
next;
}
my $printvalue;
foreach my $base (sort keys %hybrid_alleles){
if ($hybrid_alleles{$base}{"anomalus"}){
$printvalue .= 1;
}
if ($hybrid_alleles{$base}{"deserticola"}){
$printvalue .= 2;
}
if ($hybrid_alleles{$base}{"paradoxus"}){
$printvalue .=3;
}
print "\n$chrom\t$pos\t$printvalue";
undef($printvalue);
}
SKIP:
}