-
Notifications
You must be signed in to change notification settings - Fork 2
03 Library Documentation
This page will document the (perl) libraries of kmasker.
#filehandler.pm Filehanler.pm contains all funtions with reference to operations on sequence and occ files. For example opening and parsing of files. In the following each function will be listed and explained. Filehandler uses File::Basename. ##read_sequence ###Description read_sequence reads in a fasta file. It will read in one sequence entry (e.g. chromosome or read) into memory to work with it. The code was taken from: http://code.izzid.com/2011/10/31/How-to-read-a-fasta-file-in-perl.html ###Input The function expects an open file handle and a hash which will be used for return values like header and sequence. ###Output The functions uses the hash for return values and can be used in while loops as condition. ###Example
#open file handle
open(my $fasta, "<", "your_file") or die "Can not open your_file\n";
#hash for return values
my %seqdata;
#loop for accessing the sequence data
while(read_sequence($fasta, \%seqdata)) {
my @sequence = split '', $seqdata{seq}; #splits input sequence into array
my $header = seqdata{header}; #getting header of the fasta sequence
### do what you want here ###
}
close($fasta);
###Code
sub read_sequence { #works for fasta
my ($fh, $seq_info) = @_;
$seq_info->{seq} = undef; # clear out previous sequence
# put the header into place
$seq_info->{header} = $seq_info->{next_header} if $seq_info->{next_header};
my $file_not_empty = 0;
while (<$fh>) {
$file_not_empty = 1;
next if /^\s*$/; # skip blank lines
chomp;
if (/^>/) { # fasta header line
my $h = $_;
$h =~ s/^>//;
if ($seq_info->{header}) {
$seq_info->{next_header} = $h;
return $seq_info;
}
else { # first time through only
$seq_info->{header} = $h;
}
}
else {
s/\s+//g; # remove any white space
$seq_info->{seq} .= $_;
}
}
if ($file_not_empty) {
return $seq_info;
}
else {
# clean everything up
$seq_info->{header} = $seq_info->{seq} = $seq_info->{next_header} = undef;
return;
}
}
##read_occ ###Description read_occ reads in an occ file. It will read in one sequence entry (e.g. chromosome or read) into memory to work with it. The code was taken from: http://code.izzid.com/2011/10/31/How-to-read-a-fasta-file-in-perl.html It's almost the same code as in read_sequence, but it will not remove the whitespaces between the numerical entries in the occ. Futhermore it adds an space when concatenating two lines. ###Input The function expects an open file handle and a hash which will be used for return values like header and sequence. ###Output The functions uses the hash for return values and can be used in while loops as condition. ###Example
#open file handle
open(my $occ, "<", "your_file") or die "Can not open your_file\n";
#hash for return values
my %occdata;
#loop for accessing the sequence data
while(read_sequence($occ, \%occdata)) {
my @occvalues = split /\s+/, $occdata{seq}; #splits input occ values into array
my $header = occdata{header}; #getting header of the occ sequence
### do what you want here ###
}
close($occ);
###Code
sub read_occ { #works for occ
my ($fh, $seq_info) = @_;
$seq_info->{seq} = undef; # clear out previous sequence
# put the header into place
$seq_info->{header} = $seq_info->{next_header} if $seq_info->{next_header};
my $file_not_empty = 0;
while (<$fh>) {
$file_not_empty = 1;
next if /^\s*$/; # skip blank lines
chomp;
if (/^>/) { # fasta header line
my $h = $_;
$h =~ s/^>//;
if ($seq_info->{header}) {
$seq_info->{next_header} = $h;
return $seq_info;
}
else { # first time through only
$seq_info->{header} = $h;
}
}
else {
#s/\s+//g; # remove any white space # We do not want this, because we are using split with whitespaces
if($seq_info->{seq}) {
$seq_info->{seq} .= " " .$_;
}
else {
$seq_info->{seq} = $_;
}
}
}
if ($file_not_empty) {
return $seq_info;
}
else {
# clean everything up
$seq_info->{header} = $seq_info->{seq} = $seq_info->{next_header} = undef;
return;
}
}
##sequence_length ###Description This will print the length of each sequence in a fasta file into an tab separated file. ###Input The function expects a filename of a fasta file. ###Output The function will create a prefix.length file in the path of the fasta file. ###Example get_length.pl
#!/usr/bin/env perl
use strict;
use warnings;
use Getopt::Long;
use kmasker::filehandler;
my $fasta;
GetOptions ("fasta=s" => \$fasta) # string
or die("Error in command line arguments\n");
sequence_length($fasta);
###Code
sub sequence_length {
my $seqfile = $_[0]; #just the filename of the sequence file
open(my $seqfh, "<", "$seqfile") or die "Can not open $seqfile\n";
(my $name,my $path,my $suffix) = fileparse($seqfile, qr/\.[^.]*/);
open(my $seql, ">", "$path/$name$suffix.length") or die "Can not write to $path/$name$suffix.length \n";
my %seqdata;
while (read_sequence($seqfh, \%seqdata)) {
#print $seqdata{header} . " : " . length($seqdata{seq}) . "\n";
print $seql $seqdata{header} . "\t" . length($seqdata{seq}) . "\n";
}
close($seqfh);
close($seql);
}
##occ_length ###Description This will print the length of each sequence in an occ file into an tab separated file. ###Input The function expects a filename of an occ file. ###Output The function will create a prefix.length file in the path of the occ file. ###Example get_occ_length.pl
#!/usr/bin/env perl
use strict;
use warnings;
use Getopt::Long;
use kmasker::filehandler;
my $occ;
GetOptions ("occ=s" => \$occ) # string
or die("Error in command line arguments\n");
occ_length($occ);
###Code
sub occ_length {
my $seqfile = $_[0]; #just the filename of the sequence file
open(my $seqfh, "<", "$seqfile") or die "Can not open $seqfile\n";
(my $name,my $path,my $suffix) = fileparse($seqfile, qr/\.[^.]*/);
open(my $seql, ">", "$path/$name$suffix.length") or die "Can not write to $path/$name$suffix.length \n";
my %seqdata;
while (read_occ($seqfh, \%seqdata)) {
#print $seqdata{header} . " : " . length($seqdata{seq}) . "\n";
my @occvalues = split /\s+/, $seqdata{seq};
print $seql $seqdata{header} . "\t" . scalar(@occvalues) . "\n";
}
close($seqfh);
close($seql);
}
#occ.pm occ.pm contains all functions which operate with .occ files, execpt of operations which belong to file handling. It uses File::Basename, kmasker::filehandler and POSIX qw/ceil/. ##normalize_occ ###Description This function will create a normalized version of the input occ file. This means a values will be divided by a fixed value (your sequencing depth). ###Input The function expects a filename of an occ file and the sequencing depth. ###Output The function will create a prefix_normalized.occ file in the path of the occ file. ###Example norm.pl
#!/usr/bin/env perl
use strict;
use warnings;
use Getopt::Long;
use kmasker::occ;
my $occ;
my $depth;
GetOptions ("occ=s" => \$occ,
"depth=i" => \$depth)
or die("Error in command line arguments\n");
normalize_occ($occ, $depth)
###Code
sub normalize_occ{
my $file = $_[0];
my $depth = $_[1];
my %seqdata;
(my $name,my $path,my $suffix) = fileparse($file, qr/\.[^.]*/);
open(my $occ, "<", "$file") or die "Can not open $file\n";
open(my $occ_norm, ">", $path.$name."_normalized.occ") or die "Can not write to " . $path.$name."_normalized.occ\n";
if ($depth == 1) {
print "Nothing to do! Depth was 1. \n";
return;
}
else {
while (read_occ($occ, \%seqdata)) {
print $occ_norm ">".$seqdata{header}."\n";
my @values = split /\s+/ , $seqdata{seq};
foreach(@values) {
#print $_ . ": $depth = " . ceil($_/$depth) . "\n" ;
$_ = ceil($_/$depth);
}
#print $occ_norm "@values\n";
my $whitespace = 0;
for (my $i = 0; $i < scalar(@values); $i++) {
if($whitespace) {
print $occ_norm " " . $values[$i];
}
else{
print $occ_norm $values[$i];
$whitespace = 1;
}
if((($i+1) % 25 == 0) && ($i+1 != scalar(@values))) {
print $occ_norm "\n";
$whitespace = 0;
}
}
print $occ_norm "\n";
}
}
close($occ);
close($occ_norm);
}
##apply_occ ###Description This function will use an occ file to mask a fasta file. ###Input The function expects a filename of a fasta file, a filename of an occ file and the repeat threshold (repT). ###Output The function will create a freakmasked_RT(Value).prefix.fasta file in the path of the fasta file. ###Example masker.pl
#!/usr/bin/env perl
use strict;
use warnings;
use Getopt::Long;
use kmasker::occ;
my $occ;
my $repT;
my $fasta;
GetOptions ("occ=s" => \$occ,
"repT=i" => \$repT,
"fasta=s" => \$fasta)
or die("Error in command line arguments\n");
apply_occ($fasta, $occ, $repT)
###Code
sub apply_occ{
my $fasta_file = $_[0];
my $occ_file = $_[1];
my $rept = $_[2];
my %seqdata;
my %occ_data;
open(my $occ, "<", "$occ_file") or die "Can not open $occ_file\n";
open(my $fasta, "<", "$fasta_file") or die "Can not open $fasta_file\n";
(my $name,my $path,my $suffix) = fileparse($fasta_file, qr/\.[^.]*/);
open(my $freakmaskedFASTA, ">", "$path/freakmasked_RT$rept.$name$suffix") or die "Can not write to " . "$path/freakmasked_RT$rept.$name$suffix\n" ;
while(read_sequence($fasta, \%seqdata)) {
read_occ($occ, \%occ_data);
my @sequence = split '', $seqdata{seq};
my @occvalues = split /\s+/, $occ_data{seq};
#print $seqdata{header} . " " . $occ_data{header} . "\n";
#print length($seqdata{seq}) . " " . length($occ_data{seq}) . "\n";
#for (my $k = 0; $k < scalar(@sequence); $k++) {
# print $occvalues[$k];
# print " ";
# print $sequence[$k];
# print "\n";
#}
if($seqdata{header} ne $occ_data{header}) {
print "Warning: Headers in occ and fasta are different! " . $seqdata{header} . " != " . $occ_data{header} . "\n";
}
else {
print "Working on: " . $seqdata{header} . "\n";
}
if(scalar(@sequence) != scalar(@occvalues)) {
die "Sorry your occ input has an different length than the fasta file !\n " . scalar(@sequence) . "!=" . scalar(@occvalues) . "\n";
}
for (my $i = 0; $i < scalar(@sequence); $i++) {
if($occvalues[$i] > $rept) {
#we will mask the sequnce in this part
$sequence[$i] = "X";
}
}
print $freakmaskedFASTA ">" .$seqdata{header}."\n";
foreach(@sequence){
print $freakmaskedFASTA $_;
}
print $freakmaskedFASTA "\n";
}
close($occ);
close($fasta);
}
###apply_occ_reverse (untested) ####Description This will do the same operations like apply_occ, but the output will be reverse. This means instead of masking repetitive regions, non-repetitive regions will be masked. Because of the similarity to apply_occ no further documentation is provided. ####Input like apply_occ ####Output freakmaskedRT(Value)_reverse.prefix.fasta
##merge_occ ###Description This merges two occ files by summing their values. ###Input The first two arguments of the function are the paths of the occ files to merge. ###Output The third argument is the ouput occ file. ###Example
#!/usr/bin/env perl
use strict;
use warnings;
use Getopt::Long;
use kmasker::occ;
my $occ1;
my $occ2;
my $occout;
GetOptions ("occ1=s" => \$occ1,
"occ2=s" => \$occ2,
"occout=s" => \$occout)
or die("Error in command line arguments\n");
merge_occ($occ1, $occ2, $occout);
###Code
sub merge_occ {
my $first_occ = $_[0];
my $second_occ = $_[1];
my $out_occ = $_[2];
my %f_occdata;
my %s_occdata;
open(my $focc_stream, "<", "${first_occ}") or die "Can not open $first_occ\n";
open(my $socc_stream, "<", "${second_occ}") or die "Can not open $second_occ\n";
open(my $merged_occ, ">", "${out_occ}");
while(read_occ($focc_stream, \%f_occdata)) {
read_occ($socc_stream, \%s_occdata);
my @focc_values = split /\s+/, $f_occdata{seq};
my @socc_values = split /\s+/, $s_occdata{seq};
if($f_occdata{header} ne $s_occdata{header}) {
print "Warning: Headers in occ and fasta are different! " . $s_occdata{header} . " != " . $f_occdata{header} . "\n";
}
if(scalar(@focc_values) != scalar(@socc_values)) {
die "Can not merge! The files differ in length.\n " . scalar(@socc_values) . "!=" . scalar(@focc_values) . "\n";
}
print $merged_occ $f_occdata{header} . "\n";
my @out_values;
for (my $i = 0; $i < scalar(@focc_values); $i++) {
$out_values[$i] = $focc_values[$i] + $socc_values[$i];
}
my $whitespace = 0;
for (my $i = 0; $i < scalar(@out_values); $i++) {
if($whitespace) {
print $merged_occ " " . $out_values[$i];
}
else{
print $merged_occ $out_values[$i];
$whitespace = 1;
}
if((($i+1) % 25 == 0) && ($i+1 != scalar(@out_values))) {
print $merged_occ "\n";
$whitespace = 0;
}
}
print $merged_occ "\n";
}
}