bin/distribute_fasta_by_blast_taxid

#!/usr/bin/env perl

# A program to split up reads based on taxid of best blast hit(s)
#
# Idea is based on Sharon Chao Wooton's distributeReads.pl script
#
# Takes 2 files as input:
# (1) a fasta file used as a blast input
# (2) the output of the blast (-m8 format)
#
# Program flow:
# (1) parse blast output, creating a map of blast query id -> taxid of best hit
#     and a map of taxid->description
#
# (2) parse fasta file, for each record with a hit, output it to a new file for the
#     taxid of the best hit
#
#  ANOTHER OPTION - NOT IMPLEMENTED
# (2) parse fasta file, distributing original fasta reads into a file with a 
#     modified header.  The new header contains the original header appended with
#     the TAXID of the best hit, description of species name, and e-value  of the hit
#
#     BLAST hit ties are ignored.  Only the first best hit is counted.
# 
# Dependencies: these perl modules I've written
# to interface to the NCBI taxonomy data:
#
#    acc_to_taxid.pm 
#    taxid_to_name.pm
#    taxid_to_kingdom.pm
#    taxid_info.pm
#
# Mark Stenglein August 13, 2011
# Updated: 4/17/2014
#


use strict;
use warnings;
use Getopt::Long;

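# these lines allow modules located in the same directory as this script to be used
# (the directory of $0 is added to the module search path, not the current working directory)
# see: https://stackoverflow.com/questions/4703682/does-perl-look-in-the-current-directory-for-modules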
use Cwd 'abs_path';
use File::Basename;
use lib dirname( abs_path $0 );

# custom modules, the .pm files should be in the same directory as this script
use acc_to_taxid;
use taxid_to_name;
use taxid_to_kingdom;
use taxid_info;

# ---------------------------------------------------------------------------
# Command-line option state.  Each variable is filled in by GetOptions below.
# ---------------------------------------------------------------------------
my $print_usage = 0;               # -h   : print usage and exit
my $virus_only = 0;                # -v   : only output virus taxids
my $max_evalue = undef;            # -e   : e-value cutoff (undef = no cutoff)
my $min_tally = 0;                 # -c   : minimum tally cutoff (0 = no cutoff)
my $taxid_file = undef;            # -f   : file listing taxids to output
my @taxids_to_filter_array = ();   # -t   : taxids to output (option may be repeated)

# set of taxids that output is restricted to (built from -t and -f)
my %taxids_to_filter = ();
# the subset of %taxids_to_filter that have descendant nodes in the taxonomy
my %higher_level_taxids = ();

# the location of sqlite database containing NCBI taxonomy info
# created with download_and_process_taxonomy_databases script 
# in this repository:
my $ncbi_tax_db = undef;

my $usage = <<USAGE;

  This script will output records from a fasta file into subset files based on blast-alignment-based taxonomic assignment.  
  It creates one file per taxon.

  USAGE: $0 [-h] 
            [-c min_tally] 
            [-e max_evalue] 
            [-v] 
            [-ntd ncbi_tax_db]
            [[-t taxid1 ... ] OR [-f taxid_file]] <fasta_file> <blast_output_file> 


  -t t1 -t t2 ...   only output records for these taxids
                    Taxid can be a species or a higher-level node (e.g. 10239 = all viruses).
                    
  -f <taxids_file>  only output records for taxids listed in this file (1 per line)

  -c <min_tally>    only output if number hits for this taxid is above this cutoff

  -e <max_evalue>   only output a fasta record if the corresponding hit's evalue is below this cutoff

  -v                only output virus taxids (default = no).  Equivalent to running $0 -t 10239

  -ntd ncbi_tax_db  the path of a SQLite database file containing information from the NCBI Taxonomy database
  
  -h                print this message

USAGE

# no arguments at all: show the help text rather than doing nothing
if (scalar @ARGV == 0) { print $usage and exit; }

# GetOptions returns false when it encounters an unknown or malformed option
# (it has already written a diagnostic to STDERR at that point).  Stop here
# instead of carrying on with a partially-applied configuration.
my $options_ok = GetOptions ("h" => \$print_usage, 
                             "v" => \$virus_only, 
                             "f=s" => \$taxid_file, 
                             "c=s" => \$min_tally, 
                             "e=s" => \$max_evalue, 
                             "ntd=s" => \$ncbi_tax_db,
                             "t=s" => \@taxids_to_filter_array);

if (!$options_ok)
{
   print STDERR $usage;
   exit 1;
}

if ($print_usage) { print $usage; exit; }

# ---------------------------------------------------------------------------
# Build the set of taxids that output is restricted to (%taxids_to_filter),
# from the optional taxid file (-f) and any -t options, then work out which
# of those taxids are higher-level nodes (%higher_level_taxids).
# ---------------------------------------------------------------------------
my $taxid_fh = undef;
if ($taxid_file)
{
   open ($taxid_fh, "<", $taxid_file) or print $usage and die("error: couldn't open TAXID file $taxid_file\n$!\n");
}

if ($taxid_fh)
{
   # parse through taxids to filter file: the taxid is the first
   # tab-delimited field of each line
   TAXID_LINE: while (my $line = <$taxid_fh>)
   {
      chomp $line;

      # ignore commented out lines
      next TAXID_LINE if ($line =~ /^\s*#/);

      my ($taxid_to_filter) = split /\t/, $line;

      # a blank line has no first field: skip it rather than registering
      # an empty-string "taxid" that would turn on subset filtering
      next TAXID_LINE if (!defined $taxid_to_filter);

      # tolerate stray surrounding whitespace (including a trailing \r)
      $taxid_to_filter =~ s/^\s+|\s+$//g;
      next TAXID_LINE if ($taxid_to_filter eq '');

      $taxids_to_filter{$taxid_to_filter} += 1;
   }
   close ($taxid_fh);
}

# taxids given directly on the command line with -t
foreach my $t (@taxids_to_filter_array)
{
   $taxids_to_filter{$t} = 1;
}

# are we only outputing a subset of TAXIDS?
# (number of requested taxids; zero means "output everything")
my $output_taxid_subset =  keys %taxids_to_filter;

# are any of the taxids upper level? (not species/leaf level)
foreach my $t (keys %taxids_to_filter)
{
   # returns an array ref
   my $child_nodes = taxid_info::taxid_descendents($ncbi_tax_db, $t);

   # guard against a missing result so that dereferencing cannot die
   if ($child_nodes && (scalar @$child_nodes >= 1))
   {
      $higher_level_taxids{$t} = 1;
   }
}

# initialize some maps
my %acc_to_taxid_map = ();
# taxid -> scientific name, sanitized for use in a filename
my %taxid_to_sci_name_map = ();
# taxid -> kingdom code as reported by taxid_to_kingdom
my %taxid_to_kingdom_map = ();

# keep track of all the fhs
# taxid -> output filehandle
my %taxid_fh_map = ();

# The first two positional arguments are a FASTA file and the BLAST output
# generated from it.  Both are required: report exactly what is missing
# instead of dying with an unrelated system error string.
my $fasta_file = shift;
my $blast_file = shift;
if (!$fasta_file or !$blast_file)
{
   print $usage;
   die ("error: a FASTA file and a BLAST output file must both be specified\n");
}

# label used for BLAST hits whose line carries no taxid column
my $unknown_taxid = "unknown_taxid";

# ---------------------------------------------------------------------------
# Main loop: process (FASTA file, BLAST output file) pairs taken from the
# command line, two arguments at a time.
#
# For each pair:
#   (1) parse the BLAST output, recording the best-scoring hit(s) per query
#   (2) tally how many queries were assigned to each taxid
#   (3) re-read the FASTA file and copy each record that has a hit into one
#       output file per selected taxid, named
#       <fasta_file>_<taxid>_<scientific_name>.fa
# ---------------------------------------------------------------------------
while ($fasta_file && $blast_file)
{
   # query id -> { best_bitscore => N, best_accs => [...], best_taxids => [...] }
   my %queries = ();
   
   open (my $fasta_fh, "<", $fasta_file) or die ("error: couldn't open FASTA file $fasta_file\n");
   open (my $blast_fh, "<", $blast_file) or die ("error: couldn't open BLAST file $blast_file\n");

   warn "parsing BLAST output file $blast_file\n";
   # read blast output file 
   # Keep track of the accs of the best hits for each query
   LINE: while (my $blast_line = <$blast_fh>)
   {
      # ignore comment (first) lines
      next LINE if ($blast_line =~ /^\s*#/);

      chomp $blast_line;
      my @fields = split /\t/, $blast_line;
      if (scalar (@fields) < 12)
      {
         warn ("ignoring line with unexpected number of fields in BLAST output: $blast_line\n");
         next LINE;
      }

      # this is the format we expect
      # The order of fields for BLAST result in tabular format is: 
      # 0 query id, 1 database sequence (subject) id, 2 percent identity, 3 alignment length, 
      # 4 number of mismatches, 5 number of gap openings, 6 query start, 7 query end, 
      # 8 subject start, 9 subject end, 10 Expect value, 11 HSP bit score, 12 taxid
      # 13 & on: optional extra columns
      my $query = $fields[0];
      my $acc = $fields[1];
      my $evalue = $fields[10];
      my $bit_score = $fields[11];
      my $taxid = $fields[12];

      # a line with only 12 columns has no taxid: use one consistent label
      # for it here so that the tally and the output phase agree
      if ((!defined $taxid) or ($taxid eq ''))
      {
         $taxid = $unknown_taxid;
      }

      # apply the -e cutoff: discard hits whose e-value exceeds it
      if ((defined $max_evalue) && ($evalue > $max_evalue))
      {
         next LINE;
      }

      my $best_bitscore = $queries{$query}{best_bitscore};
      if ((!defined $best_bitscore) || ($bit_score > $best_bitscore))
      {
         # first hit for this query, or a strictly better one: start over
         $queries{$query} = { best_bitscore => $bit_score,
                              best_accs     => [ $acc ],
                              best_taxids   => [ $taxid ] };
      }
      elsif ($bit_score == $best_bitscore)
      {
         # a tie with the current best hit: keep both
         push @{ $queries{$query}{best_accs} }, $acc;
         push @{ $queries{$query}{best_taxids} }, $taxid;
      }
   }
   close ($blast_fh);
   
   warn "creating tallies for each taxid\n";
   # iterate through best hits and tally scores for TAXIDs
   my %taxid_tally = ();
   foreach my $query (keys %queries)
   {
      my $best_taxids = $queries{$query}{best_taxids};
      my $number_hits = scalar @$best_taxids;

      foreach my $taxid (@$best_taxids)
      {
         # "normalized" tally: a query with N tied best hits
         # contributes 1/N for each of them
         $taxid_tally{$taxid} += (1/$number_hits);
      }
   }
   
   # output filehandles that the FASTA record currently being read goes to
   my @current_fhs = ();
   
   warn "filtering FASTA file\n";
   # now iterate through FASTA file, determine best acc(s) 
   # for each sequence and corresponding TAXIDs
   # and output if appropriate
   while (my $current_line = <$fasta_fh>)
   {
      chomp $current_line;

      if ($current_line =~ /^>/)
      {
         # a header line starts a new record: work out where it goes
         @current_fhs = ();

         # taxids already handled for this record.  This is necessary
         # because some sequences hit multiple db sequences that map back
         # to the same TAXID (e.g. different isolates of the same virus)
         my %seen_taxids = ();
   
         my $read_id = $current_line;
         # strip > char
         $read_id =~ s/>//;
         # strip everything after first space
         # because this is not output by blastn
         # in -m8 format blast output
         if ($read_id =~ /(\S+)\s+/)
         {
            $read_id = $1;
         }
   
         # reads without any blast hit are simply not output
         my $query_info = $queries{$read_id};
         if ($query_info)
         {
            BEST_HIT_TAXID: foreach my $taxid (@{ $query_info->{best_taxids} })
            {
               # output candidates for THIS hit only: the hit's own taxid
               # and/or any requested higher-level taxid it descends from.
               # The list is rebuilt for every hit so that one hit cannot
               # mask the candidates of another tied hit.
               my @this_query_taxids = ();

               # are we outputing everything, or
               # is this one of the taxids we want to output
               if ( (!$output_taxid_subset) or $taxids_to_filter{$taxid} )
               {
                  push @this_query_taxids, $taxid;
               }

               # determine if this taxid is a descendent of a higher-level TAXID
               # we are going to output.  A hit without a taxid has no lineage.
               if ((scalar keys %higher_level_taxids) && ($taxid ne $unknown_taxid))
               {
                  my $parent_taxid = $taxid;
                  # walk up the tree; if an ancestor was requested, aggregate to that 
                  while ($parent_taxid = taxid_info::taxid_parent($ncbi_tax_db, $parent_taxid))
                  {
                     if ($higher_level_taxids{$parent_taxid})
                     {
                        # this is a descendent 
                        push @this_query_taxids, $parent_taxid;
                     }
                     # 1 is root node of taxonomy tree
                     last if ($parent_taxid == 1);
                  }
               }

               # check to see if any taxids to be output.
               next BEST_HIT_TAXID if (!scalar @this_query_taxids);
   
               # if we are only outputing TAXIDS with a certain number
               # of hits, make sure we are above cutoff
               if ($min_tally && ($taxid_tally{$taxid} <= $min_tally))
               {
                  next BEST_HIT_TAXID;
               }
   
               CANDIDATE_TAXID: foreach my $this_query_taxid (@this_query_taxids)
               {
                  # we've already accounted for this TAXID in this
                  # record.  Carry on with the next candidate.
                  next CANDIDATE_TAXID if ($seen_taxids{$this_query_taxid});

                  # determine kingdom (according to NCBI taxonomy db),
                  # looking each taxid up only once
                  if (!exists $taxid_to_kingdom_map{$this_query_taxid})
                  {
                     my $kingdom_from_db = undef;
                     if ($this_query_taxid ne $unknown_taxid)
                     {
                        my $kingdoms_hash_ref = taxid_to_kingdom::taxid_to_kingdom($ncbi_tax_db, $this_query_taxid);
                        $kingdom_from_db = $kingdoms_hash_ref->{$this_query_taxid};
                     }
                     $taxid_to_kingdom_map{$this_query_taxid} = $kingdom_from_db;
                  }
                  my $kingdom = $taxid_to_kingdom_map{$this_query_taxid};
   
                  # if only want virus hits, ignore non-virus hits
                  if ($virus_only and ((!defined $kingdom) or ($kingdom ne "V")))
                  {
                     next CANDIDATE_TAXID;
                  }

                  # try to get scientific name of the taxid from local hash
                  my $sci_name = $taxid_to_sci_name_map{$this_query_taxid};
                  if (! $sci_name)
                  {
                     if ($this_query_taxid ne $unknown_taxid)
                     {
                        # get name from database
                        my $sci_name_hash_ref = taxid_to_name::taxid_to_scientific_name($ncbi_tax_db, $this_query_taxid);
                        $sci_name = $sci_name_hash_ref->{$this_query_taxid};
                     }
                     if (! $sci_name)
                     {
                        # TODO: handle this better?
                        $sci_name = "no_name";
                     }
                     # need to remove awkward characters (space, /, parentheses)from name string
                     # only allow roman A-Z, a-z, 0-9
                     $sci_name =~ s/[^A-Za-z0-9]/_/g;
                     $taxid_to_sci_name_map{$this_query_taxid} = $sci_name;
                  }
   
                  $seen_taxids{$this_query_taxid} = 1;
   
                  # open the output file for this taxid on first use
                  my $fh = $taxid_fh_map{$this_query_taxid};
                  if (!$fh)
                  {
                     my $filename = $fasta_file."_".$this_query_taxid."_".$sci_name.".fa";
                     open ($fh, ">", $filename) or die ("error couldn't open filehandle for file $filename\n");
                     $taxid_fh_map{$this_query_taxid} = $fh;
                  }
                  push @current_fhs, $fh;
               }
            }
         }
      }
   
      # header and sequence lines alike go to every file selected for
      # the current record (nowhere if none was selected)
      foreach my $fh (@current_fhs)
      {
         print $fh "$current_line\n";
      }
   }
   close ($fasta_fh);

   # Output files are named after the current FASTA file, so the handles
   # must not be carried over to the next pair: close them and start afresh.
   foreach my $taxid (keys %taxid_fh_map)
   {
      close ($taxid_fh_map{$taxid}) or warn ("warning: error closing output file for taxid $taxid: $!\n");
   }
   %taxid_fh_map = ();

   # try to get a couple more records
   $fasta_file = shift;
   $blast_file = shift;
}

# close fhs
# These are write handles, so buffered output is flushed on close and a
# write failure (e.g. a full disk) only shows up here: report it rather
# than ignoring it.  Keep going so that every remaining handle is closed.
foreach my $taxid (keys %taxid_fh_map)
{
   my $fh = $taxid_fh_map{$taxid};
   close ($fh) or warn ("warning: error closing output file for taxid $taxid: $!\n");
}