Skip to content

Commit

Permalink
Merge pull request #72 from HRGV/dev
Browse files Browse the repository at this point in the history
Bumping master to tested Dev 3.3b1
  • Loading branch information
HRGV authored Jun 20, 2018
2 parents 0105de2 + 7073d78 commit 8a9d950
Show file tree
Hide file tree
Showing 19 changed files with 4,019 additions and 581 deletions.
172 changes: 125 additions & 47 deletions ENA_phyloFlash.pl
Original file line number Diff line number Diff line change
Expand Up @@ -6,20 +6,19 @@ =head1 NAME ENA_phyloFlash.pl - Download read files from ENA read archive and ru
=head1 SYNOPSIS
./ENA_phyloFlash.pl -acc [ENA ACCESSION] -phyloFlash -http_proxy="http://..."
ENA_phyloFlash.pl --acc [ENA ACCESSION] --phyloFlash --http_proxy="http://..."
./ENA_phyloFlash.pl -help
ENA_phyloFlash.pl --help
=head1 DESCRIPTION
Download Fastq files associated with an ENA Read Archive run accession number,
and run phyloFlash on those files if option -phyloFlash is chosen.
and run phyloFlash on those files if option I<--phyloFlash> is chosen.
Requires phyloFlash v3.0+
Requires phyloFlash v3.0+ and wget
=cut

use 5.010; # For the 'say' command
use strict;
use warnings;

Expand All @@ -30,12 +29,15 @@ =head1 DESCRIPTION
use Getopt::Long;
use POSIX;
use Pod::Usage;
use PhyloFlash qw(msg err @msg_log $VERSION);

# Input params
my $acc;
my $CPUs = 8; # Num processors - for nhmmer
my $download = 1; # By default, download
my $run_pF = 0; # By default, do not run phyloFlash
my $download_dir = '.';
my $cleanup;
my $run_pF; # By default, do not run phyloFlash
my $pF_dbhome;
my $http_proxy;

Expand All @@ -44,107 +46,183 @@ =head1 DESCRIPTION
# Get input from command line
GetOptions ("acc=s" => \$acc,
"CPUs=i" => \$CPUs,
"download!" => \$download,
"phyloFlash!" => \$run_pF,
"download|d!" => \$download,
"download_dir=s" => \$download_dir,
"cleanup" => \$cleanup,
"p|phyloFlash:s" => \$run_pF, # Colon means arguments are optional
"dbhome=s" => \$pF_dbhome,
"http_proxy=s" => \$http_proxy,
'help' => sub { pod2usage(1) },
'man' => sub { pod2usage(-exitval=>0, -verbose=>2) },
'help|h' => sub { pod2usage(1) },
'man' => sub { pod2usage(-exit=>0, -verbose=>2) },
'version|v' => sub { welcome(); exit(); },
) or pod2usage(2);

=head1 INPUT ARGUMENTS
=head1 ARGUMENTS
=over 8
=item -acc STRING
=item --acc STRING
Accession number of read run
Accession number of read run. Please ensure that this is the I<Run> accession
and not e.g. sample or study accession number. They are typically prefixed by
ERR..., SRR..., or DRR....
=item -CPUs INTEGER
=item --CPUs I<INTEGER>
Number of processors to use (passed to phyloFlash) (Default: 8)
Number of processors to use (passed to phyloFlash)
=item -download
Default: 8
Download files? Turn off with -nodownload (Default: Yes)
=item -d|--download
=item -phyloFlash
Download files? Turn off with I<--nodownload>
Run phyloFlash? (Default: No, download only)
Default: Yes
=item -dbhome PATH
=item --download_dir I<PATH>
Path to download files.
Default: Current folder
=item --cleanup
Delete downloaded read files
Default: No
=item -p|--phyloFlash="I<ARGS>"
Run phyloFlash? If no additional arguments are supplied, run phyloFlash with
the I<--almosteverything> option. Otherwise the argument string is simply passed
to phyloFlash.
Default: No, download only
=item --dbhome I<PATH>
Path to phyloFlash database, otherwise use default, which is to look in folder
where phyloFlash script is located
=item -http_proxy URL
=item --http_proxy I<URL>
URL for http proxy - be sure to specify the protocol with "http://..."
(Default: none)
=item -help
Default: none
=item --help
Show help message
=item -man
=item --man
Show manual page in pager
=item --version
Print version number
=back
=cut

# Variables
my @fastq_urls; # List of Fastq URLs
my @fastq_basenames;
my @fastq_fullpaths;
my $phyloFlash = "$FindBin::RealBin/phyloFlash.pl"; # assume phyloFlash script in same folder
my $wget = "wget --no-verbose"; # Wget binary and options

## MAIN ########################################################################

welcome();

# A run accession is mandatory: without it the ENA REST query cannot be built.
# Show the usage message and exit instead of querying with an empty accession.
pod2usage(-message => "Please specify a run accession with --acc", -exitval => 2)
    unless defined $acc;

# Ask ENA for the Fastq URL(s) associated with this run accession
@fastq_urls = get_fastq_urls($acc);

# Report what was found (nothing is printed when the list is empty)
if (@fastq_urls) {
    msg ("Found the following Fastq files associated with accession $acc");
    msg (join ("\n", @fastq_urls));
}

# Check download path to make sure it is a directory and is writable.
# On success the directory is appended to the wget command as its -P option.
# NOTE(review): err() comes from the PhyloFlash module; presumably it aborts
# the script - confirm.
if (defined $download_dir) {
    if (! -d $download_dir) {
        err ("Path to place downloaded files $download_dir is not a valid directory");
    } elsif (! -w $download_dir) {
        err ("You do not have write permissions to path $download_dir");
    } else {
        msg ("Downloading read files to folder $download_dir");
        # The wget command is later run through the shell as a single string,
        # so single-quote the path (escaping embedded quotes) to keep spaces
        # or shell metacharacters in the directory name from splitting or
        # altering the command line
        (my $quoted_dir = $download_dir) =~ s/'/'\\''/g;
        $wget .= " -P '$quoted_dir'"; # Add path option to wget command
    }
}

# Download each Fastq file (unless downloading was switched off) and record
# its base name. Nothing is done if no URLs were retrieved.
if (defined $fastq_urls[0]) {
    foreach my $fastq (@fastq_urls) {
        if ($download == 1) {
            msg ("Running wget with command: $wget $fastq");
            # Report a failed download instead of silently carrying on
            system (join " ", ($wget, $fastq)) == 0
                or msg ("wget exited with status ".($? >> 8)." for $fastq");
        } else {
            msg ("Skipping download, assuming that files are already present at path $download_dir");
        }

        # Strip URL dirs from filename, save to an array.
        # The match is assigned in list context rather than written as
        # "my $x = $1 if COND", whose behaviour is undefined when the
        # condition is false; a URL without a file name is skipped.
        my ($filename) = $fastq =~ m/([^\/]+)$/;
        push @fastq_basenames, $filename if defined $filename;
    }
}

# Run phyloFlash (v3.0beta1) to extract SSU reads
my @pF_args = ("-lib pF_$acc",
"-CPUs $CPUs",
"-emirge", # Run both SPAdes and EMIRGE
"-zip", # Output to archive
"-log",
);

# If specific dbhome specified, pass to phyloFlash, otherwise use default detected
push @pF_args, "-dbhome $pF_dbhome" if defined $pF_dbhome;

# Check how many Fastq files (paired or unpaired)
if (scalar (@fastq_basenames) == 1) {
push @pF_args, "-read1 ".$fastq_basenames[0];
} elsif (scalar (@fastq_basenames) == 2) {
push @pF_args, "-read1 ".$fastq_basenames[0];
push @pF_args, "-read2 ".$fastq_basenames[1];
# Add paths to Fastq file names
foreach my $file (@fastq_basenames) {
push @fastq_fullpaths, "$download_dir/$file";
}

system (join " ", ($phyloFlash, @pF_args)) if $run_pF == 1;
# Run phyloFlash on the read files when -p/--phyloFlash was given.
# $run_pF is an empty string when the option was supplied without a value,
# otherwise it holds the user's argument string for phyloFlash.
if (defined $run_pF) {
    # Run phyloFlash (v3.0beta1+) to extract SSU reads
    my @pF_args = ("-lib pF_$acc",
                   "-CPUs $CPUs",
                   );

    if ($run_pF eq '') { # If defined but empty string
        # Default if running phyloFlash
        push @pF_args, '--almosteverything';
    } else {
        # User-specified arguments to pass to phyloFlash
        push @pF_args, $run_pF;
    }

    # If specific dbhome specified, pass to phyloFlash, otherwise use default detected
    push @pF_args, "-dbhome $pF_dbhome" if defined $pF_dbhome;

    # Check how many Fastq files (paired or unpaired). Both branches test the
    # same array that supplies the paths passed to phyloFlash.
    if (scalar (@fastq_fullpaths) == 1) {
        push @pF_args, "-read1 ".$fastq_fullpaths[0];
    } elsif (scalar (@fastq_fullpaths) == 2) {
        push @pF_args, "-read1 ".$fastq_fullpaths[0];
        push @pF_args, "-read2 ".$fastq_fullpaths[1];
    }

    # Build the command once so that the logged command is exactly what runs
    my $pF_cmd = join " ", ($phyloFlash, @pF_args);
    msg ("Running phyloFlash with command: $pF_cmd");
    # Report a failed run instead of silently ignoring the exit status
    system ($pF_cmd) == 0
        or msg ("phyloFlash exited with status ".($? >> 8));
}

# Delete the read files when --cleanup was requested
if (defined $cleanup) {
    msg ("Deleting read files");
    # unlink returns the number of files it actually removed; report any
    # shortfall rather than ignoring it
    my $num_deleted = unlink @fastq_fullpaths;
    if ($num_deleted != scalar (@fastq_fullpaths)) {
        msg ("Could only delete $num_deleted of ".scalar (@fastq_fullpaths)." read files");
    }
}

msg ("Thank you for using ENA_phyloFlash.pl"); # Be nice

## SUBS ########################################################################

sub welcome {
    # Print a one-line banner to STDERR naming this script and the
    # phyloFlash version ($VERSION is imported from the PhyloFlash module)
    my $banner = "This is ENA_phyloFlash.pl from phyloFlash v$VERSION";
    printf STDERR "%s\n", $banner;
}

sub get_fastq_urls {
# Get the URL(s) of ENA Fastq file(s) for a given ENA entry
Expand All @@ -154,7 +232,7 @@ sub get_fastq_urls {
my @urls_arr; # Output array containing URLs
# Get report table using ENA REST query
my $url = "http://www.ebi.ac.uk/ena/data/warehouse/filereport?accession=$acc&result=read_run&fields=run_accession,fastq_ftp,fastq_md5,fastq_bytes";
say STDERR $url;
msg ("Sending REST query to ENA at URL: $url");

my $ua = LWP::UserAgent->new;
if (defined $http_proxy) {
Expand All @@ -166,7 +244,7 @@ sub get_fastq_urls {
}

my $response = $ua->get($url);
die "Cannot get $url: ", $response->status_line unless $response->is_success;
err ("Cannot get $url: ". $response->status_line) unless $response->is_success;
my $tab = $response->content();

foreach my $line (split /\n/, $tab) {
Expand Down
Loading

0 comments on commit 8a9d950

Please sign in to comment.