diff --git a/Changes b/Changes index d2f3d7ae2..a8f209dce 100644 --- a/Changes +++ b/Changes @@ -1,6 +1,14 @@ LIST OF CHANGES --------------- +release 68.4.0 (2024-08-06) + - Ensured mark duplicate method can be inferred for a product with multiple + studies (tag zero). + - Upgrading tests + - Use contemporary run folders for tests (NovaSeqX) + - Clean fixtures + - Prevent tests from accessing live databases (reset HOME) + release 68.3.0 (2024-05-24) - Removing Tidyp dependency from CI - Added 'SampleSheet.csv' file from the top level of the run folder to diff --git a/MANIFEST b/MANIFEST index 7b1305230..c1334cdbc 100644 --- a/MANIFEST +++ b/MANIFEST @@ -124,10 +124,8 @@ t/50-npg_pipeline-daemon-archival.t t/bin/bkill t/bin/bresume t/bin/bsub -t/data/bam_flagstats/1234_1_bam_flagstats.json -t/data/bam_flagstats/1234_3_bam_flagstats.json -t/data/bam_flagstats/1234_3_phix_bam_flagstats.json -t/data/bam_flagstats/1234_4_bam_flagstats.json +t/data/bam_flagstats/47995_3_bam_flagstats.json +t/data/bam_flagstats/47995_3_phix_bam_flagstats.json t/data/barcodes/samplesheet_batch2015.csv t/data/barcodes/samplesheet_batch42225.csv t/data/barcodes/samplesheet_batch42225_amended1.csv @@ -229,6 +227,7 @@ t/data/miseq/16850_runParameters.xml t/data/miseq/16866_RunInfo.xml t/data/miseq/20990_RunInfo.xml t/data/miseq/24135_RunInfo.xml +t/data/miseq/24347_RunInfo.xml t/data/miseq/samplesheet_16850.csv t/data/miseq/samplesheet_16866.csv t/data/miseq/samplesheet_20990.csv @@ -1011,7 +1010,7 @@ t/data/novaseqx/47539/samplesheet_47539.csv t/data/novaseqx/20231017_LH00210_0012_B22FCNFLT3/RunInfo.xml t/data/novaseqx/20231017_LH00210_0012_B22FCNFLT3/RunParameters.xml t/data/novaseqx/20231017_LH00210_0012_B22FCNFLT3/samplesheet_47995.csv -t/data/p4_stage1_analysis/1234_samplesheet.csv +t/data/p4_stage1_analysis/samplesheet.csv t/data/p4_stage1_analysis/TileMetricsOut.bin t/data/portable_pipelines/ncov2019-artic-nf/cf01166c42a/product_release.yml 
t/data/portable_pipelines/ncov2019-artic-nf/cf01166c42a/product_release_no_pp.yml @@ -1061,17 +1060,6 @@ t/data/run_params/runParameters.hiseqx.upgraded.xml t/data/run_params/runParameters.miseq.xml t/data/run_params/runParameters.novaseq.xml t/data/run_params/RunParameters.novaseqx.xml -t/data/runfolder/archive/1234_2.bam -t/data/runfolder/archive/1234_2_human.bam -t/data/runfolder/archive/1234_3.bam -t/data/runfolder/archive/lane1/1234_1#15.bam -t/data/runfolder/archive/lane1/1234_1#15.cram -t/data/runfolder/archive/lane1/1234_1#15.seqchksum -t/data/runfolder/archive/lane1/1234_1#15_human.bam -t/data/runfolder/archive/lane1/1234_1#15_phix.bam -t/data/runfolder/archive/lane4/1234_4#16.bam -t/data/runfolder/archive/lane4/1234_4#32.bam -t/data/runfolder/Data/RunInfo.xml t/data/samplesheet_1234.csv t/data/samplesheet_8747.csv t/data/samplesheet_33990.csv diff --git a/README.md b/README.md index bb29f5db1..d3ec3a7bb 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,8 @@ # NPG Pipelines for Processing Illumina Sequencing Data This software provides the Sanger NPG team's automation for analysing and -internally archiving Illumina sequencing on behalf of DNA Pipelines for their -customers. +internally archiving Illumina sequencing data on behalf of DNA Pipelines for +their customers. There are two main pipelines: @@ -18,16 +18,16 @@ sequencing flowcell, or each tagged library (within a pool on the flowcell). ## Batch Processing and Dependency Tracking with LSF or wr With this system, all of a pipeline's jobs for its steps are submitted for -execution to the LSF, or wr, batch/job processing system as the pipeline is +execution to `LSF` or `wr` batch/job processing system as the pipeline is initialised. As such, a _submitted_ pipeline does not have an orchestration script or daemon running: managing the runtime dependencies of jobs within an instance of a pipeline is delegated to the batch/job processing system. How is this done? 
The job representing the start point of a graph is submitted -to LSF, or wr, in a suspended state and is resumed once all other jobs have been +to `LSF` or `wr` in a suspended state and is resumed once all other jobs have been submitted thus ensuring that the execution starts only if all steps are -successfully submitted to LSF, or wr. If an error occurs at any point during job -submissions, all submitted jobs, apart from the start job, are killed. +successfully submitted. If an error occurs at any point during job submissions, +all submitted jobs, apart from the start job, are killed. ## Pipeline Creation @@ -84,8 +84,8 @@ The input for an instance of the pipeline is the instrument output run folder (BCL and associated files) and LIMS information which drives appropriate processing. -The key data products are aligned CRAM files and indexes, or unaligned CRAM -files. However per study (a LIMS datum) pipeline configuration allows for the +The key data products are aligned or unaligned CRAM files and indexes. +However per study (a LIMS datum) pipeline configuration allows for the creation of GATK gVCF files, or the running for external tool/pipeline e.g. ncov2012-artic-nf @@ -135,3 +135,28 @@ flow DAGs. Also, the [npg_irods](https://github.com/wtsi-npg/npg_irods) system is essential for the internal archival of data products. + +## Data Merging across Lanes of a Flowcell + +If the same library is sequenced in different lanes of a flowcell, under certain +conditions the pipeline will automatically merge all data for a library into +a single end product. Spiked-in PhiX libraries data and unassigned to any tags +data (tag zero) are not merged. The following scenarios trigger the merge: + +* NovaSeq Standard flowcell - a merge across all two or four lanes is performed. + +* Any flowcell run on a NovaSeqX instrument - if multiple lanes belong to the + same pool, the data from individual libraries will be merged across those + lanes. 
Thus the output of a NovaSeqX run might contain a mixture of merged and + unmerged products. + +If the data quality in a lane is poor, the lane should be excluded from the merge. +The `--process_separately_lanes` pipeline option is used to list lanes like this. +Usually this option is used when running the analysis pipeline. The pipeline caches +the supplied lane numbers so that the archival pipeline can generate a consistent +with the analysis pipeline list of data products. The same relates to the +`npg_run_is_deletable` script. The cached value is retrieved only if the +`--process_separately_lanes` argument was not set when any of these scripts are +invoked. + + diff --git a/lib/npg_pipeline/base.pm b/lib/npg_pipeline/base.pm index cd2841a4b..a9aae7848 100644 --- a/lib/npg_pipeline/base.pm +++ b/lib/npg_pipeline/base.pm @@ -7,7 +7,10 @@ use POSIX qw(strftime); use Math::Random::Secure qw{irand}; use List::MoreUtils qw{any uniq}; use File::Basename; +use JSON; +use Perl6::Slurp; use Readonly; +use Try::Tiny; use npg_tracking::glossary::rpt; use npg_tracking::glossary::composition::factory::rpt_list; @@ -23,6 +26,7 @@ with qw{ WTSI::DNAP::Utilities::Loggable npg_tracking::util::pipeline_config npg_pipeline::base::options + npg_pipeline::runfolder_scaffold }; Readonly::Array my @NO_SCRIPT_ARG_ATTRS => qw/ @@ -234,12 +238,13 @@ sub _build_merge_by_library { =head2 process_separately_lanes -An array of lane (position) numbers, which should not be merged with anyother +An array of lane (position) numbers, which should not be merged with any other lanes. To be used in conjunction with C or C -attributes. Does not have any impact if both of these attributes are false. +attributes. A consistency check is triggered when the value is set in order +to prevent this setting to be cached if no merge is intended. Defaults to an empty array value, meaning that all possible entities will be -merged. +merged. 
=cut @@ -247,9 +252,18 @@ has q{process_separately_lanes} => ( isa => q{ArrayRef}, is => q{ro}, default => sub { return []; }, + trigger => \&_validate_process_separately_lanes, documentation => q{Array of lane numbers, which have to be excluded from } . q{a merge}, ); +sub _validate_process_separately_lanes { + my ($self, $new_value) = @_; + if (!$self->merge_lanes && !$self->merge_by_library && (@{$new_value} != 0)) { + $self->logcroak('One of merge options should be enabled if ' . + 'process_separately_lanes is set'); + } + return; +} =head2 lims @@ -348,6 +362,21 @@ zero products, hashed under the 'data_products' key. If product_rpt_list attribute is set, the 'lanes' key maps to an empty array. +While computing the lists of data products, we examine whether data in any +of the lanes can be merged across lanes. Some of the lanes might be explicitly +excluded from the merge by setting the `process_separately_lanes` attribute +from the command line. This is likely to be done when the analysis pipeline +is run manually. Then the same lanes have to be excluded from the merge by +the archival pipeline and by the script that evaluates whether the run folder +can be deleted. To enable this, the value of the `process_separately_lanes` +attribute is saved to the metadata_cache_ directory immediately after +the pipeline establishes the location of the samplesheet file or generates a +new samplesheet. + +This method looks at the `process_separately_lanes` attribute first. If the +`process_separately_lanes` array is empty, an attempt to retrieve the cached +value is made. 
+ =cut has q{products} => ( @@ -373,9 +402,14 @@ sub _build_products { } if ($self->merge_lanes || $self->merge_by_library) { + my $attr_name = 'process_separately_lanes'; + my $separate_lanes = $self->$attr_name; + if (@{$separate_lanes} == 0) { + $separate_lanes = $self->_cached_process_separately_lanes($attr_name); + } my $all_lims = $self->lims->aggregate_libraries( - \@lane_lims, $self->process_separately_lanes); + \@lane_lims, $separate_lanes); @data_lims = @{$all_lims->{'singles'}}; # Might be empty. # merge_lanes option implies a merge across all lanes. @@ -483,6 +517,27 @@ sub _check_lane_merge_is_viable { return 1; } +sub _cached_process_separately_lanes { + my ($self, $key) = @_; + $key or $self->logcroak('Key should be defined'); + + my $path = $self->analysis_options_file_path(); + if (-f $path) { + my $options; + try { + $options = decode_json(slurp($path)); + } catch { + $self->logcroak("Error reading or parsing ${path} : $_"); + }; + if ($options->{$key}) { + $self->info("Found $key analysis option in $path: " . + join q[, ], @{$options->{$key}}); + return $options->{$key}; + } + } + return []; +} + __PACKAGE__->meta->make_immutable; 1; @@ -511,6 +566,8 @@ __END__ =item File::Basename +=item JSON + =item Readonly =item npg_tracking::glossary::rpt @@ -538,7 +595,7 @@ Marina Gourtovaia =head1 LICENSE AND COPYRIGHT -Copyright (C) 2014,2015,2016,2017,2018,2019,2020,2023 Genome Research Ltd. +Copyright (C) 2014,2015,2016,2017,2018,2019,2020,2023,2024 Genome Research Ltd. 
This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by diff --git a/lib/npg_pipeline/pluggable.pm b/lib/npg_pipeline/pluggable.pm index bbb88915b..6d4ebd3e1 100644 --- a/lib/npg_pipeline/pluggable.pm +++ b/lib/npg_pipeline/pluggable.pm @@ -22,8 +22,7 @@ use npg_pipeline::pluggable::registry; extends q{npg_pipeline::base}; with qw{ MooseX::AttributeCloner - npg_pipeline::executor::options - npg_pipeline::runfolder_scaffold }; + npg_pipeline::executor::options }; our $VERSION = '0'; @@ -883,7 +882,7 @@ __END__ =head1 LICENSE AND COPYRIGHT -Copyright (C) 2014,2015,2016,2017,2018,2019,2020,2021 Genome Research Ltd. +Copyright (C) 2014,2015,2016,2017,2018,2019,2020,2021,2024 Genome Research Ltd. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by diff --git a/lib/npg_pipeline/pluggable/central.pm b/lib/npg_pipeline/pluggable/central.pm index 5945502fa..ad5d512ba 100644 --- a/lib/npg_pipeline/pluggable/central.pm +++ b/lib/npg_pipeline/pluggable/central.pm @@ -3,6 +3,8 @@ package npg_pipeline::pluggable::central; use Moose; use MooseX::StrictConstructor; use namespace::autoclean; +use JSON; +use File::Slurp qw(read_file write_file); extends 'npg_pipeline::pluggable'; @@ -29,13 +31,17 @@ Pipeline runner for the analysis pipeline. Inherits from parent's method. Sets all paths needed during the lifetime of the analysis runfolder. Creates any of the paths that do not exist. +Saves lane numbers given by the `process_separately_lanes` option to a +JSON file. + =cut override 'prepare' => sub { my $self = shift; $self->_scaffold('create_top_level'); - super(); # Corect order + super(); # Correct order, sets up a samplesheet. 
+ $self->_save_merge_options(); $self->_scaffold('create_product_level'); return; @@ -56,6 +62,40 @@ sub _scaffold { return; } +sub _save_merge_options { + my $self = shift; + + my $attr_name = 'process_separately_lanes'; + my @given_lanes = sort {$a <=> $b} @{$self->$attr_name}; + if (@given_lanes) { + my $cached_options = {}; + my $found = 0; + my $path = $self->analysis_options_file_path(); + if (-f $path) { + $cached_options = decode_json(read_file($path)); + if ($cached_options->{$attr_name} && @{$cached_options->{$attr_name}}) { + my $sep = q[, ]; + my $cached_lanes = join $sep, @{$cached_options->{$attr_name}}; + $self->info("Found cached merge options in $path: " . + "lanes $cached_lanes should not be merged."); + if ($cached_lanes ne join $sep, @given_lanes) { + $self->logcroak('Lane list from process_separately_lanes attribute ' . + 'is inconsistent with cached value'); + } + $found = 1; + } + } + + if (!$found) { + $cached_options->{$attr_name} = \@given_lanes; + write_file($path, encode_json($cached_options)) or + $self->logcroak("Failed to write to $path"); + } + } + + return; +} + __PACKAGE__->meta->make_immutable; 1; @@ -76,6 +116,10 @@ __END__ =item namespace::autoclean +=item JSON + +=item File::Slurp + =back =head1 INCOMPATIBILITIES @@ -89,7 +133,7 @@ Marina Gourtovaia =head1 LICENSE AND COPYRIGHT -Copyright (C) 2018 Genome Research Limited +Copyright (C) 2018,2024 Genome Research Ltd. 
This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by diff --git a/lib/npg_pipeline/product/release.pm b/lib/npg_pipeline/product/release.pm index e240921bd..2d5a71ca2 100644 --- a/lib/npg_pipeline/product/release.pm +++ b/lib/npg_pipeline/product/release.pm @@ -6,6 +6,7 @@ use Data::Dump qw{pp}; use Moose::Role; use List::Util qw{all any}; use Readonly; +use Try::Tiny; with qw{WTSI::DNAP::Utilities::Loggable npg_tracking::util::pipeline_config}; @@ -300,7 +301,7 @@ sub bwakit_enable { Arg [1] : npg_pipeline::product Example : $obj->markdup_method($product); - Description: Return mark duplicate method, + Description: Returns mark duplicate method, the value might be undefined. Returntype : Str @@ -309,7 +310,22 @@ sub bwakit_enable { sub markdup_method { my ($self, $product) = @_; - return $self->find_study_config($product)->{markdup_method}; + + my $config; + try { + $config = $self->find_study_config($product); + } catch { + my $error = $_; + if ($error =~ /Multiple[ ]study[ ]ids/xms) { + $self->logwarn($error); + $self->logwarn('Falling back to the default section of the product config'); + $config = $self->default_study_config(); + } else { + $self->logcroak($error); + } + }; + + return defined $config ? $config->{markdup_method} : undef; } =head2 staging_deletion_delay @@ -412,6 +428,8 @@ study: =item Readonly +=item Try::Tiny + =item WTSI::DNAP::Utilities::Loggable =item npg_tracking::util::pipeline_config @@ -430,7 +448,7 @@ study: =head1 LICENSE AND COPYRIGHT -Copyright (C) 2018,2019,2020,2021,2022 Genome Research Ltd. +Copyright (C) 2018,2019,2020,2021,2022,2024 Genome Research Ltd. 
This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by diff --git a/lib/npg_pipeline/runfolder_scaffold.pm b/lib/npg_pipeline/runfolder_scaffold.pm index d6bd451c5..710576cf0 100644 --- a/lib/npg_pipeline/runfolder_scaffold.pm +++ b/lib/npg_pipeline/runfolder_scaffold.pm @@ -15,6 +15,7 @@ Readonly::Scalar my $ANALYSIS_PATH_COMPONENT => q[/analysis/]; Readonly::Scalar my $LOG_DIR_NAME => q[log]; Readonly::Scalar my $STATUS_FILES_DIR_NAME => q[status]; Readonly::Scalar my $METADATA_CACHE_DIR_NAME => q[metadata_cache_]; +Readonly::Scalar my $ANALYSIS_OPTIONS_FILE_NAME => q[analysis_options.json]; Readonly::Scalar my $TILEVIZ_INDEX_DIR_NAME => q[tileviz]; Readonly::Scalar my $TILEVIZ_INDEX_FILE_NAME => q[index.html]; Readonly::Scalar my $IRODS_PUBLISHER_RSART_DIR_NAME => q[irods_publisher_restart_files]; @@ -149,6 +150,11 @@ sub metadata_cache_dir_path { return catdir($apath, $METADATA_CACHE_DIR_NAME . $self->id_run()); } +sub analysis_options_file_path { + my $self = shift; + return catfile($self->metadata_cache_dir_path, $ANALYSIS_OPTIONS_FILE_NAME); +} + sub irods_publisher_rstart_dir_path { my $self = shift; my $apath = $self->analysis_path; @@ -303,6 +309,11 @@ is empty. Can be called both as an instance and a class method. =head2 metadata_cache_dir_path +=head2 analysis_options_file_path + +A full path for a JSON file, which captures lane numbers given by the +C pipeline attribute and other analysis options. + =head2 irods_publisher_rstart_dir_path =head2 irods_locations_dir_path @@ -355,7 +366,7 @@ Given a path in analysis directory changes it to outgoing directory. =head1 LICENSE AND COPYRIGHT -Copyright (C) 2018,2019,2020,2022 Genome Research Ltd. +Copyright (C) 2018,2019,2020,2022,2024 Genome Research Ltd. 
This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by diff --git a/t/10-base.t b/t/10-base.t index 5039459de..3431cf80d 100644 --- a/t/10-base.t +++ b/t/10-base.t @@ -1,20 +1,24 @@ use strict; use warnings; -use Test::More tests => 8; +use Test::More tests => 10; use Test::Exception; use File::Temp qw(tempdir tempfile); use Cwd qw(getcwd abs_path); use Log::Log4perl qw(:levels); use Moose::Util qw(apply_all_roles); use File::Copy qw(cp); +use File::Slurp qw(read_file write_file); +use JSON; use t::util; my $util = t::util->new(); +my $temp_dir = $util->temp_directory(); +my $log_file = join q[/], $temp_dir, 'logfile'; Log::Log4perl->easy_init({layout => '%d %-5p %c - %m%n', level => $DEBUG, - file => join(q[/], $util->temp_directory(), 'logfile'), + file => $log_file, utf8 => 1}); my $cwd = abs_path(getcwd()); @@ -22,6 +26,11 @@ my $config_dir = $cwd . '/data/config_files'; use_ok(q{npg_pipeline::base}); +sub _generate_rpt { + my ($id_run, $lanes, $tag_index) = @_; + return join q[;], map { join q[:], $id_run, $_, $tag_index } @{$lanes}; +} + subtest 'local flag' => sub { plan tests => 3; @@ -63,6 +72,36 @@ subtest 'repository preexec' => sub { q{correct ref_adapter_pre_exec_string} ); }; +subtest 'label' => sub { + plan tests => 4; + + my $base = npg_pipeline::base->new(id_run => 22); + is ($base->label, '22', 'label defaults to run id'); + $base = npg_pipeline::base->new(id_run => 22, label => '33'); + is ($base->label, '33', 'label as set'); + $base = npg_pipeline::base->new(product_rpt_list => '22:1:33'); + throws_ok { $base->label } + qr/cannot build 'label' attribute, it should be pre-set/, + 'error if label is not preset'; + $base = npg_pipeline::base->new(product_rpt_list => '22:1:33', label => '33'); + is ($base->label, '33', 'label as set'); +}; + +subtest 'error on incompatible merge attributes' => sub { + plan tests => 1; + my $error = 'One of merge options should be 
enabled if ' . + 'process_separately_lanes is set'; + throws_ok { + $b = npg_pipeline::base->new( + runfolder_path => $temp_dir, + id_run => 999, + merge_lanes => 0, + merge_by_library => 0, + process_separately_lanes => [3,8] + ) + } qr/$error/, $error; +}; + subtest 'products - merging (or not) lanes' => sub { plan tests => 22; @@ -77,7 +116,11 @@ subtest 'products - merging (or not) lanes' => sub { local $ENV{NPG_CACHED_SAMPLESHEET_FILE} = 't/data/products/samplesheet_novaseq4lanes.csv'; cp 't/data/run_params/runParameters.novaseq.xml', "$rf_path/runParameters.xml"; cp 't/data/novaseq/210111_A00513_0447_AHJ55JDSXY/RunInfo.xml', "$rf_path/RunInfo.xml"; - $b = npg_pipeline::base->new(runfolder_path => $rf_path, id_run => 999); + $b = npg_pipeline::base->new( + runfolder_path => $rf_path, + analysis_path => $temp_dir, + id_run => 47995 + ); ok ($b->merge_lanes, 'merge_lanes flag is set'); ok (!$b->_selected_lanes, 'selected_lanes flag is not set'); lives_ok {$products = $b->products} 'products hash created for NovaSeq run'; @@ -120,7 +163,7 @@ subtest 'products - merging (or not) lanes' => sub { }; subtest 'products - merging (or not) libraries' => sub { - plan tests => 423; + plan tests => 421; my $rf_info = $util->create_runfolder(); my $rf_path = $rf_info->{'runfolder_path'}; @@ -136,7 +179,11 @@ subtest 'products - merging (or not) libraries' => sub { # lanes 3, 4 - 10 samples # lanes 5, 6 - 22 samples # lanes 7, 8 - 38 samples - $b = npg_pipeline::base->new(runfolder_path => $rf_path, id_run => $id_run); + $b = npg_pipeline::base->new( + runfolder_path => $rf_path, + analysis_path => $temp_dir, + id_run => $id_run + ); ok($b->merge_by_library, 'merge by library is true for NovaSeqX'); my @lane_products = @{$b->products()->{'lanes'}}; is (@lane_products, 8, 'eight lane products'); @@ -202,7 +249,11 @@ subtest 'products - merging (or not) libraries' => sub { # Expect lanes 3 and 4 merged. 
$b = npg_pipeline::base->new( - runfolder_path => $rf_path, id_run => $id_run, lanes => [4,8,3]); + runfolder_path => $rf_path, + analysis_path => $temp_dir, + id_run => $id_run, + lanes => [4,8,3] + ); ok($b->merge_by_library, 'merge by library is true for NovaSeqX'); @lane_products = @{$b->products()->{'lanes'}}; @@ -274,37 +325,120 @@ subtest 'products - merging (or not) libraries' => sub { ); @products = @{$b->products()->{'data_products'}}; is (@products, 64, 'number of data products is 64'); +}; + +subtest 'products - retrieve cached merge options' => sub { + plan tests => 13; - $b = npg_pipeline::base->new( - runfolder_path => $rf_path, + my $id_run = 47995; + my $rf_name = q[20231017_LH00210_0012_B22FCNFLT3]; + my $rf_path_test = qq[t/data/novaseqx/$rf_name]; + local $ENV{NPG_CACHED_SAMPLESHEET_FILE} = + qq[$rf_path_test/samplesheet_${id_run}.csv]; + + my $rf_info = $util->create_runfolder($temp_dir, {'runfolder_name' => $rf_name, + 'analysis_path' => 'BAM_basecalls_20240514-111105'}); + my $runfolder_path = $rf_info->{'runfolder_path'}; + for my $name (qw(RunInfo.xml RunParameters.xml)) { + cp("$rf_path_test/$name", "$runfolder_path/$name") or die "Failed to copy $name"; + } + my $bam_basecall_path = $rf_info->{'basecall_path'}; + my $metadata_cache_path = "$bam_basecall_path/metadata_cache_$id_run"; + mkdir $metadata_cache_path; + + # All lanes are spiked. Possible merges: + # lanes 1, 2 - 17 samples + # lanes 3, 4 - 10 samples + # lanes 5, 6 - 22 samples + # lanes 7, 8 - 38 samples + + # Establish the base cases - an inevitable repetition of some of the tests above. 
+ # No merge + my $num_unmerged = 190; + my $b = npg_pipeline::base->new( + runfolder_path => $runfolder_path, + bam_basecall_path => $bam_basecall_path, id_run => $id_run, - lanes => [4,8,3], + merge_lanes => 0, merge_by_library => 0, - process_separately_lanes => [3,8] ); - lives_ok { @products = @{$b->products()->{'data_products'}} } - 'process_separately_lanes is compatible with suppressed merge'; - is (@products, 64, 'number of data products is 64'); -}; + my @products = @{$b->products()->{'data_products'}}; + is (@products, $num_unmerged, "number of unmerged data products is $num_unmerged"); -sub _generate_rpt { - my ($id_run, $lanes, $tag_index) = @_; - return join q[;], map { join q[:], $id_run, $_, $tag_index } @{$lanes}; -} + # Full merge by library + my $num_merged = 103; # Including unmerged spiked-in PhiX and tag0 + $b = npg_pipeline::base->new( + runfolder_path => $runfolder_path, + bam_basecall_path => $bam_basecall_path, + id_run => $id_run, + ); + ok ($b->merge_by_library, 'merge by library is enabled for NovaSeqX'); + @products = @{$b->products()->{'data_products'}}; + is (@products, $num_merged, "number of data products is $num_merged"); + # End of the base case -subtest 'label' => sub { - plan tests => 4; + # Suppress the merge explicitly + $b = npg_pipeline::base->new( + runfolder_path => $runfolder_path, + bam_basecall_path => $bam_basecall_path, + id_run => $id_run, + process_separately_lanes => [(1 .. 
8)], + ); + ok ($b->merge_by_library, 'merge by library is enabled for NovaSeqX'); + @products = @{$b->products()->{'data_products'}}; + is (@products, $num_unmerged, "number of data products is $num_unmerged"); - my $base = npg_pipeline::base->new(id_run => 22); - is ($base->label, '22', 'label defaults to run id'); - $base = npg_pipeline::base->new(id_run => 22, label => '33'); - is ($base->label, '33', 'label as set'); - $base = npg_pipeline::base->new(product_rpt_list => '22:1:33'); - throws_ok { $base->label } - qr/cannot build 'label' attribute, it should be pre-set/, - 'error if label is not preset'; - $base = npg_pipeline::base->new(product_rpt_list => '22:1:33', label => '33'); - is ($base->label, '33', 'label as set'); + my $cached_options_file = "$metadata_cache_path/analysis_options.json"; + write_file($cached_options_file, 'no JSON'); + $b = npg_pipeline::base->new( + runfolder_path => $runfolder_path, + bam_basecall_path => $bam_basecall_path, + id_run => $id_run, + ); + throws_ok { $b->products() } qr/Error reading or parsing $cached_options_file/, + 'Exception is thrown when the cached data cannot be read'; + + write_file($cached_options_file, encode_json({'option' => 'value'})) + or die "Failed writing to $cached_options_file"; + $b = npg_pipeline::base->new( + runfolder_path => $runfolder_path, + bam_basecall_path => $bam_basecall_path, + id_run => $id_run, + ); + lives_ok { @products = @{$b->products()->{'data_products'}} } + 'no error generating products'; + is (@products, $num_merged, "number of data products is $num_merged"); + + write_file($cached_options_file, + encode_json({'option' => 'value', 'process_separately_lanes' => [(1 .. 
8)]})) + or die "Failed writing to $cached_options_file"; + $b = npg_pipeline::base->new( + runfolder_path => $runfolder_path, + bam_basecall_path => $bam_basecall_path, + id_run => $id_run, + ); + lives_ok { @products = @{$b->products()->{'data_products'}} } + 'no error generating products'; + is (@products, $num_unmerged, "number of data products is $num_unmerged"); + my @log_lines = read_file($log_file) or die "Failed to read the log $log_file"; + like ($log_lines[-1], + qr/Found process_separately_lanes analysis option/, 'information is logged'); + + write_file($cached_options_file, + encode_json({'process_separately_lanes' => [1,2,3]})) + or die "Failed writing to $cached_options_file"; + $b = npg_pipeline::base->new( + runfolder_path => $runfolder_path, + bam_basecall_path => $bam_basecall_path, + id_run => $id_run, + ); + my $num_partially_merged = 16 + 22 + 38 + (10+17)*2; + @products = @{$b->products()->{'data_products'}}; + is (@products, $num_partially_merged, + "number of data products is $num_partially_merged"); + @log_lines = read_file($log_file) or die "Failed to read the log $log_file"; + like ($log_lines[-1], qr/option in $cached_options_file: 1, 2, 3/, + 'information is logged'); }; 1; diff --git a/t/10-pluggable-central.t b/t/10-pluggable-central.t index 8d097660a..df016457d 100644 --- a/t/10-pluggable-central.t +++ b/t/10-pluggable-central.t @@ -1,10 +1,13 @@ use strict; use warnings; -use Test::More tests => 22; +use Test::More tests => 5; use Test::Exception; use Log::Log4perl qw(:levels); use File::Copy qw(cp); use File::Path qw(make_path); +use File::Temp qw(tempdir); +use File::Slurp qw(read_file write_file); +use JSON; use t::util; @@ -18,6 +21,7 @@ foreach my $tool (@tools) { } chmod 0755, @tools; local $ENV{'PATH'} = join q[:], $tdir, $ENV{'PATH'}; +local $ENV{'HOME'} = 't'; my $product_config = q[t/data/release/config/archive_on/product_release.yml]; @@ -26,74 +30,181 @@ Log::Log4perl->easy_init({layout => '%d %-5p %c - %m%n', file => 
join(q[/], $tdir, 'logfile'), utf8 => 1}); +my $test_data_dir_47995 = 't/data/novaseqx/20231017_LH00210_0012_B22FCNFLT3'; +sub _setup_runfolder_47995 { + my $tmp_dir = tempdir(CLEANUP => 1); + my @dirs = split q[/], $test_data_dir_47995; + my $rf_name = pop @dirs; + my $rf_info = $util->create_runfolder($tmp_dir, {'runfolder_name' => $rf_name}); + my $rf = $rf_info->{'runfolder_path'}; + for my $file (qw(RunInfo.xml RunParameters.xml)) { + if (cp("$test_data_dir_47995/$file", "$rf/$file") == 0) { + die "Failed to copy $file"; + } + } + return $rf_info; +} + my $central = q{npg_pipeline::pluggable::central}; use_ok($central); -my $runfolder_path = $util->analysis_runfolder_path(); +subtest 'test object creation' => sub { + plan tests => 4; -{ - local $ENV{NPG_CACHED_SAMPLESHEET_FILE} = q[t/data/samplesheet_1234.csv]; - $util->create_analysis(); + local $ENV{NPG_CACHED_SAMPLESHEET_FILE} = "$test_data_dir_47995/samplesheet_47995.csv"; my $pipeline; lives_ok { $pipeline = $central->new( - runfolder_path => $runfolder_path, + runfolder_path => $tdir, ); - } q{no croak creating new object}; + } q{no error creating new object}; isa_ok($pipeline, $central); -} -{ - local $ENV{NPG_CACHED_SAMPLESHEET_FILE} = q[t/data/samplesheet_1234.csv]; - my $pb; lives_ok { - $pb = $central->new( + $pipeline = $central->new( function_order => [qw(qc_qX_yield qc_insert_size)], - runfolder_path => $runfolder_path, + runfolder_path => $tdir, ); - } q{no croak on creation}; - $util->create_analysis(); - is(join(q[ ], @{$pb->function_order()}), 'qc_qX_yield qc_insert_size', + } q{no error on creation}; + is(join(q[ ], @{$pipeline->function_order()}), 'qc_qX_yield qc_insert_size', 'function_order set on creation'); -} +}; + +subtest 'execute main() with a merge' => sub { + plan tests => 7; -{ local $ENV{CLASSPATH} = undef; - local $ENV{NPG_CACHED_SAMPLESHEET_FILE} = q[t/data/samplesheet_1234.csv]; + local $ENV{NPG_CACHED_SAMPLESHEET_FILE} = "$test_data_dir_47995/samplesheet_47995.csv"; + my 
$rf_info = _setup_runfolder_47995(); + my $config_dir = 'data/config_files'; + my $pb; - $util->create_analysis(); - cp 't/data/run_params/runParameters.hiseq.xml', - join(q[/], $runfolder_path, 'runParameters.xml'); + lives_ok { $pb = $central->new( + id_run => 47995, + function_order => [qw{qc_qX_yield qc_adapter update_ml_warehouse qc_insert_size}], + lanes => [3,4], + run_folder => $rf_info->{'runfolder_name'}, + runfolder_path => $rf_info->{'runfolder_path'}, + function_list => "$config_dir/function_list_central.json", + id_flowcell_lims => 17089, + no_bsub => 1, + repository => 't/data/sequence', + spider => 0, + product_conf_file_path => $product_config, + ); } q{no croak on new creation}; + + lives_ok { $pb->main() } q{no croak running qc->main()}; - $util->create_run_info(); + my $rf = $rf_info->{'runfolder_path'}; + my %dirs = + map { $_ => 1 } + map { /(lane.+\z)/ } + grep { -d } + glob "$rf/Data/Intensities/BAM_basecalls_*/no_cal/archive/lane*"; + is (scalar keys %dirs, 3, 'three directories for lanes 3 and 4 are created'); + # Presence of lane3-4 dir indicates that data from lanes 3 and 4 will be merged. 
+ for my $name (qw(lane3 lane4 lane3-4)) { + ok (exists $dirs{$name}, "directory '$name' exists"); + } + my @files = grep { -f } + glob "$rf/Data/Intensities/BAM_basecalls_*/metadata_cache_47995/*.json"; + is (@files, 0, 'No JSON files in the metadata cache directory'); +}; + +subtest 'execute main() with merge supressed' => sub { + plan tests => 13; + + local $ENV{CLASSPATH} = undef; + my $samplesheet = "$test_data_dir_47995/samplesheet_47995.csv"; + my $rf_info = _setup_runfolder_47995(); + my $rf = $rf_info->{'runfolder_path'}; my $config_dir = 'data/config_files'; + my $init = { - function_order => [qw{qc_qX_yield qc_adapter update_ml_warehouse qc_insert_size}], - lanes => [4], - runfolder_path => $runfolder_path, - function_list => "$config_dir/function_list_central.json", - id_flowcell_lims => 2015, - no_bsub => 1, - repository => 't/data/sequence', - spider => 0, - no_sf_resource => 1, - product_conf_file_path => $product_config, + id_run => 47995, + function_order => [qw{qc_qX_yield qc_adapter update_ml_warehouse qc_insert_size}], + lanes => [3,4], + process_separately_lanes => [4,3], + run_folder => $rf_info->{'runfolder_name'}, + runfolder_path => $rf, + function_list => "$config_dir/function_list_central.json", + id_flowcell_lims => 17089, + no_bsub => 1, + repository => 't/data/sequence', + spider => 0, + product_conf_file_path => $product_config, }; + local $ENV{NPG_CACHED_SAMPLESHEET_FILE} = $samplesheet; + my $pb = $central->new($init); + lives_ok { $pb->main() } 'no error running qc->main()'; + my $bam_basecall_path = $pb->bam_basecall_path(); + + my %dirs = + map { $_ => 1 } + map { /(lane.+\z)/ } + grep { -d } + glob "$rf/Data/Intensities/BAM_basecalls_*/no_cal/archive/lane*"; + is (scalar keys %dirs, 2, 'two directories for lanes 3 and 4 are created'); + # Absence of lane3-4 dir indicates that data from lanes 3 and 4 will + # not be merged. 
+ for my $name (qw(lane3 lane4)) { + ok (exists $dirs{$name}, "directory '$name' exists"); + } + + my $file_with_cache = "$bam_basecall_path/metadata_cache_47995/analysis_options.json"; + ok (-f $file_with_cache, 'A file with cached no-merge options exists'); + is_deeply (decode_json(read_file($file_with_cache))->{'process_separately_lanes'}, + [3,4], 'no-merge options are correctly cached'); + + # Run once more. Reuse bam_basecalls directory. Expect no change. + local $ENV{NPG_CACHED_SAMPLESHEET_FILE} = $samplesheet; + $init->{'bam_basecall_path'} = $bam_basecall_path; + $pb = $central->new($init); + lives_ok { $pb->main() } 'no error running qc->main() with the same options'; + ok (-f $file_with_cache, 'A file with cached no-merge options is retained'); - lives_ok { $pb = $central->new($init); } q{no croak on new creation}; - mkdir $pb->archive_path; - lives_ok { $pb->main() } q{no croak running qc->main()}; -} - -{ - local $ENV{NPG_CACHED_SAMPLESHEET_FILE} = q[t/data/samplesheet_1234.csv]; - my $rf = join q[/], $tdir, 'myfolder'; - mkdir $rf; - cp 't/data/run_params/runParameters.hiseq.xml', - join(q[/], $rf, 'runParameters.xml'); + # Run once more with different lanes not to merge. + my $error = 'Lane list from process_separately_lanes attribute is ' . 
+ 'inconsistent with cached value'; + $init->{'process_separately_lanes'} = [8,7]; + $pb = $central->new($init); + throws_ok { $pb->main() } qr/$error/, + 'error running qc->main() with different no-merge options'; + + # The file exists, but the no-merge option is not captured; + local $ENV{NPG_CACHED_SAMPLESHEET_FILE} = $samplesheet; + $init->{'process_separately_lanes'} = [4,3]; + write_file($file_with_cache, + encode_json({'some option' => 'some option value'})); + $pb = $central->new($init); + lives_ok { $pb->main() } 'no error running qc->main()'; + is_deeply (decode_json(read_file($file_with_cache)), + {'some option' => 'some option value', 'process_separately_lanes' => [3,4]}, + 'the no-merged option has been added to the file' + ); + + # The file exists, the no-merge option is captured as an empty list. + write_file($file_with_cache, + encode_json({'option' => 'value', 'process_separately_lanes' => []})); + local $ENV{NPG_CACHED_SAMPLESHEET_FILE} = $samplesheet; + $init->{'process_separately_lanes'} = [3,4]; + $pb = $central->new($init); + lives_ok { $pb->main() } 'no error running qc->main()'; + is_deeply (decode_json(read_file($file_with_cache)), + {'option' => 'value', 'process_separately_lanes' => [3,4]}, + 'the no-merged option has been added to the file' + ); +}; + +subtest 'execute prepare()' => sub { + plan tests => 12; + + local $ENV{NPG_CACHED_SAMPLESHEET_FILE} = "$test_data_dir_47995/samplesheet_47995.csv"; + my $rf_info = _setup_runfolder_47995(); + my $rf = $rf_info->{'runfolder_path'}; my $init = { - id_run => 1234, - run_folder => 'myfolder', + id_run => 47995, + run_folder => $rf_info->{'runfolder_name'}, runfolder_path => $rf, timestamp => '22-May', spider => 0, @@ -103,14 +214,6 @@ my $runfolder_path = $util->analysis_runfolder_path(); my $pb = $central->new($init); is ($pb->intensity_path, "$rf/Data/Intensities", 'intensities path'); is ($pb->basecall_path, "$rf/Data/Intensities/BaseCalls", 'basecalls path'); - throws_ok { $pb->prepare() 
} - qr/does not exist, either bam_basecall_path or analysis_path should be given/, - q{error scaffolding the run folder}; - - make_path "$rf/Data/Intensities"; - $pb = $central->new($init); - is ($pb->intensity_path, "$rf/Data/Intensities", 'intensities path'); - is ($pb->basecall_path, "$rf/Data/Intensities/BaseCalls", 'basecalls path'); lives_ok { $pb->prepare() } 'prepare runs fine'; my $expected_pb_cal = join q[/],$rf,q{Data/Intensities/BAM_basecalls_22-May}; is ($pb->bam_basecall_path, $expected_pb_cal, 'bam basecall path is set'); @@ -137,6 +240,6 @@ my $runfolder_path = $util->analysis_runfolder_path(); $pb = $central->new($init); $pb->prepare(); is ($pb->bam_basecall_path, $expected_pb_cal, 'bam basecall path is set'); -} +}; 1; diff --git a/t/10-pluggable.t b/t/10-pluggable.t index f85967a94..c6b383306 100644 --- a/t/10-pluggable.t +++ b/t/10-pluggable.t @@ -23,6 +23,7 @@ foreach my $tool (@tools) { } chmod 0755, @tools; local $ENV{'PATH'} = join q[:], $test_bin, $ENV{'PATH'}; +local $ENV{'HOME'} = 't'; Log::Log4perl->easy_init({layout => '%d %-5p %c - %m%n', level => $DEBUG, diff --git a/t/10-runfolder_scaffold.t b/t/10-runfolder_scaffold.t index 880d1abf3..de30e890d 100644 --- a/t/10-runfolder_scaffold.t +++ b/t/10-runfolder_scaffold.t @@ -33,7 +33,7 @@ subtest 'tests for class methods' => sub { }; subtest 'top level scaffold' => sub { - plan tests => 10; + plan tests => 12; my $util = t::util->new(); my $rfh = $util->create_runfolder(); @@ -59,11 +59,16 @@ subtest 'top level scaffold' => sub { ok (-e $dir, 'archive directory created'); ok (-e "$dir/tileviz", 'tileviz index directory created'); ok (-e "$bbc_path/status", 'status directory created'); - ok (-e "$bbc_path/metadata_cache_999", 'metadata cache directory created'); + is ($rfs->metadata_cache_dir_path, "$bbc_path/metadata_cache_999"); + ok (-e $rfs->metadata_cache_dir_path, 'metadata cache directory created'); ok (-e "$bbc_path/irods_publisher_restart_files", 'directory for iRODS publisher 
restart files created'); ok (-e "$bbc_path/irods_locations_files", - 'directory for iRODS location json files created') + 'directory for iRODS location json files created'); + + is ($rfs->analysis_options_file_path, + "$bbc_path/metadata_cache_999/analysis_options.json", + 'file path for the analysis options cache') }; subtest 'product level scaffold, NovaSeq all lanes' => sub { diff --git a/t/15-pipeline_launcher_scripts.t b/t/15-pipeline_launcher_scripts.t index 24a89c8c5..5e6dabeb0 100644 --- a/t/15-pipeline_launcher_scripts.t +++ b/t/15-pipeline_launcher_scripts.t @@ -1,10 +1,12 @@ use strict; use warnings; -use English qw{-no_match_vars}; -use Test::More tests => 10; +use English qw(-no_match_vars); +use Test::More tests => 4; use Test::Exception; use File::Copy; use Cwd; +use File::Temp qw(tempdir); +use File::Path qw(make_path); use t::util; @@ -20,69 +22,98 @@ foreach my $tool (@tools) { } chmod 0755, @tools; -local $ENV{'PATH'} = join q[:], $tmp_dir, $bin, $ENV{'PATH'}; -local $ENV{'http_proxy'} = q[http://wibble]; -local $ENV{'no_proxy'} = q[]; +local $ENV{'PATH'} = join q[:], $tmp_dir, $bin, $ENV{'PATH'}; +local $ENV{'HOME'} = 't'; -my $rf = $util->analysis_runfolder_path; -my $bbp = "$rf/bam_basecall_path"; my $product_config = q[t/data/release/config/archive_on/product_release.yml]; +my $test_data_dir_47995 = 't/data/novaseqx/20231017_LH00210_0012_B22FCNFLT3'; + +sub _setup_runfolder_47995 { + my $tmp_dir = tempdir(CLEANUP => 1); + my @dirs = split q[/], $test_data_dir_47995; + my $rf_name = pop @dirs; + my $rf_info = $util->create_runfolder($tmp_dir, {'runfolder_name' => $rf_name}); + my $rf = $rf_info->{'runfolder_path'}; + for my $file (qw(RunInfo.xml RunParameters.xml)) { + if (copy("$test_data_dir_47995/$file", "$rf/$file") == 0) { + die "Failed to copy $file"; + } + } + return $rf_info; +} { + my $rf_info = _setup_runfolder_47995(); + my $rf = $rf_info->{'runfolder_path'}; local $ENV{NPG_CACHED_SAMPLESHEET_FILE} = q{/does/not/exist.csv}; - 
$util->create_analysis(); - my $out = `$bin/npg_pipeline_central --product_conf_file_path $product_config --spider --no_bsub --no_sf_resource --runfolder_path $rf --function_order dodo 2>&1`; - like($out, - qr/Error initializing pipeline: Error while spidering/, - 'error in spidering when pre-set samplesheet does not exist'); + my $command = "$bin/npg_pipeline_central " . + "--product_conf_file_path $product_config --spider --no_bsub " . + "--runfolder_path $rf --function_order dodo 2>&1"; + note "Executing $command"; + like(`$command`, qr/Error initializing pipeline: Error while spidering/, + 'error in spidering when pre-set samplesheet does not exist'); } -{ - local $ENV{NPG_CACHED_SAMPLESHEET_FILE} = q{t/data/samplesheet_1234.csv}; - $util->create_analysis(); - $util->create_run_info(); - - my $out = `$bin/npg_pipeline_central --product_conf_file_path $product_config --no-spider --no_bsub --no_sf_resource --runfolder_path $rf --bam_basecall_path $bbp --function_order dodo 2>&1`; - like($out, - qr/Function dodo cannot be found in the graph/, - 'error when function does not exist'); -} +subtest 'test analysis and archival pipeline scripts' => sub { + plan tests => 5; + + # A full run folder is scaffolded by the analysis pipeline. + # The archival pipeline is using the same run folder. + + my $rf_info = _setup_runfolder_47995(); + my $rf = $rf_info->{'runfolder_path'}; + local $ENV{NPG_CACHED_SAMPLESHEET_FILE} = "$test_data_dir_47995/samplesheet_47995.csv"; + my $command = "$bin/npg_pipeline_central " . + "--product_conf_file_path $product_config --no-spider --no_bsub " . + "--runfolder_path $rf --function_order create_summary_link_analysis " . 
+ "--function_order dodo 2>&1"; + note "Executing $command"; + like(`$command`, qr/Function dodo cannot be found in the graph/, + 'error when function does not exist'); -{ - local $ENV{NPG_CACHED_SAMPLESHEET_FILE} = q{t/data/samplesheet_1234.csv}; my $config_dir = join q[/], $tmp_dir, 'config'; - mkdir $config_dir; + make_path($config_dir); my @files = glob 'data/config_files/*.{json,ini}'; push @files, 't/data/release/config/archive_on/product_release.yml'; for (@files) { copy $_, $config_dir; } - lives_ok { qx{ - $bin/npg_pipeline_post_qc_review --no_bsub --no_sf_resource --runfolder_path $rf --bam_basecall_path $bbp --conf_path $config_dir};} - q{ran bin/npg_pipeline_post_qc_review}; - ok(!$CHILD_ERROR, qq{Return code of $CHILD_ERROR}); - - lives_ok { qx{ - $bin/npg_pipeline_post_qc_review --no_bsub --no_sf_resource --runfolder_path $rf --bam_basecall_path $bbp --function_list some --conf_path $config_dir}; } - q{ran bin/npg_pipeline_post_qc_review with non-exisiting function list}; - ok($CHILD_ERROR, qq{Child error $CHILD_ERROR}); -} - -{ - $util->create_analysis(); - $util->create_run_info(); - - lives_ok { qx{$bin/npg_pipeline_seqchksum_comparator --id_run=1234 --archive_path=$rf/Data/Intensities/BAM_basecalls_20140815-114817/no_cal/archive --bam_basecall_path=$rf/Data/Intensities/BAM_basecalls_20140815-114817 --lanes=1 };} q{ran bin/npg_pipeline_seqchksum_comparator with analysis and bam_basecall_path}; + $command = "$bin/npg_pipeline_post_qc_review --no_bsub --runfolder_path $rf " . + "--conf_path $config_dir"; + note "Executing $command"; + lives_ok { `$command` } 'ran bin/npg_pipeline_post_qc_review'; + ok(!$CHILD_ERROR, 'No error running command'); + + $command = "$bin/npg_pipeline_post_qc_review --no_bsub --runfolder_path $rf " . 
+ "--conf_path $config_dir --function_list some"; + note "Executing $command"; + lives_ok { `$command` } + 'ran bin/npg_pipeline_post_qc_review with non-exisiting function list'; + ok($CHILD_ERROR, "Child error $CHILD_ERROR"); +}; + +subtest 'test npg_pipeline_seqchksum_comparator script' => sub { + plan tests => 2; + my $rf_info = _setup_runfolder_47995(); + my $rf = $rf_info->{'runfolder_path'}; + my $bbc = "$rf/Data/Intensities/BAM_basecalls"; + my $apath = "$bbc/no_cal/archive"; + make_path($apath); + my $command = "$bin/npg_pipeline_seqchksum_comparator --id_run=1234 " . + "--archive_path=$apath --bam_basecall_path=$bbc --lanes=1"; + note "Executing $command"; + lives_ok { `$command` } + 'ran npg_pipeline_seqchksum_comparator with analysis and bam_basecall_path'; ok($CHILD_ERROR, qq{Return code of $CHILD_ERROR as no files found}); -} +}; -{ +subtest 'test npg_pipeline_preexec_references script' => sub { + plan tests => 2; `bin/npg_pipeline_preexec_references --repository t/data/sequence/refs 2>/dev/null`; - ok( $CHILD_ERROR, qq{failed as could not locate references directory - $CHILD_ERROR} ); - - qx{bin/npg_pipeline_preexec_references --repository t/data/sequence}; - ok( ! $CHILD_ERROR, q{script runs OK} ); -} + ok($CHILD_ERROR, "failed as could not locate references directory - $CHILD_ERROR"); + `bin/npg_pipeline_preexec_references --repository t/data/sequence`; + ok(! 
$CHILD_ERROR, 'script runs OK'); +}; 1; diff --git a/t/20-function-cluster_count.t b/t/20-function-cluster_count.t index 51131d116..2423e1a44 100644 --- a/t/20-function-cluster_count.t +++ b/t/20-function-cluster_count.t @@ -1,10 +1,13 @@ use strict; use warnings; -use English qw{-no_match_vars}; -use Test::More tests => 27; +use English qw(-no_match_vars); +use Test::More tests => 23; use Test::Exception; use Log::Log4perl qw(:levels); use File::Copy qw(cp); +use File::Temp qw(tempdir); +use File::Path qw(make_path); + use t::util; use_ok( q{npg_pipeline::function::cluster_count} ); @@ -17,16 +20,33 @@ Log::Log4perl->easy_init({layout => '%d %-5p %c - %m%n', file => join(q[/], $dir, 'logfile'), utf8 => 1}); -$util->create_analysis(); -my $analysis_runfolder_path = $util->analysis_runfolder_path(); -my $bam_basecall_path = $util->standard_bam_basecall_path(); -my $recalibrated_path = $util->standard_analysis_recalibrated_path(); -my $archive_path = $recalibrated_path . q{/archive}; +my $test_data_dir_47995 = 't/data/novaseqx/20231017_LH00210_0012_B22FCNFLT3'; + +sub _setup_runfolder_47995 { + + my $tmp_dir = tempdir(CLEANUP => 1); + my @dirs = split q[/], $test_data_dir_47995; + my $rf_name = pop @dirs; + my $rf_info = $util->create_runfolder($tmp_dir, + {'runfolder_name' => $rf_name, 'analysis_path' => 'BAM_basecalls_20240508-204057'}); + my $rf = $rf_info->{'runfolder_path'}; + for my $file (qw(RunInfo.xml RunParameters.xml)) { + if (cp("$test_data_dir_47995/$file", "$rf/$file") == 0) { + die "Failed to copy $file"; + } + } + $rf_info->{'bam_basecall_path'} = $rf_info->{'analysis_path'}; + + my $archive_path = $rf_info->{'archive_path'}; + my @paths = map { "$archive_path/lane$_/qc" } (1 .. 
8); + make_path(@paths); -cp 't/data/run_params/runParameters.miseq.xml', "$analysis_runfolder_path/runParameters.xml"; + my $nocall_path = $rf_info->{'nocal_path'}; + `touch $nocall_path/47995_bfs_fofn.txt`; + `touch $nocall_path/47995_sf_fofn.txt`; -`touch $recalibrated_path/1234_bfs_fofn.txt`; -`touch $recalibrated_path/1234_sf_fofn.txt`; + return $rf_info; +} my $default = { default => { @@ -36,152 +56,155 @@ my $default = { }; { - local $ENV{NPG_CACHED_SAMPLESHEET_FILE} = q[t/data/samplesheet_1234.csv]; + my $rf_info = _setup_runfolder_47995(); + my $bam_basecall_path = $rf_info->{'bam_basecall_path'}; + my $runfolder_path = $rf_info->{'runfolder_path'}; + my $archive_path = $rf_info->{'archive_path'}; + my $id_run = 47995; + local $ENV{NPG_CACHED_SAMPLESHEET_FILE} = "$test_data_dir_47995/samplesheet_47995.csv"; my $object; lives_ok { $object = npg_pipeline::function::cluster_count->new( - run_folder => q{123456_IL2_1234}, - runfolder_path => $analysis_runfolder_path, - bam_basecall_path => $bam_basecall_path, - id_run => 1234, - timestamp => q{20100907-142417}, + runfolder_path => $rf_info->{'runfolder_path'}, + bam_basecall_path => $rf_info->{'bam_basecall_path'}, + id_run => $id_run, is_indexed => 0, bfs_fofp_name => q{}, sf_fofp_name => q{}, - resource => $default + resource => $default, ); } q{obtain object ok}; - isa_ok( $object, q{npg_pipeline::function::cluster_count}); my $da = $object->create(); - ok ($da && @{$da} == 1, 'an array with one definition is returned for eight lanes (run-level check)'); - + ok ($da && @{$da} == 1, + 'an array with one definition is returned for eight lanes (run-level check)'); my $d = $da->[0]; is ($d->created_by, 'npg_pipeline::function::cluster_count', 'created_by is correct'); is ($d->created_on, $object->timestamp, 'created_on is correct'); - is ($d->identifier, 1234, 'identifier is set correctly'); + is ($d->identifier, $id_run, 'identifier is set correctly'); ok (!$d->excluded, 'step not excluded'); ok 
(!$d->has_composition, 'composition is not set'); lives_ok {$d->freeze()} 'definition can be serialized to JSON'; - - my $values = {}; - map {$values->{ref $_} += 1} @{$da}; - is ($values->{'npg_pipeline::function::definition'}, 1, - 'one definition object returned'); - - map {$values->{$_->job_name} += 1} @{$da}; - is ($values->{'npg_pipeline_check_cluster_count_1234_20100907-142417'}, 1, + like ($d->job_name, qr/\Anpg_pipeline_check_cluster_count_$id_run/, 'the job is named correctly'); - - map {$values->{$_->queue} += 1} @{$da}; - is ($values->{'default'}, 1, 'the queue is set to default for the definition'); - - my $command = sprintf q[npg_pipeline_check_cluster_count --id_run=1234 --lanes=1 --lanes=2 --lanes=3 --lanes=4 --lanes=5 --lanes=6 --lanes=7 --lanes=8 --bam_basecall_path=%s --runfolder_path=%s %s %s], $bam_basecall_path, $analysis_runfolder_path, join(q{ }, (map {qq[--bfs_paths=$archive_path/lane$_/qc]} (1..8))), join(q{ }, (map {qq[--sf_paths=$archive_path/lane$_/qc]} (1..8))); -# my $command = sprintf q[npg_pipeline_check_cluster_count --id_run=1234 --lanes=1 --lanes=2 --lanes=3 --lanes=4 --lanes=5 --lanes=6 --lanes=7 --lanes=8 --bam_basecall_path=%s --runfolder_path=%s --bfs_fofp_name=%s/1234_bfs_fofn.txt --sf_fofp_name=%s/1234_sf_fofn.txt], $bam_basecall_path, $analysis_runfolder_path, $recalibrated_path, $recalibrated_path; - + is ($d->queue, 'default', 'the queue is set to default for the definition'); + + my $bfs_paths = join q{ }, (map {qq[--bfs_paths=$archive_path/lane$_/qc]} (1..8)); + my $sf_paths = join q{ }, (map {qq[--sf_paths=$archive_path/lane$_/qc]} (1..8)); + my $command = sprintf q[npg_pipeline_check_cluster_count --id_run=%i ] . + q[--lanes=1 --lanes=2 --lanes=3 --lanes=4 --lanes=5 --lanes=6 --lanes=7 --lanes=8 ] . 
+ q[--bam_basecall_path=%s --runfolder_path=%s %s %s], + $id_run, $bam_basecall_path, $runfolder_path, $bfs_paths, $sf_paths; is ($da->[0]->command, $command, 'correct command'); } { local $ENV{NPG_CACHED_SAMPLESHEET_FILE} = q[t/data/samplesheet_8747.csv]; - my $analysis_runfolder_path = 't/data/example_runfolder/121103_HS29_08747_B_C1BV5ACXX'; - my $bam_basecall_path = "$analysis_runfolder_path/Data/Intensities/BAM_basecalls_20130122-085552"; + my $runfolder_path = 't/data/example_runfolder/121103_HS29_08747_B_C1BV5ACXX'; + my $bam_basecall_path = "$runfolder_path/Data/Intensities/BAM_basecalls_20130122-085552"; my $archive_path = "$bam_basecall_path/no_cal/archive"; - my $object; - lives_ok{ - $object = npg_pipeline::function::cluster_count->new( - id_run => 8747, - lanes => [1], - runfolder_path => $analysis_runfolder_path, - bam_basecall_path => $bam_basecall_path, - archive_path => $archive_path, - bfs_paths => [ qq[$archive_path/lane1/qc] ], - bfs_fofp_name => q{}, - sf_fofp_name => q{}, - resource => $default - ); - } q{obtain object ok}; - + my $object = npg_pipeline::function::cluster_count->new( + id_run => 8747, + lanes => [1], + runfolder_path => $runfolder_path, + bam_basecall_path => $bam_basecall_path, + archive_path => $archive_path, + bfs_paths => [ qq[$archive_path/lane1/qc] ], + bfs_fofp_name => q{}, + sf_fofp_name => q{}, + resource => $default, + ); lives_ok { $object->run_cluster_count_check(); - } q{check returns ok}; + } q{check runs ok}; } { - local $ENV{NPG_CACHED_SAMPLESHEET_FILE} = q[t/data/samplesheet_1234.csv]; - my $object; - lives_ok{ - $object = npg_pipeline::function::cluster_count->new( - run_folder => q{123456_IL2_1234}, - runfolder_path => $analysis_runfolder_path, - bam_basecall_path => $bam_basecall_path, - archive_path => $archive_path, - bfs_paths => [ qq{$archive_path/lane3/qc} ], - bfs_fofp_name => q{}, - sf_fofp_name => q{}, - resource => $default - ); - } q{obtain object ok}; - + my $rf_info = _setup_runfolder_47995(); + 
my $bam_basecall_path = $rf_info->{'bam_basecall_path'}; + my $runfolder_path = $rf_info->{'runfolder_path'}; + my $archive_path = $rf_info->{'archive_path'}; + my $id_run = 47995; + local $ENV{NPG_CACHED_SAMPLESHEET_FILE} = "$test_data_dir_47995/samplesheet_47995.csv"; + + my $object = npg_pipeline::function::cluster_count->new( + id_run => 47995, + runfolder_path => $runfolder_path, + bam_basecall_path => $bam_basecall_path, + archive_path => $archive_path, + bfs_paths => [ qq{$archive_path/lane3/qc} ], + bfs_fofp_name => q{}, + sf_fofp_name => q{}, + resource => $default + ); ok( !$object->_bam_cluster_count_total({}), 'no bam cluster count total returned'); my $is_indexed = 1; - qx{mkdir -p $archive_path/lane3/qc}; - qx{cp t/data/bam_flagstats/1234_3_bam_flagstats.json $archive_path/lane3/qc/1234_3#0_bam_flagstats.json}; - qx{cp t/data/bam_flagstats/1234_3_bam_flagstats.json $archive_path/lane3/qc/1234_3#1_bam_flagstats.json}; - - is( $object->_bam_cluster_count_total( {plex=>$is_indexed} ), 32, 'correct bam cluster count total for plexes'); - - qx{cp t/data/bam_flagstats/1234_3_phix_bam_flagstats.json $archive_path/lane3/qc/1234_3#0_phix_bam_flagstats.json}; - - is( $object->_bam_cluster_count_total( {plex=>$is_indexed} ), 46, 'correct bam cluster count total for plexes'); + cp("t/data/bam_flagstats/${id_run}_3_bam_flagstats.json", + "$archive_path/lane3/qc/${id_run}_3#0_bam_flagstats.json"); + cp("t/data/bam_flagstats/${id_run}_3_bam_flagstats.json", + "$archive_path/lane3/qc/${id_run}_3#1_bam_flagstats.json"); + is( $object->_bam_cluster_count_total( {plex=>$is_indexed} ), 32, + 'correct bam cluster count total for plexes'); + + cp("t/data/bam_flagstats/${id_run}_3_phix_bam_flagstats.json", + "$archive_path/lane3/qc/${id_run}_3#0_phix_bam_flagstats.json"); + is( $object->_bam_cluster_count_total( {plex=>$is_indexed} ), 46, + 'correct bam cluster count total for plexes'); } { local $ENV{NPG_CACHED_SAMPLESHEET_FILE} = q[t/data/samplesheet_8747.csv]; - my 
$analysis_runfolder_path = 't/data/example_runfolder/121103_HS29_08747_B_C1BV5ACXX'; - my $bam_basecall_path = "$analysis_runfolder_path/Data/Intensities/BAM_basecalls_20130122-085552"; + my $runfolder_path = 't/data/example_runfolder/121103_HS29_08747_B_C1BV5ACXX'; + my $bam_basecall_path = "$runfolder_path/Data/Intensities/BAM_basecalls_20130122-085552"; my $archive_path = "$bam_basecall_path/no_cal/archive"; - my $object; - lives_ok{ - $object = npg_pipeline::function::cluster_count->new( - id_run => 8747, - lanes => [1], - runfolder_path => $analysis_runfolder_path, - bam_basecall_path => $bam_basecall_path, - archive_path => $archive_path, - bfs_paths => [ qq{$archive_path/lane1/qc} ], - sf_paths => [ qq{$archive_path/lane1/qc} ], - bfs_fofp_name => q{}, - sf_fofp_name => q{}, - resource => $default - ); - } q{obtain object ok}; + my $object = npg_pipeline::function::cluster_count->new( + id_run => 8747, + lanes => [1], + runfolder_path => $runfolder_path, + bam_basecall_path => $bam_basecall_path, + archive_path => $archive_path, + bfs_paths => [ qq{$archive_path/lane1/qc} ], + sf_paths => [ qq{$archive_path/lane1/qc} ], + bfs_fofp_name => q{}, + sf_fofp_name => q{}, + resource => $default + ); - is( $object->_bam_cluster_count_total({plex=>1}), 301389338, 'correct bam cluster count total'); - rename "$archive_path/lane1/qc/8747_1#0_bam_flagstats.json", "$archive_path/lane1/qc/8747_1#0_bam_flagstats.json.RENAMED"; - throws_ok {$object->run_cluster_count_check()} qr{Cluster count in bam files not as expected}, 'Cluster count in bam files not as expected'; - rename "$archive_path/lane1/qc/8747_1#0_bam_flagstats.json.RENAMED", "$archive_path/lane1/qc/8747_1#0_bam_flagstats.json"; + is( $object->_bam_cluster_count_total({plex=>1}), 301389338, + 'correct bam cluster count total'); + rename "$archive_path/lane1/qc/8747_1#0_bam_flagstats.json", + "$archive_path/lane1/qc/8747_1#0_bam_flagstats.json.RENAMED"; + throws_ok {$object->run_cluster_count_check()} + 
qr{Cluster count in bam files not as expected}, + 'Cluster count in bam files not as expected'; + rename "$archive_path/lane1/qc/8747_1#0_bam_flagstats.json.RENAMED", + "$archive_path/lane1/qc/8747_1#0_bam_flagstats.json"; ok($object->run_cluster_count_check(), 'Cluster count in bam files as expected'); } { local $ENV{NPG_CACHED_SAMPLESHEET_FILE} = q[t/data/samplesheet_8747.csv]; - my $analysis_runfolder_path = 't/data/example_runfolder/121103_HS29_08747_B_C1BV5ACXX'; - my $bam_basecall_path = "$analysis_runfolder_path/Data/Intensities/BAM_basecalls_20130122-085552"; + my $runfolder_path = 't/data/example_runfolder/121103_HS29_08747_B_C1BV5ACXX'; + my $bam_basecall_path = "$runfolder_path/Data/Intensities/BAM_basecalls_20130122-085552"; my $archive_path = "$bam_basecall_path/no_cal/archive"; my $recalibrated_path = "$bam_basecall_path/no_cal"; my $common_command = sub { my $p = shift; - return sprintf q{$EXECUTABLE_NAME bin/npg_pipeline_check_cluster_count --bfs_fofp_name %s/lane%d/8747_bfs_fofn.txt --sf_fofp_name %s/lane%d/8747_sf_fofn.txt --id_run 8747 --bam_basecall_path %s --lanes %d --runfolder_path %s}, $archive_path, $p, $archive_path, $p, $bam_basecall_path, $p, $analysis_runfolder_path; + return sprintf q{$EXECUTABLE_NAME bin/npg_pipeline_check_cluster_count } . + q{--bfs_fofp_name %s/lane%d/8747_bfs_fofn.txt } . + q{--sf_fofp_name %s/lane%d/8747_sf_fofn.txt --id_run 8747 } . 
+ q{--bam_basecall_path %s --lanes %d --runfolder_path %s}, + $archive_path, $p, $archive_path, $p, $bam_basecall_path, $p, + $runfolder_path; }; my $c; diff --git a/t/20-function-p4_stage1_analysis.t b/t/20-function-p4_stage1_analysis.t index 12ac49760..1a87a2963 100644 --- a/t/20-function-p4_stage1_analysis.t +++ b/t/20-function-p4_stage1_analysis.t @@ -1,12 +1,12 @@ use strict; use warnings; -use Test::More tests => 5; +use Test::More tests => 4; use Test::Exception; -use Cwd qw(getcwd abs_path); use File::Copy qw(cp); use File::Copy::Recursive qw(dircopy); use Perl6::Slurp; use JSON; +use File::Temp qw(tempdir); use t::util; @@ -14,15 +14,6 @@ my $util = t::util->new(clean_temp_directory => 1); my $dir = $util->temp_directory(); use_ok('npg_pipeline::function::p4_stage1_analysis'); -my $current = abs_path(getcwd()); - -# Copy cache dir to a temp location since a tag file will -# be created there. -my $new = "$dir/1234_samplesheet.csv"; -`cp -r t/data/p4_stage1_analysis/* $dir`; -local $ENV{NPG_CACHED_SAMPLESHEET_FILE} = $new; -local $ENV{'http_proxy'} = 'http://wibble.com'; -local $ENV{'no_proxy'} = q{}; my $default = { default => { @@ -44,59 +35,71 @@ my $repos_root = $dir . q{/srpipe_references}; `touch $repos_root/references/PhiX/default/all/fasta/phix_unsnipped_short_no_N.fa`; `touch $repos_root/references/PhiX/default/all/minimap2/phix_unsnipped_short_no_N.fa.mmi`; -$util->create_analysis(); -my $runfolder = $util->analysis_runfolder_path() . '/'; -cp('t/data/runfolder/Data/RunInfo.xml', $runfolder) or die 'Failed to copy run info'; -cp('t/data/run_params/runParameters.miseq.xml', $runfolder . 
'runParameters.xml') or - die 'Failed to copy run params'; - -my $bc_path = q{/nfs/sf45/IL2/analysis/123456_IL2_1234/Data/Intensities/BaseCalls}; - -my $bam_generator = npg_pipeline::function::p4_stage1_analysis->new( - run_folder => q{123456_IL2_1234}, +sub _create_runfolder { + my $rf_name = shift; + + my $tdir = tempdir(CLEANUP => 1); + my $id_run = 24347; + my $rf_info = $util->create_runfolder($tdir, + {'runfolder_name' => $rf_name, analysis_path => 'BAM_basecalls'}); + my $runfolder = $rf_info->{'runfolder_path'}; + my $bam_basecall_path = $rf_info->{'analysis_path'}; + + cp('t/data/miseq/24347_RunInfo.xml', "$runfolder/RunInfo.xml") + or die 'Failed to copy run info'; + cp('t/data/run_params/runParameters.miseq.xml', "$runfolder/runParameters.xml") + or die 'Failed to copy run params'; + + my $interop_dir = join q[/], $runfolder, 'InterOp'; + mkdir $interop_dir; + my $tm_file = 'TileMetricsOut.bin'; + cp("t/data/p4_stage1_analysis/$tm_file", "$interop_dir/$tm_file") + or die 'Failed to copy the InterOp file'; + mkdir join(q[/], $bam_basecall_path , "metadata_cache_${id_run}") + or die 'Failed to create directory'; + + return $rf_info; +} + +subtest 'check saving arguments' => sub { + plan tests => 25; + + my $id_run = 24347; + my $rf_name = '171114_MS6_24347_A_MS5534842-300V2'; + my $run_info = _create_runfolder($rf_name); + my $runfolder = $run_info->{'runfolder_path'}; + my $bam_basecall_path = $run_info->{'analysis_path'}; + my $no_cal_path = join q[/], $bam_basecall_path, 'no_cal'; + my $intensities_dir = $run_info->{'intensity_path'}; + my $timestamp = '20240514'; + + local $ENV{NPG_CACHED_SAMPLESHEET_FILE} = 't/data/p4_stage1_analysis/samplesheet.csv'; + + my $bam_generator = npg_pipeline::function::p4_stage1_analysis->new( repository => $repos_root, - runfolder_path => $util->analysis_runfolder_path(), - timestamp => q{20090709-123456}, + runfolder_path => $runfolder, verbose => 0, - id_run => 1234, + id_run => $id_run, _extra_tradis_transposon_read 
=> 1, - bam_basecall_path => $util->standard_bam_basecall_path(), - resource => $default -); - -mkdir join(q[/], $bam_generator->bam_basecall_path(), 'metadata_cache_1234') - or die 'Failed to create directory'; - -subtest 'basics' => sub { - plan tests => 5; - + bam_basecall_path => $bam_basecall_path, + resource => $default, + timestamp => $timestamp, + ); isa_ok($bam_generator, q{npg_pipeline::function::p4_stage1_analysis}, q{$bam_generator}); is($bam_generator->_extra_tradis_transposon_read, 1, 'TraDIS set'); $bam_generator->_extra_tradis_transposon_read(0); is($bam_generator->_extra_tradis_transposon_read, 0, 'TraDIS not set'); isa_ok($bam_generator->lims, 'st::api::lims', 'cached lims object'); - my $alims = $bam_generator->lims->children_ia; - my $position = 8; - is($bam_generator->_get_number_of_plexes_excluding_control($alims->{$position}), - 2, 'correct number of plexes'); -}; - -subtest 'check_save_arguments' => sub { - plan tests => 29; - - my $bbp = $bam_generator->bam_basecall_path; - my $unique = $bam_generator->_job_id(); my $da = $bam_generator->generate(); - ok ($da && @{$da}==8, 'eight definitions returned'); + ok ($da && @{$da}==1, 'one definition is returned'); my $d = $da->[0]; isa_ok ($d, 'npg_pipeline::function::definition'); is ($d->created_by, 'npg_pipeline::function::p4_stage1_analysis', 'created by'); - is ($d->created_on, q{20090709-123456}, 'created on'); - is ($d->identifier, 1234, 'identifier'); + is ($d->identifier, $id_run, 'identifier'); ok (!$d->excluded, 'step is not excluded'); is ($d->queue, 'p4stage1', 'special queue'); - is ($d->job_name, 'p4_stage1_analysis_1234_20090709-123456', 'job name'); + like ($d->job_name, qr/\Ap4_stage1_analysis_$id_run/, 'job name'); is ($d->fs_slots_num, 4, '4 sf slots'); is ($d->num_hosts, 1, 'one host'); is_deeply ($d->num_cpus, [8], 'num cpus as an array'); @@ -109,73 +112,70 @@ subtest 'check_save_arguments' => sub { isa_ok ($composition, 'npg_tracking::glossary::composition'); is 
($composition->num_components, 1, 'one component'); my $component = $composition->get_component(0); - is ($component->id_run, 1234, 'run id correct'); + is ($component->id_run, $id_run, 'run id correct'); is ($component->position, 1, 'position correct'); ok (!defined $component->tag_index, 'tag index undefined'); - my $intensities_dir = $dir . '/nfs/sf45/IL2/analysis/123456_IL2_1234/Data/Intensities'; - my $expected = { - '1' => 'bash -c \' cd ' . $intensities_dir . '/BAM_basecalls_09-07-2009/p4_stage1_analysis/lane1/log && vtfp.pl -template_path $(dirname $(readlink -f $(which vtfp.pl)))/../data/vtlib -o run_1234_1.json -param_vals ' . $intensities_dir . '/BAM_basecalls_09-07-2009/p4_stage1_analysis/lane1/param_files/1234_1_p4s1_pv_in.json -export_param_vals 1234_1_p4s1_pv_out_' . $unique . '.json -keys cfgdatadir -vals $(dirname $(readlink -f $(which vtfp.pl)))/../data/vtlib/ -keys aligner_numthreads -vals `npg_pipeline_job_env_to_threads --num_threads 8 --exclude -2 --divide 3` -keys s2b_mt_val -vals `npg_pipeline_job_env_to_threads --num_threads 8 --exclude -1 --divide 3` -keys bamsormadup_numthreads -vals `npg_pipeline_job_env_to_threads --num_threads 8 --divide 3` -keys br_numthreads_val -vals `npg_pipeline_job_env_to_threads --num_threads 8` $(dirname $(dirname $(readlink -f $(which vtfp.pl))))/data/vtlib/bcl2bam_phix_deplex_wtsi_stage1_template.json && viv.pl -s -x -v 3 -o viv_1234_1.log run_1234_1.json \'', - '2' => 'bash -c \' cd ' . $intensities_dir . '/BAM_basecalls_09-07-2009/p4_stage1_analysis/lane2/log && vtfp.pl -template_path $(dirname $(readlink -f $(which vtfp.pl)))/../data/vtlib -o run_1234_2.json -param_vals ' . $intensities_dir . '/BAM_basecalls_09-07-2009/p4_stage1_analysis/lane2/param_files/1234_2_p4s1_pv_in.json -export_param_vals 1234_2_p4s1_pv_out_' . $unique . 
'.json -keys cfgdatadir -vals $(dirname $(readlink -f $(which vtfp.pl)))/../data/vtlib/ -keys aligner_numthreads -vals `npg_pipeline_job_env_to_threads --num_threads 8 --exclude -2 --divide 3` -keys s2b_mt_val -vals `npg_pipeline_job_env_to_threads --num_threads 8 --exclude -1 --divide 3` -keys bamsormadup_numthreads -vals `npg_pipeline_job_env_to_threads --num_threads 8 --divide 3` -keys br_numthreads_val -vals `npg_pipeline_job_env_to_threads --num_threads 8` $(dirname $(dirname $(readlink -f $(which vtfp.pl))))/data/vtlib/bcl2bam_phix_deplex_wtsi_stage1_template.json && viv.pl -s -x -v 3 -o viv_1234_2.log run_1234_2.json \'', - '3' => 'bash -c \' cd ' . $intensities_dir . '/BAM_basecalls_09-07-2009/p4_stage1_analysis/lane3/log && vtfp.pl -template_path $(dirname $(readlink -f $(which vtfp.pl)))/../data/vtlib -o run_1234_3.json -param_vals ' . $intensities_dir . '/BAM_basecalls_09-07-2009/p4_stage1_analysis/lane3/param_files/1234_3_p4s1_pv_in.json -export_param_vals 1234_3_p4s1_pv_out_' . $unique . '.json -keys cfgdatadir -vals $(dirname $(readlink -f $(which vtfp.pl)))/../data/vtlib/ -keys aligner_numthreads -vals `npg_pipeline_job_env_to_threads --num_threads 8 --exclude -2 --divide 3` -keys s2b_mt_val -vals `npg_pipeline_job_env_to_threads --num_threads 8 --exclude -1 --divide 3` -keys bamsormadup_numthreads -vals `npg_pipeline_job_env_to_threads --num_threads 8 --divide 3` -keys br_numthreads_val -vals `npg_pipeline_job_env_to_threads --num_threads 8` $(dirname $(dirname $(readlink -f $(which vtfp.pl))))/data/vtlib/bcl2bam_phix_deplex_wtsi_stage1_template.json && viv.pl -s -x -v 3 -o viv_1234_3.log run_1234_3.json \'', - '4' => 'bash -c \' cd ' . $intensities_dir . '/BAM_basecalls_09-07-2009/p4_stage1_analysis/lane4/log && vtfp.pl -template_path $(dirname $(readlink -f $(which vtfp.pl)))/../data/vtlib -o run_1234_4.json -param_vals ' . $intensities_dir . 
'/BAM_basecalls_09-07-2009/p4_stage1_analysis/lane4/param_files/1234_4_p4s1_pv_in.json -export_param_vals 1234_4_p4s1_pv_out_' . $unique . '.json -keys cfgdatadir -vals $(dirname $(readlink -f $(which vtfp.pl)))/../data/vtlib/ -keys aligner_numthreads -vals `npg_pipeline_job_env_to_threads --num_threads 8 --exclude -2 --divide 3` -keys s2b_mt_val -vals `npg_pipeline_job_env_to_threads --num_threads 8 --exclude -1 --divide 3` -keys bamsormadup_numthreads -vals `npg_pipeline_job_env_to_threads --num_threads 8 --divide 3` -keys br_numthreads_val -vals `npg_pipeline_job_env_to_threads --num_threads 8` $(dirname $(dirname $(readlink -f $(which vtfp.pl))))/data/vtlib/bcl2bam_phix_deplex_wtsi_stage1_template.json && viv.pl -s -x -v 3 -o viv_1234_4.log run_1234_4.json \'', - '5' => 'bash -c \' cd ' . $intensities_dir . '/BAM_basecalls_09-07-2009/p4_stage1_analysis/lane5/log && vtfp.pl -template_path $(dirname $(readlink -f $(which vtfp.pl)))/../data/vtlib -o run_1234_5.json -param_vals ' . $intensities_dir . '/BAM_basecalls_09-07-2009/p4_stage1_analysis/lane5/param_files/1234_5_p4s1_pv_in.json -export_param_vals 1234_5_p4s1_pv_out_' . $unique . '.json -keys cfgdatadir -vals $(dirname $(readlink -f $(which vtfp.pl)))/../data/vtlib/ -keys aligner_numthreads -vals `npg_pipeline_job_env_to_threads --num_threads 8 --exclude -2 --divide 3` -keys s2b_mt_val -vals `npg_pipeline_job_env_to_threads --num_threads 8 --exclude -1 --divide 3` -keys bamsormadup_numthreads -vals `npg_pipeline_job_env_to_threads --num_threads 8 --divide 3` -keys br_numthreads_val -vals `npg_pipeline_job_env_to_threads --num_threads 8` $(dirname $(dirname $(readlink -f $(which vtfp.pl))))/data/vtlib/bcl2bam_phix_deplex_wtsi_stage1_template.json && viv.pl -s -x -v 3 -o viv_1234_5.log run_1234_5.json \'', - '6' => 'bash -c \' cd ' . $intensities_dir . 
'/BAM_basecalls_09-07-2009/p4_stage1_analysis/lane6/log && vtfp.pl -template_path $(dirname $(readlink -f $(which vtfp.pl)))/../data/vtlib -o run_1234_6.json -param_vals ' . $intensities_dir . '/BAM_basecalls_09-07-2009/p4_stage1_analysis/lane6/param_files/1234_6_p4s1_pv_in.json -export_param_vals 1234_6_p4s1_pv_out_' . $unique . '.json -keys cfgdatadir -vals $(dirname $(readlink -f $(which vtfp.pl)))/../data/vtlib/ -keys aligner_numthreads -vals `npg_pipeline_job_env_to_threads --num_threads 8 --exclude -2 --divide 3` -keys s2b_mt_val -vals `npg_pipeline_job_env_to_threads --num_threads 8 --exclude -1 --divide 3` -keys bamsormadup_numthreads -vals `npg_pipeline_job_env_to_threads --num_threads 8 --divide 3` -keys br_numthreads_val -vals `npg_pipeline_job_env_to_threads --num_threads 8` $(dirname $(dirname $(readlink -f $(which vtfp.pl))))/data/vtlib/bcl2bam_phix_deplex_wtsi_stage1_template.json && viv.pl -s -x -v 3 -o viv_1234_6.log run_1234_6.json \'', - '7' => 'bash -c \' cd ' . $intensities_dir . '/BAM_basecalls_09-07-2009/p4_stage1_analysis/lane7/log && vtfp.pl -template_path $(dirname $(readlink -f $(which vtfp.pl)))/../data/vtlib -o run_1234_7.json -param_vals ' . $intensities_dir . '/BAM_basecalls_09-07-2009/p4_stage1_analysis/lane7/param_files/1234_7_p4s1_pv_in.json -export_param_vals 1234_7_p4s1_pv_out_' . $unique . 
'.json -keys cfgdatadir -vals $(dirname $(readlink -f $(which vtfp.pl)))/../data/vtlib/ -keys aligner_numthreads -vals `npg_pipeline_job_env_to_threads --num_threads 8 --exclude -2 --divide 3` -keys s2b_mt_val -vals `npg_pipeline_job_env_to_threads --num_threads 8 --exclude -1 --divide 3` -keys bamsormadup_numthreads -vals `npg_pipeline_job_env_to_threads --num_threads 8 --divide 3` -keys br_numthreads_val -vals `npg_pipeline_job_env_to_threads --num_threads 8` $(dirname $(dirname $(readlink -f $(which vtfp.pl))))/data/vtlib/bcl2bam_phix_deplex_wtsi_stage1_template.json && viv.pl -s -x -v 3 -o viv_1234_7.log run_1234_7.json \'', - '8' => 'bash -c \' cd ' . $intensities_dir . '/BAM_basecalls_09-07-2009/p4_stage1_analysis/lane8/log && vtfp.pl -template_path $(dirname $(readlink -f $(which vtfp.pl)))/../data/vtlib -o run_1234_8.json -param_vals ' . $intensities_dir . '/BAM_basecalls_09-07-2009/p4_stage1_analysis/lane8/param_files/1234_8_p4s1_pv_in.json -export_param_vals 1234_8_p4s1_pv_out_' . $unique . '.json -keys cfgdatadir -vals $(dirname $(readlink -f $(which vtfp.pl)))/../data/vtlib/ -keys aligner_numthreads -vals `npg_pipeline_job_env_to_threads --num_threads 8 --exclude -2 --divide 3` -keys s2b_mt_val -vals `npg_pipeline_job_env_to_threads --num_threads 8 --exclude -1 --divide 3` -keys bamsormadup_numthreads -vals `npg_pipeline_job_env_to_threads --num_threads 8 --divide 3` -keys br_numthreads_val -vals `npg_pipeline_job_env_to_threads --num_threads 8` $(dirname $(dirname $(readlink -f $(which vtfp.pl))))/data/vtlib/bcl2bam_phix_deplex_wtsi_stage1_template.json && viv.pl -s -x -v 3 -o viv_1234_8.log run_1234_8.json \'', - }; - - foreach my $d (@{$da}) { - my $p = $d->composition()->get_component(0)->position(); - is ($d->command, $expected->{$p}, "command correct for lane $p"); - } - - my $pfname = $intensities_dir . 
q[/BAM_basecalls_09-07-2009/p4_stage1_analysis/lane1/param_files/1234_1_p4s1_pv_in.json]; + my $p4stage1_dir = "$bam_basecall_path/p4_stage1_analysis"; + my $unique = $bam_generator->_job_id(); + my $expected = q(bash -c ' cd ) . $p4stage1_dir . '/lane1/log && vtfp.pl -template_path ' . + '$(dirname $(readlink -f $(which vtfp.pl)))/../data/vtlib ' . + "-o run_${id_run}_1.json -param_vals " . $p4stage1_dir . + "/lane1/param_files/${id_run}_1_p4s1_pv_in.json -export_param_vals ${id_run}_1_p4s1_pv_out_${unique}.json " . + '-keys cfgdatadir -vals $(dirname $(readlink -f $(which vtfp.pl)))/../data/vtlib/ ' . + '-keys aligner_numthreads -vals `npg_pipeline_job_env_to_threads --num_threads 8 --exclude -2 --divide 3` ' . + '-keys s2b_mt_val -vals `npg_pipeline_job_env_to_threads --num_threads 8 --exclude -1 --divide 3` ' . + '-keys bamsormadup_numthreads -vals `npg_pipeline_job_env_to_threads --num_threads 8 --divide 3` ' . + '-keys br_numthreads_val -vals `npg_pipeline_job_env_to_threads --num_threads 8` ' . + '$(dirname $(dirname $(readlink -f $(which vtfp.pl))))/data/vtlib/bcl2bam_phix_deplex_wtsi_stage1_template.json ' . + "&& viv.pl -s -x -v 3 -o viv_${id_run}_1.log run_${id_run}_1.json " . q('); + + is ($da->[0]->command, $expected, 'command for lane 1'); + + my $pfname = $p4stage1_dir . "/lane1/param_files/${id_run}_1_p4s1_pv_in.json"; ok (-e $pfname, 'params file exists'); my $h = from_json(slurp($pfname)); - my $no_cal_path = $intensities_dir . '/BAM_basecalls_09-07-2009/no_cal'; - $expected = { 'assign' => [ { 'i2b_thread_count' => 8, - 'seqchksum_file' => $intensities_dir . '/BAM_basecalls_09-07-2009/1234_1.post_i2b.seqchksum', - 'scramble_reference_fasta' => $dir . '/srpipe_references/references/PhiX/default/all/fasta/phix_unsnipped_short_no_N.fa', - 'i2b_rg' => '1234_1', - 'i2b_pu' => '123456_IL2_1234_1', + 'seqchksum_file' => $bam_basecall_path . "/${id_run}_1.post_i2b.seqchksum", + 'scramble_reference_fasta' => $repos_root . 
'/references/PhiX/default/all/fasta/phix_unsnipped_short_no_N.fa', + 'i2b_rg' => "${id_run}_1", + 'i2b_pu' => "${rf_name}_1", 'tileviz_dir' => $no_cal_path . '/archive/lane1/tileviz', - 'reference_phix' => $dir . "/srpipe_references/references/PhiX/default/all/bwa0_6/phix_unsnipped_short_no_N.fa", - 'unfiltered_cram_file' => $no_cal_path . '/1234_1.unfiltered.cram', + 'reference_phix' => $repos_root . "/references/PhiX/default/all/bwa0_6/phix_unsnipped_short_no_N.fa", + 'unfiltered_cram_file' => $no_cal_path . "/${id_run}_1.unfiltered.cram", 'qc_check_qc_out_dir' => $no_cal_path . '/archive/lane1/qc', 'i2b_lane' => '1', 'bwa_executable' => 'bwa0_6', - 'filtered_bam' => $no_cal_path . '/1234_1.bam', + 'filtered_bam' => "${no_cal_path}/${id_run}_1.bam", 'samtools_executable' => 'samtools', 'i2b_library_name' => '51021', 'outdatadir' => $no_cal_path, 'subsetsubpath' => $no_cal_path . '/archive/lane1/.npg_cache_10000', - 'i2b_run_path' => $dir . q[/nfs/sf45/IL2/analysis/123456_IL2_1234], + 'i2b_run_path' => $runfolder, 'teepot_tempdir' => '.', 'split_prefix' => $no_cal_path, 'i2b_intensity_dir' => $intensities_dir, 'i2b_sample_aliases' => 'SRS000147', 'phix_alignment_method' => 'bwa_aln_se', - 'md5filename' => $no_cal_path . '/1234_1.bam.md5', + 'md5filename' => "${no_cal_path}/${id_run}_1.bam.md5", 'teepot_mval' => '2G', - 'i2b_runfolder' => '123456_IL2_1234', + 'i2b_runfolder' => $rf_name, 'i2b_study_name' => '"SRP000031: 1000Genomes Project Pilot 1"', 'i2b_basecalls_dir' => $intensities_dir . '/BaseCalls', 'teepot_wval' => '500', - 'qc_check_qc_in_dir' => $intensities_dir . 
'/BAM_basecalls_09-07-2009', - 'qc_check_id_run' => '1234', + 'qc_check_qc_in_dir' => $bam_basecall_path, + 'qc_check_id_run' => $id_run, 'cluster_count' => '500077065', - 'seed_frac' => '1234.00002000', + 'seed_frac' => "${id_run}.00002000", 'split_threads_val' => 4, 'aln_filter_value' => '0x900', 's1_se_pe' => 'se', 's1_output_format' => 'cram', - 'rpt_list' => '1234:1', + 'rpt_list' => "${id_run}:1", 'lane_archive_path' => $no_cal_path . '/archive/lane1', }, ], @@ -187,118 +187,96 @@ subtest 'check_save_arguments' => sub { }; is_deeply($h, $expected, 'correct json file content (for p4 stage1 params file)'); - }; -# check_save_arguments_minimap2 test duplicates check_save_arguments, but forces phix_aligment_method to minimap2 -$bam_generator = npg_pipeline::function::p4_stage1_analysis->new( - run_folder => q{123456_IL2_1234}, +subtest 'check_save_arguments_minimap2' => sub { + plan tests => 3; + + my $id_run = 24347; + my $rf_name = '171114_MS6_24347_A_MS5534842-300V2'; + my $run_info = _create_runfolder($rf_name); + my $runfolder = $run_info->{'runfolder_path'}; + my $bam_basecall_path = $run_info->{'analysis_path'}; + my $no_cal_path = "${bam_basecall_path}/no_cal"; + my $intensities_dir = $run_info->{'intensity_path'}; + my $p4stage1_dir = "${bam_basecall_path}/p4_stage1_analysis"; + + local $ENV{NPG_CACHED_SAMPLESHEET_FILE} = 't/data/p4_stage1_analysis/samplesheet.csv'; + + # check_save_arguments_minimap2 test duplicates check_save_arguments, + # but forces phix_aligment_method to minimap2 + my $bam_generator = npg_pipeline::function::p4_stage1_analysis->new( + run_folder => $rf_name, repository => $repos_root, - runfolder_path => $util->analysis_runfolder_path(), - timestamp => q{20090709-123456}, + runfolder_path => $runfolder, verbose => 0, - id_run => 1234, - bam_basecall_path => $util->standard_bam_basecall_path(), + id_run => $id_run, + bam_basecall_path => $bam_basecall_path, p4s1_phix_alignment_method => q{minimap2}, resource => $default ); -subtest 
'check_save_arguments_minimap2' => sub { - plan tests => 29; - - my $bbp = $bam_generator->bam_basecall_path; my $unique = $bam_generator->_job_id(); + my $expected = q(bash -c ' cd ) . $p4stage1_dir . '/lane1/log && vtfp.pl ' . + '-template_path $(dirname $(readlink -f $(which vtfp.pl)))/../data/vtlib ' . + "-o run_${id_run}_1.json -param_vals $p4stage1_dir/lane1/param_files/${id_run}_1_p4s1_pv_in.json " . + "-export_param_vals ${id_run}_1_p4s1_pv_out_${unique}.json " . + '-keys cfgdatadir -vals $(dirname $(readlink -f $(which vtfp.pl)))/../data/vtlib/ ' . + '-keys aligner_numthreads -vals `npg_pipeline_job_env_to_threads --num_threads 8 --exclude -2 --divide 3` ' . + '-keys s2b_mt_val -vals `npg_pipeline_job_env_to_threads --num_threads 8 --exclude -1 --divide 3` ' . + '-keys bamsormadup_numthreads -vals `npg_pipeline_job_env_to_threads --num_threads 8 --divide 3` ' . + '-keys br_numthreads_val -vals `npg_pipeline_job_env_to_threads --num_threads 8` ' . + '$(dirname $(dirname $(readlink -f $(which vtfp.pl))))/data/vtlib/bcl2bam_phix_deplex_wtsi_stage1_template.json ' . + "&& viv.pl -s -x -v 3 -o viv_${id_run}_1.log run_${id_run}_1.json" . 
q( '); my $da = $bam_generator->generate(); - ok ($da && @{$da}==8, 'eight definitions returned'); - my $d = $da->[0]; - isa_ok ($d, 'npg_pipeline::function::definition'); - is ($d->created_by, 'npg_pipeline::function::p4_stage1_analysis', 'created by'); - is ($d->created_on, q{20090709-123456}, 'created on'); - is ($d->identifier, 1234, 'identifier'); - ok (!$d->excluded, 'step is not excluded'); - is ($d->queue, 'p4stage1', 'special queue'); - is ($d->job_name, 'p4_stage1_analysis_1234_20090709-123456', 'job name'); - is ($d->fs_slots_num, 4, '4 sf slots'); - is ($d->num_hosts, 1, 'one host'); - is_deeply ($d->num_cpus, [8], 'num cpus as an array'); - is ($d->memory, 20000, 'memory'); - is ($d->command_preexec, - "npg_pipeline_preexec_references --repository $repos_root", - 'preexec command'); - ok ($d->has_composition, 'composition object is set'); - my $composition = $d->composition; - isa_ok ($composition, 'npg_tracking::glossary::composition'); - is ($composition->num_components, 1, 'one component'); - my $component = $composition->get_component(0); - is ($component->id_run, 1234, 'run id correct'); - is ($component->position, 1, 'position correct'); - ok (!defined $component->tag_index, 'tag index undefined'); + is ($da->[0]->command, $expected, "command correct for lane 1"); - my $intensities_dir = $dir . '/nfs/sf45/IL2/analysis/123456_IL2_1234/Data/Intensities'; - my $expected = { - '1' => 'bash -c \' cd ' . $intensities_dir . '/BAM_basecalls_09-07-2009/p4_stage1_analysis/lane1/log && vtfp.pl -template_path $(dirname $(readlink -f $(which vtfp.pl)))/../data/vtlib -o run_1234_1.json -param_vals ' . $intensities_dir . '/BAM_basecalls_09-07-2009/p4_stage1_analysis/lane1/param_files/1234_1_p4s1_pv_in.json -export_param_vals 1234_1_p4s1_pv_out_' . $unique . 
'.json -keys cfgdatadir -vals $(dirname $(readlink -f $(which vtfp.pl)))/../data/vtlib/ -keys aligner_numthreads -vals `npg_pipeline_job_env_to_threads --num_threads 8 --exclude -2 --divide 3` -keys s2b_mt_val -vals `npg_pipeline_job_env_to_threads --num_threads 8 --exclude -1 --divide 3` -keys bamsormadup_numthreads -vals `npg_pipeline_job_env_to_threads --num_threads 8 --divide 3` -keys br_numthreads_val -vals `npg_pipeline_job_env_to_threads --num_threads 8` $(dirname $(dirname $(readlink -f $(which vtfp.pl))))/data/vtlib/bcl2bam_phix_deplex_wtsi_stage1_template.json && viv.pl -s -x -v 3 -o viv_1234_1.log run_1234_1.json \'', - '2' => 'bash -c \' cd ' . $intensities_dir . '/BAM_basecalls_09-07-2009/p4_stage1_analysis/lane2/log && vtfp.pl -template_path $(dirname $(readlink -f $(which vtfp.pl)))/../data/vtlib -o run_1234_2.json -param_vals ' . $intensities_dir . '/BAM_basecalls_09-07-2009/p4_stage1_analysis/lane2/param_files/1234_2_p4s1_pv_in.json -export_param_vals 1234_2_p4s1_pv_out_' . $unique . '.json -keys cfgdatadir -vals $(dirname $(readlink -f $(which vtfp.pl)))/../data/vtlib/ -keys aligner_numthreads -vals `npg_pipeline_job_env_to_threads --num_threads 8 --exclude -2 --divide 3` -keys s2b_mt_val -vals `npg_pipeline_job_env_to_threads --num_threads 8 --exclude -1 --divide 3` -keys bamsormadup_numthreads -vals `npg_pipeline_job_env_to_threads --num_threads 8 --divide 3` -keys br_numthreads_val -vals `npg_pipeline_job_env_to_threads --num_threads 8` $(dirname $(dirname $(readlink -f $(which vtfp.pl))))/data/vtlib/bcl2bam_phix_deplex_wtsi_stage1_template.json && viv.pl -s -x -v 3 -o viv_1234_2.log run_1234_2.json \'', - '3' => 'bash -c \' cd ' . $intensities_dir . '/BAM_basecalls_09-07-2009/p4_stage1_analysis/lane3/log && vtfp.pl -template_path $(dirname $(readlink -f $(which vtfp.pl)))/../data/vtlib -o run_1234_3.json -param_vals ' . $intensities_dir . 
'/BAM_basecalls_09-07-2009/p4_stage1_analysis/lane3/param_files/1234_3_p4s1_pv_in.json -export_param_vals 1234_3_p4s1_pv_out_' . $unique . '.json -keys cfgdatadir -vals $(dirname $(readlink -f $(which vtfp.pl)))/../data/vtlib/ -keys aligner_numthreads -vals `npg_pipeline_job_env_to_threads --num_threads 8 --exclude -2 --divide 3` -keys s2b_mt_val -vals `npg_pipeline_job_env_to_threads --num_threads 8 --exclude -1 --divide 3` -keys bamsormadup_numthreads -vals `npg_pipeline_job_env_to_threads --num_threads 8 --divide 3` -keys br_numthreads_val -vals `npg_pipeline_job_env_to_threads --num_threads 8` $(dirname $(dirname $(readlink -f $(which vtfp.pl))))/data/vtlib/bcl2bam_phix_deplex_wtsi_stage1_template.json && viv.pl -s -x -v 3 -o viv_1234_3.log run_1234_3.json \'', - '4' => 'bash -c \' cd ' . $intensities_dir . '/BAM_basecalls_09-07-2009/p4_stage1_analysis/lane4/log && vtfp.pl -template_path $(dirname $(readlink -f $(which vtfp.pl)))/../data/vtlib -o run_1234_4.json -param_vals ' . $intensities_dir . '/BAM_basecalls_09-07-2009/p4_stage1_analysis/lane4/param_files/1234_4_p4s1_pv_in.json -export_param_vals 1234_4_p4s1_pv_out_' . $unique . '.json -keys cfgdatadir -vals $(dirname $(readlink -f $(which vtfp.pl)))/../data/vtlib/ -keys aligner_numthreads -vals `npg_pipeline_job_env_to_threads --num_threads 8 --exclude -2 --divide 3` -keys s2b_mt_val -vals `npg_pipeline_job_env_to_threads --num_threads 8 --exclude -1 --divide 3` -keys bamsormadup_numthreads -vals `npg_pipeline_job_env_to_threads --num_threads 8 --divide 3` -keys br_numthreads_val -vals `npg_pipeline_job_env_to_threads --num_threads 8` $(dirname $(dirname $(readlink -f $(which vtfp.pl))))/data/vtlib/bcl2bam_phix_deplex_wtsi_stage1_template.json && viv.pl -s -x -v 3 -o viv_1234_4.log run_1234_4.json \'', - '5' => 'bash -c \' cd ' . $intensities_dir . 
'/BAM_basecalls_09-07-2009/p4_stage1_analysis/lane5/log && vtfp.pl -template_path $(dirname $(readlink -f $(which vtfp.pl)))/../data/vtlib -o run_1234_5.json -param_vals ' . $intensities_dir . '/BAM_basecalls_09-07-2009/p4_stage1_analysis/lane5/param_files/1234_5_p4s1_pv_in.json -export_param_vals 1234_5_p4s1_pv_out_' . $unique . '.json -keys cfgdatadir -vals $(dirname $(readlink -f $(which vtfp.pl)))/../data/vtlib/ -keys aligner_numthreads -vals `npg_pipeline_job_env_to_threads --num_threads 8 --exclude -2 --divide 3` -keys s2b_mt_val -vals `npg_pipeline_job_env_to_threads --num_threads 8 --exclude -1 --divide 3` -keys bamsormadup_numthreads -vals `npg_pipeline_job_env_to_threads --num_threads 8 --divide 3` -keys br_numthreads_val -vals `npg_pipeline_job_env_to_threads --num_threads 8` $(dirname $(dirname $(readlink -f $(which vtfp.pl))))/data/vtlib/bcl2bam_phix_deplex_wtsi_stage1_template.json && viv.pl -s -x -v 3 -o viv_1234_5.log run_1234_5.json \'', - '6' => 'bash -c \' cd ' . $intensities_dir . '/BAM_basecalls_09-07-2009/p4_stage1_analysis/lane6/log && vtfp.pl -template_path $(dirname $(readlink -f $(which vtfp.pl)))/../data/vtlib -o run_1234_6.json -param_vals ' . $intensities_dir . '/BAM_basecalls_09-07-2009/p4_stage1_analysis/lane6/param_files/1234_6_p4s1_pv_in.json -export_param_vals 1234_6_p4s1_pv_out_' . $unique . 
'.json -keys cfgdatadir -vals $(dirname $(readlink -f $(which vtfp.pl)))/../data/vtlib/ -keys aligner_numthreads -vals `npg_pipeline_job_env_to_threads --num_threads 8 --exclude -2 --divide 3` -keys s2b_mt_val -vals `npg_pipeline_job_env_to_threads --num_threads 8 --exclude -1 --divide 3` -keys bamsormadup_numthreads -vals `npg_pipeline_job_env_to_threads --num_threads 8 --divide 3` -keys br_numthreads_val -vals `npg_pipeline_job_env_to_threads --num_threads 8` $(dirname $(dirname $(readlink -f $(which vtfp.pl))))/data/vtlib/bcl2bam_phix_deplex_wtsi_stage1_template.json && viv.pl -s -x -v 3 -o viv_1234_6.log run_1234_6.json \'', - '7' => 'bash -c \' cd ' . $intensities_dir . '/BAM_basecalls_09-07-2009/p4_stage1_analysis/lane7/log && vtfp.pl -template_path $(dirname $(readlink -f $(which vtfp.pl)))/../data/vtlib -o run_1234_7.json -param_vals ' . $intensities_dir . '/BAM_basecalls_09-07-2009/p4_stage1_analysis/lane7/param_files/1234_7_p4s1_pv_in.json -export_param_vals 1234_7_p4s1_pv_out_' . $unique . '.json -keys cfgdatadir -vals $(dirname $(readlink -f $(which vtfp.pl)))/../data/vtlib/ -keys aligner_numthreads -vals `npg_pipeline_job_env_to_threads --num_threads 8 --exclude -2 --divide 3` -keys s2b_mt_val -vals `npg_pipeline_job_env_to_threads --num_threads 8 --exclude -1 --divide 3` -keys bamsormadup_numthreads -vals `npg_pipeline_job_env_to_threads --num_threads 8 --divide 3` -keys br_numthreads_val -vals `npg_pipeline_job_env_to_threads --num_threads 8` $(dirname $(dirname $(readlink -f $(which vtfp.pl))))/data/vtlib/bcl2bam_phix_deplex_wtsi_stage1_template.json && viv.pl -s -x -v 3 -o viv_1234_7.log run_1234_7.json \'', - '8' => 'bash -c \' cd ' . $intensities_dir . '/BAM_basecalls_09-07-2009/p4_stage1_analysis/lane8/log && vtfp.pl -template_path $(dirname $(readlink -f $(which vtfp.pl)))/../data/vtlib -o run_1234_8.json -param_vals ' . $intensities_dir . 
'/BAM_basecalls_09-07-2009/p4_stage1_analysis/lane8/param_files/1234_8_p4s1_pv_in.json -export_param_vals 1234_8_p4s1_pv_out_' . $unique . '.json -keys cfgdatadir -vals $(dirname $(readlink -f $(which vtfp.pl)))/../data/vtlib/ -keys aligner_numthreads -vals `npg_pipeline_job_env_to_threads --num_threads 8 --exclude -2 --divide 3` -keys s2b_mt_val -vals `npg_pipeline_job_env_to_threads --num_threads 8 --exclude -1 --divide 3` -keys bamsormadup_numthreads -vals `npg_pipeline_job_env_to_threads --num_threads 8 --divide 3` -keys br_numthreads_val -vals `npg_pipeline_job_env_to_threads --num_threads 8` $(dirname $(dirname $(readlink -f $(which vtfp.pl))))/data/vtlib/bcl2bam_phix_deplex_wtsi_stage1_template.json && viv.pl -s -x -v 3 -o viv_1234_8.log run_1234_8.json \'', - }; - - foreach my $d (@{$da}) { - my $p = $d->composition()->get_component(0)->position(); - is ($d->command, $expected->{$p}, "command correct for lane $p"); - } - - my $pfname = $bbp . q[/p4_stage1_analysis/lane1/param_files/1234_1_p4s1_pv_in.json]; + my $pfname = "${p4stage1_dir}/lane1/param_files/${id_run}_1_p4s1_pv_in.json"; ok (-e $pfname, 'params file exists'); my $h = from_json(slurp($pfname)); - my $no_cal_path = $intensities_dir . '/BAM_basecalls_09-07-2009/no_cal'; - $expected = { 'assign' => [ { 'i2b_thread_count' => 8, - 'seqchksum_file' => $intensities_dir . '/BAM_basecalls_09-07-2009/1234_1.post_i2b.seqchksum', - 'scramble_reference_fasta' => $dir . '/srpipe_references/references/PhiX/default/all/fasta/phix_unsnipped_short_no_N.fa', - 'i2b_rg' => '1234_1', - 'i2b_pu' => '123456_IL2_1234_1', + 'seqchksum_file' => "$bam_basecall_path/${id_run}_1.post_i2b.seqchksum", + 'scramble_reference_fasta' => $repos_root . '/references/PhiX/default/all/fasta/phix_unsnipped_short_no_N.fa', + 'i2b_rg' => "${id_run}_1", + 'i2b_pu' => "${rf_name}_1", 'tileviz_dir' => $no_cal_path . '/archive/lane1/tileviz', - 'reference_phix' => $dir . 
'/srpipe_references/references/PhiX/default/all/minimap2/phix_unsnipped_short_no_N.fa.mmi', - 'unfiltered_cram_file' => $no_cal_path . '/1234_1.unfiltered.cram', + 'reference_phix' => $repos_root . '/references/PhiX/default/all/minimap2/phix_unsnipped_short_no_N.fa.mmi', + 'unfiltered_cram_file' => "${no_cal_path}/${id_run}_1.unfiltered.cram", 'qc_check_qc_out_dir' => $no_cal_path . '/archive/lane1/qc', 'i2b_lane' => '1', 'bwa_executable' => 'bwa0_6', - 'filtered_bam' => $no_cal_path . '/1234_1.bam', + 'filtered_bam' => "${no_cal_path}/${id_run}_1.bam", 'samtools_executable' => 'samtools', 'i2b_library_name' => '51021', 'outdatadir' => $no_cal_path, 'subsetsubpath' => $no_cal_path . '/archive/lane1/.npg_cache_10000', - 'i2b_run_path' => $dir . q[/nfs/sf45/IL2/analysis/123456_IL2_1234], + 'i2b_run_path' => $runfolder, 'teepot_tempdir' => '.', 'split_prefix' => $no_cal_path, 'i2b_intensity_dir' => $intensities_dir, 'i2b_sample_aliases' => 'SRS000147', 'phix_alignment_method' => 'minimap2', - 'md5filename' => $no_cal_path . '/1234_1.bam.md5', + 'md5filename' => "${no_cal_path}/${id_run}_1.bam.md5", 'teepot_mval' => '2G', - 'i2b_runfolder' => '123456_IL2_1234', + 'i2b_runfolder' => $rf_name, 'i2b_study_name' => '"SRP000031: 1000Genomes Project Pilot 1"', 'i2b_basecalls_dir' => $intensities_dir . '/BaseCalls', 'teepot_wval' => '500', - 'qc_check_qc_in_dir' => $intensities_dir . '/BAM_basecalls_09-07-2009', - 'qc_check_id_run' => '1234', + 'qc_check_qc_in_dir' => $bam_basecall_path, + 'qc_check_id_run' => $id_run, 'cluster_count' => '500077065', - 'seed_frac' => '1234.00002000', + 'seed_frac' => "${id_run}.00002000", 'split_threads_val' => 4, 'aln_filter_value' => '0x900', 's1_se_pe' => 'se', 's1_output_format' => 'cram', 'lane_archive_path' => $no_cal_path . 
'/archive/lane1', - 'rpt_list' => '1234:1', + 'rpt_list' => "${id_run}:1", }, ], 'ops' => { @@ -309,26 +287,21 @@ subtest 'check_save_arguments_minimap2' => sub { }; is_deeply($h, $expected, 'correct json file content (for p4 stage1 params file)'); - }; - -# check_duplex-seq test +}; subtest 'check_duplex-seq' => sub { plan tests => 29; - my $rf_name = '210111_A00513_0447_AHJ55JDSXY'; - my $rfpath = abs_path(getcwd) . qq{/t/data/novaseq/$rf_name}; - my $copy = join q[/], $dir, $rf_name; - dircopy $rfpath, $copy or die 'Failed to copy run folder'; - $rfpath = $copy; - my $id_run = 36062; + my $rf_name = '210111_A00513_0447_AHJ55JDSXY'; + my $rfpath = join q[/], $dir, $rf_name; + dircopy qq{t/data/novaseq/$rf_name}, $rfpath or die 'Failed to copy run folder'; my $bbp = qq{$rfpath/Data/Intensities/BAM_basecalls_20210113-092146}; local $ENV{NPG_CACHED_SAMPLESHEET_FILE} = qq{$bbp/metadata_cache_36062/samplesheet_36062.csv}; - $bam_generator = npg_pipeline::function::p4_stage1_analysis->new( + my $bam_generator = npg_pipeline::function::p4_stage1_analysis->new( run_folder => $rf_name, repository => $repos_root, runfolder_path => $rfpath, @@ -572,7 +545,7 @@ subtest 'check_duplex-seq' => sub { }; is_deeply($h, $expected, 'correct json file content (for p4 stage1 params file)'); - }; +}; 1; diff --git a/t/20-function-seq_alignment.t b/t/20-function-seq_alignment.t index 8cf940f31..3bf528666 100644 --- a/t/20-function-seq_alignment.t +++ b/t/20-function-seq_alignment.t @@ -1,6 +1,6 @@ use strict; use warnings; -use Test::More tests => 20; +use Test::More tests => 21; use Test::Exception; use Test::Deep; use Test::Warn; @@ -13,7 +13,7 @@ use Log::Log4perl qw/:levels/; use JSON; use Cwd; use List::Util qw/first/; -use File::Slurp qw/edit_file_lines/; +use File::Slurp qw/edit_file_lines read_file write_file/; use Moose::Util qw(apply_all_roles); @@ -1502,7 +1502,7 @@ subtest 'miseq_primer_panel_only' => sub { is ($d->command(), $command, 'correct command for MiSeq lane 24135_1 
tag index 1'); }; -subtest 'product_release_tests' => sub { +subtest 'product_release_tests and mark duplicate method' => sub { plan tests => 269; my %test_runs = ( @@ -1572,6 +1572,47 @@ subtest 'product_release_tests' => sub { } }; +subtest 'mark duplicate method for a product with multiple studies' => sub { + plan tests => 3; + + my $runfolder_path = join q[/], $dir, q[markdups_test]; + mkdir $runfolder_path; + copy('t/data/miseq/46761_RunInfo.xml', "$runfolder_path/RunInfo.xml") or die 'Copy failed'; + copy('t/data/miseq/46761_runParameters.xml', "$runfolder_path/runParameters.xml") + or die 'Copy failed'; + my @lines = read_file(q[t/data/miseq/samplesheet_46761_bwa_mem2.csv]); + my @data = (); + # Change study ID for the first tag. + foreach my $value ((split q[,], $lines[2])) { + $value =~ s/5556/5557/; + push @data, $value; + } + $lines[2] = join q[,], @data; + my $samplesheet = "$runfolder_path/samplesheet_46761.csv"; + write_file($samplesheet, @lines); + local $ENV{NPG_CACHED_SAMPLESHEET_FILE} = $samplesheet; + + my $ms_gen = npg_pipeline::function::seq_alignment->new( + id_run => 46761, + runfolder_path => $runfolder_path, + conf_path => 't/data/release/config/seq_alignment', + resource => $default, + npg_tracking_schema => undef + ); + my $product; + foreach my $p (@{$ms_gen->products->{data_products}}) { + if ($p->rpt_list eq '46761:1:0') { + $product = $p; + last; + } + } + + is ($product->lims->study_ids, 2, 'tag zero product has two study ids'); + my $method; + lives_ok { $method = $ms_gen->markdup_method($product) } + 'no error calling markdup_method'; + is ($method, 'biobambam', 'correct method'); +}; # test overrides of bwa_mem with bwa-mem2 # 1) on sample sheet entry without [bwa_mem2] specified in reference name # 2) on sample sheet entry without [bwa_mem2] specified in reference name, but setting bwa_mem2 attribute diff --git a/t/20-function-seqchksum_comparator.t b/t/20-function-seqchksum_comparator.t index 645c4d9a6..7ff617a81 100644 --- 
a/t/20-function-seqchksum_comparator.t +++ b/t/20-function-seqchksum_comparator.t @@ -1,39 +1,57 @@ use strict; use warnings; -use Test::More tests => 17; +use Test::More tests => 15; use Test::Exception; use Log::Log4perl qw(:levels); +use File::Path qw(make_path); +use File::Copy; + use t::util; my $util = t::util->new(); my $tmp_dir = $util->temp_directory(); -local $ENV{NPG_CACHED_SAMPLESHEET_FILE} = q[t/data/samplesheet_1234.csv]; -# if REF_PATH is not set, force using ref defined in the header -local $ENV{REF_PATH} = $ENV{REF_PATH} ? $ENV{REF_PATH} : 'DUMMY'; - Log::Log4perl->easy_init({layout => '%d %-5p %c - %m%n', level => $DEBUG, file => join(q[/], $tmp_dir, 'logfile'), utf8 => 1}); -use_ok( q{npg_pipeline::function::seqchksum_comparator} ); +my $test_data_dir_47995 = 't/data/novaseqx/20231017_LH00210_0012_B22FCNFLT3'; + +sub _setup_runfolder_47995 { + my $timestamp = shift; + my @dirs = split q[/], $test_data_dir_47995; + my $rf_name = pop @dirs; + my $rf_info = $util->create_runfolder($tmp_dir, {'runfolder_name' => $rf_name}); + my $rf = $rf_info->{'runfolder_path'}; + for my $file (qw(RunInfo.xml RunParameters.xml)) { + if (copy("$test_data_dir_47995/$file", "$rf/$file") == 0) { + die "Failed to copy $file"; + } + } + my $bam_basecall_path = $rf . "/Data/Intensities/BAM_basecalls_$timestamp"; + my $archive_path = $bam_basecall_path . q{/no_cal/archive}; + make_path($archive_path); + $rf_info->{'bam_basecall_path'} = $bam_basecall_path; + $rf_info->{'archive_path'} = $archive_path; + return $rf_info; +} -$util->create_analysis(); +use_ok( q{npg_pipeline::function::seqchksum_comparator} ); my $timestamp = q{09-07-2009}; -my $analysis_runfolder_path = $util->analysis_runfolder_path(); -my $bam_basecall_path = $analysis_runfolder_path . "/Data/Intensities/BAM_basecalls_$timestamp/"; -my $recalibrated_path = $analysis_runfolder_path. "/Data/Intensities/BAM_basecalls_$timestamp/no_cal"; -my $archive_path = $recalibrated_path . 
q{/archive}; +my $rf_info = _setup_runfolder_47995($timestamp); +my $archive_path = $rf_info->{'archive_path'}; +my $bam_basecall_path = $rf_info->{'bam_basecall_path'}; my %init = ( - run_folder => q{123456_IL2_1234}, - runfolder_path => $analysis_runfolder_path, + runfolder_path => $rf_info->{'runfolder_path'}, archive_path => $archive_path, bam_basecall_path => $bam_basecall_path, - id_run => 1234, + id_run => 47995, is_indexed => 0, + timestamp => $timestamp, + lanes => [1,2], resource => { default => { minimum_cpu => 1, @@ -43,13 +61,11 @@ my %init = ( ); { + local $ENV{NPG_CACHED_SAMPLESHEET_FILE} = "$test_data_dir_47995/samplesheet_47995.csv"; + my $object; lives_ok { - $object = npg_pipeline::function::seqchksum_comparator->new( - %init, - timestamp => $timestamp, - lanes => [1,2], - ); + $object = npg_pipeline::function::seqchksum_comparator->new(%init); } q{object ok}; isa_ok( $object, q{npg_pipeline::function::seqchksum_comparator}); @@ -60,83 +76,22 @@ my %init = ( is ($d->created_by, q{npg_pipeline::function::seqchksum_comparator}, 'created_by is correct'); is ($d->created_on, $object->timestamp, 'created_on is correct'); - is ($d->identifier, 1234, 'identifier is set correctly'); - ok (!$d->has_composition, 'no composition is not set'); - is ($d->job_name, q{seqchksum_comparator_1234_09-07-2009}, + is ($d->identifier, 47995, 'identifier is set correctly'); + ok (!$d->has_composition, 'composition is not set'); + is ($d->job_name, q{seqchksum_comparator_47995_09-07-2009}, 'job_name is correct'); my $rp = $object->recalibrated_path; is ($d->command, - q{npg_pipeline_seqchksum_comparator --id_run=1234 --lanes=1 --lanes=2} . + q{npg_pipeline_seqchksum_comparator --id_run=47995 --lanes=1 --lanes=2} . qq{ --archive_path=$archive_path --bam_basecall_path=$bam_basecall_path} . 
- qq{ --input_fofg_name=$rp/1234_input_fofn.txt}, + qq{ --input_fofg_name=$rp/47995_input_fofn.txt}, 'command is correct'); ok (!$d->excluded, 'step not excluded'); is ($d->queue, 'default', 'default queue'); lives_ok {$d->freeze()} 'definition can be serialized to JSON'; - throws_ok{$object->do_comparison()} qr/Failed to change directory/, + throws_ok{$object->do_comparison()} qr/Failed to run command seqchksum_merge\.pl/, q{Doing a comparison with no files throws an exception}; - -############# -############# -############# -############# -## my $seqchksum_contents1 = <<'END1'; -## ### set count b_seq name_b_seq b_seq_qual b_seq_tags(BC,FI,QT,RT,TC) -## all all 19821774 3a58186f 29528f13 7bf272c0 30e0b9ef -## all pass 19821774 3a58186f 29528f13 7bf272c0 30e0b9ef -## all 0 1 1 1 1 -## pass 0 1 1 1 1 -## 1#0 all 3865560 4aebf9cb 63f4ad67 3d54f814 5c3f971f -## 1#0 pass 3865560 4aebf9cb 63f4ad67 3d54f814 5c3f971f -## 1#2 all 15956214 504ab7d8 28428e9b 643c096e 3cbf1e96 -## 1#2 pass 15956214 504ab7d8 28428e9b 643c096e 3cbf1e96}; -## END1 -## -## system "mkdir -p $archive_path/lane1"; -## system "cp -p t/data/runfolder/archive/lane1/1234_1#15.cram $archive_path/lane1"; -## -## system "mkdir -p $archive_path/lane2"; -## system "cp -p t/data/runfolder/archive/lane1/1234_1#15.cram $archive_path/lane2/1234_2#15.cram"; -## system "cp -p t/data/runfolder/archive/lane1/1234_1#15.seqchksum $archive_path/lane2/1234_2#15.seqchksum"; -## -## open my $seqchksum_fh1, '>', "$bam_basecall_path/1234_1.post_i2b.seqchksum" or die "Cannot open file for writing"; -## print $seqchksum_fh1 $seqchksum_contents1 or die $!; -## close $seqchksum_fh1 or die $!; -## -## SKIP: { -## skip 'no tools', 2 if ((not $ENV{TOOLS_INSTALLED}) and (system(q(which bamseqchksum)) or system(q(which scramble)))); -## TODO: { local $TODO= q(scramble doesn't through an exception when converting an empty bam file to cram it just writes a cram files with a @PG ID:scramble ..
line); -## throws_ok{$object->do_comparison()} qr/Failed to run command bamcat /, q{Doing a comparison with empty bam files throws an exception}; -## } -## -## system "cp -p t/data/seqchksum/sorted.cram $archive_path/lane1/1234_1#15.cram"; -## system "cp -p t/data/seqchksum/sorted.cram $archive_path/lane2/1234_2#15.cram"; -## -## throws_ok { $object->do_comparison() } -## qr/seqchksum for post_i2b and product are different/, -## q{Doing a comparison with different bam files throws an exception}; -## } -############# -############# -############# -############# -} - -{ - my $object = npg_pipeline::function::seqchksum_comparator->new( - %init, - lanes => [1], - ); - my $da = $object->create(); - ok ($da && @{$da} == 1, 'an array with one definitions is returned'); - - $object = npg_pipeline::function::seqchksum_comparator->new( - %init - ); - $da = $object->create(); - # seqchksum_comparator is now a run-level function, so only one definition returned - ok ($da && @{$da} == 1, 'an array with one definition is returned for eight lanes'); } 1; diff --git a/t/20-function-start_stop.t b/t/20-function-start_stop.t index 62a8bbaa3..abe0f5775 100644 --- a/t/20-function-start_stop.t +++ b/t/20-function-start_stop.t @@ -1,14 +1,13 @@ use strict; use warnings; use Test::More tests => 3; -use Test::Exception; -use File::Copy; -use File::Basename; use t::util; my $util = t::util->new(); -my $runfolder_path = $util->analysis_runfolder_path(); +my $temp_dir = join q[/], $util->temp_directory(), 'analysis'; +my $rf_info = $util->create_runfolder($temp_dir); +my $runfolder_path = $rf_info->{'runfolder_path'}; use_ok('npg_pipeline::function::start_stop'); diff --git a/t/20-function-warehouse_archiver.t b/t/20-function-warehouse_archiver.t index 9f34f526b..e605ae622 100644 --- a/t/20-function-warehouse_archiver.t +++ b/t/20-function-warehouse_archiver.t @@ -40,6 +40,8 @@ for my $file (qw(RunInfo.xml RunParameters.xml)) { fmove($source, $target); } +local $ENV{'HOME'} = 't'; + 
use_ok('npg_pipeline::function::warehouse_archiver'); subtest 'warehouse updates' => sub { diff --git a/t/50-npg_pipeline-daemon-archival.t b/t/50-npg_pipeline-daemon-archival.t index d1b64341f..d4b36550a 100644 --- a/t/50-npg_pipeline-daemon-archival.t +++ b/t/50-npg_pipeline-daemon-archival.t @@ -13,6 +13,8 @@ use t::util; my $util = t::util->new(); my $temp_directory = $util->temp_directory(); +local $ENV{'HOME'} = 't'; + Log::Log4perl->easy_init({layout => '%d %-5p %c - %m%n', level => $DEBUG, file => join(q[/], $temp_directory, 'logfile'), diff --git a/t/data/bam_flagstats/1234_1_bam_flagstats.json b/t/data/bam_flagstats/1234_1_bam_flagstats.json deleted file mode 100644 index 649bfbad3..000000000 --- a/t/data/bam_flagstats/1234_1_bam_flagstats.json +++ /dev/null @@ -1 +0,0 @@ -{"paired_mapped_reads":10,"unpaired_read_duplicates":0,"position":"1","library":"170111_A1_kapaoriginal 1671118","histogram":{},"paired_read_duplicates":0,"info":{"Samtools":"0.1.11 (r851)","Picard-tools":"1.36"},"mate_mapped_defferent_chr":0,"unmapped_reads":7,"unpaired_mapped_reads":3,"id_run":"1234","__CLASS__":"npg_qc::autoqc::results::bam_flagstats-9824","proper_mapped_pair":0,"mate_mapped_defferent_chr_5":0,"read_pair_optical_duplicates":0,"human_split":"all"} diff --git a/t/data/bam_flagstats/1234_3_bam_flagstats.json b/t/data/bam_flagstats/1234_3_bam_flagstats.json deleted file mode 100644 index f7818f124..000000000 --- a/t/data/bam_flagstats/1234_3_bam_flagstats.json +++ /dev/null @@ -1 +0,0 @@ -{"paired_mapped_reads":4,"unpaired_read_duplicates":0,"position":"3","library":"170111_A1_kapaoriginal 1671118","histogram":{},"paired_read_duplicates":0,"info":{"Samtools":"0.1.11 (r851)","Picard-tools":"1.36"},"mate_mapped_defferent_chr":0,"unmapped_reads":6,"unpaired_mapped_reads":2,"id_run":"1234","__CLASS__":"npg_qc::autoqc::results::bam_flagstats-9824","proper_mapped_pair":0,"mate_mapped_defferent_chr_5":0,"read_pair_optical_duplicates":0,"human_split":"all"} diff --git 
a/t/data/bam_flagstats/1234_3_phix_bam_flagstats.json b/t/data/bam_flagstats/1234_3_phix_bam_flagstats.json deleted file mode 100644 index 64cf1a781..000000000 --- a/t/data/bam_flagstats/1234_3_phix_bam_flagstats.json +++ /dev/null @@ -1 +0,0 @@ -{"paired_mapped_reads":4,"unpaired_read_duplicates":0,"position":"3","library":"170111_A1_kapaoriginal 1671118","histogram":{},"paired_read_duplicates":0,"info":{"Samtools":"0.1.11 (r851)","Picard-tools":"1.36"},"mate_mapped_defferent_chr":0,"unmapped_reads":2,"unpaired_mapped_reads":4,"id_run":"1234","__CLASS__":"npg_qc::autoqc::results::bam_flagstats-9824","proper_mapped_pair":0,"mate_mapped_defferent_chr_5":0,"read_pair_optical_duplicates":0,"human_split":"phix"} diff --git a/t/data/bam_flagstats/1234_4_bam_flagstats.json b/t/data/bam_flagstats/1234_4_bam_flagstats.json deleted file mode 100644 index cca799d1a..000000000 --- a/t/data/bam_flagstats/1234_4_bam_flagstats.json +++ /dev/null @@ -1 +0,0 @@ -{"paired_mapped_reads":10,"unpaired_read_duplicates":0,"position":"4","library":"170111_A1_kapaoriginal 1671118","histogram":{},"paired_read_duplicates":0,"info":{"Samtools":"0.1.11 (r851)","Picard-tools":"1.36"},"mate_mapped_defferent_chr":0,"unmapped_reads":7,"unpaired_mapped_reads":3,"id_run":"1234","__CLASS__":"npg_qc::autoqc::results::bam_flagstats-9824","proper_mapped_pair":0,"mate_mapped_defferent_chr_5":0,"read_pair_optical_duplicates":0,"human_split":"all"} diff --git a/t/data/bam_flagstats/47995_3_bam_flagstats.json b/t/data/bam_flagstats/47995_3_bam_flagstats.json new file mode 100644 index 000000000..7eba2ac95 --- /dev/null +++ b/t/data/bam_flagstats/47995_3_bam_flagstats.json @@ -0,0 +1,21 @@ +{ + "paired_mapped_reads": 4, + "unpaired_read_duplicates": 0, + "position": "3", + "library": "170111_A1_kapaoriginal 1671118", + "histogram": {}, + "paired_read_duplicates": 0, + "info": { + "Samtools": "0.1.11 (r851)", + "Picard-tools": "1.36" + }, + "mate_mapped_defferent_chr": 0, + "unmapped_reads": 6, + 
"unpaired_mapped_reads": 2, + "id_run": "47995", + "__CLASS__": "npg_qc::autoqc::results::bam_flagstats-9824", + "proper_mapped_pair": 0, + "mate_mapped_defferent_chr_5": 0, + "read_pair_optical_duplicates": 0, + "human_split": "all" +} diff --git a/t/data/bam_flagstats/47995_3_phix_bam_flagstats.json b/t/data/bam_flagstats/47995_3_phix_bam_flagstats.json new file mode 100644 index 000000000..229174a2d --- /dev/null +++ b/t/data/bam_flagstats/47995_3_phix_bam_flagstats.json @@ -0,0 +1,21 @@ +{ + "paired_mapped_reads": 4, + "unpaired_read_duplicates": 0, + "position": "3", + "library": "170111_A1_kapaoriginal 1671118", + "histogram": {}, + "paired_read_duplicates": 0, + "info": { + "Samtools": "0.1.11 (r851)", + "Picard-tools": "1.36" + }, + "mate_mapped_defferent_chr": 0, + "unmapped_reads": 2, + "unpaired_mapped_reads": 4, + "id_run": "47995", + "__CLASS__": "npg_qc::autoqc::results::bam_flagstats-9824", + "proper_mapped_pair": 0, + "mate_mapped_defferent_chr_5": 0, + "read_pair_optical_duplicates": 0, + "human_split": "phix" +} diff --git a/t/data/miseq/24347_RunInfo.xml b/t/data/miseq/24347_RunInfo.xml new file mode 100755 index 000000000..00f9a978c --- /dev/null +++ b/t/data/miseq/24347_RunInfo.xml @@ -0,0 +1,14 @@ + + + + 000000000-BF5NJ + M02069 + 171114 + + + + + + + + diff --git a/t/data/p4_stage1_analysis/1234_samplesheet.csv b/t/data/p4_stage1_analysis/1234_samplesheet.csv deleted file mode 100644 index 6069d6ad8..000000000 --- a/t/data/p4_stage1_analysis/1234_samplesheet.csv +++ /dev/null @@ -1,27 +0,0 @@ -[Header],,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, -Investigator Name,pav,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, -Project Name,1000Genomes-A1-YRI,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, -Experiment Name,1234,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, -Date,2008-08-17T13:18:30,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, -Workflow,LibraryQC,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, -Chemistry,Default,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, 
-,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, -[Reads],,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, -37,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, -37,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, -[Settings],,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, -,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, -[Manifests],,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, -,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, -[Data],,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, -Index,Lane,Sample_ID,Sample_Name,GenomeFolder,bait_name,default_library_type,default_tag_sequence,email_addresses,email_addresses_of_followers,email_addresses_of_managers,email_addresses_of_owners,is_control,is_pool,lane_id,lane_priority,library_name,organism,organism_taxon_id,project_cost_code,project_id,project_name,qc_state,request_id,required_insert_size_range,sample_accession_number,sample_common_name,sample_consent_withdrawn,sample_description,sample_id,sample_name,sample_public_name,sample_reference_genome,spiked_phix_tag_index,study_accession_number,study_alignments_in_bam,study_contains_nonconsented_human,study_contains_nonconsented_xahuman,study_description,study_id,study_name,study_reference_genome,study_separate_y_chromosome_data,study_title,tag_index, -,1,51021,SRS000147,C:\Illumina\MiSeq Reporter\Genomes\WTSI_references\Homo_sapiens\1000Genomes\all\fasta\,,,,jws@sanger.ac.uk rd@sanger.ac.uk thomas.keane@sanger.ac.uk,jws@sanger.ac.uk thomas.keane@sanger.ac.uk,jws@sanger.ac.uk,rd@sanger.ac.uk,0,0,66206,0,NA18907-YRI-1,human,9606,S0706,185,1000Genomes-A1-YRI,pending,2409,from:150 to:200,SRS000147,Homo sapiens,,,766,NA18907-YRI-1,NA18907,,,SRP000031,1,0,0,1000Genomes Project Pilot 1,185,1000Genomes-A1-YRI,,,1000Genomes Project Pilot 1,, -,2,51021,SRS000147,C:\Illumina\MiSeq Reporter\Genomes\WTSI_references\Homo_sapiens\1000Genomes\all\fasta\,,,,jws@sanger.ac.uk rd@sanger.ac.uk thomas.keane@sanger.ac.uk,jws@sanger.ac.uk 
thomas.keane@sanger.ac.uk,jws@sanger.ac.uk,rd@sanger.ac.uk,0,0,66207,0,NA18907-YRI-1,human,9606,S0706,185,1000Genomes-A1-YRI,pending,2410,from:150 to:200,SRS000147,Homo sapiens,,,766,NA18907-YRI-1,NA18907,,,SRP000031,1,0,0,1000Genomes Project Pilot 1,185,1000Genomes-A1-YRI,,,1000Genomes Project Pilot 1,, -,3,51021,SRS000147,C:\Illumina\MiSeq Reporter\Genomes\WTSI_references\Homo_sapiens\1000Genomes\all\fasta\,,,,jws@sanger.ac.uk rd@sanger.ac.uk thomas.keane@sanger.ac.uk,jws@sanger.ac.uk thomas.keane@sanger.ac.uk,jws@sanger.ac.uk,rd@sanger.ac.uk,0,0,66208,0,NA18907-YRI-1,human,9606,S0706,185,1000Genomes-A1-YRI,pending,2411,from:150 to:200,SRS000147,Homo sapiens,,,766,NA18907-YRI-1,NA18907,,,SRP000031,1,0,0,1000Genomes Project Pilot 1,185,1000Genomes-A1-YRI,,,1000Genomes Project Pilot 1,, -,4,79570,phiX_SI_SPRI,C:\Illumina\MiSeq Reporter\Genomes\WTSI_references\PhiX\Sanger-SNPs\all\fasta\,,,,,,,,1,0,80723,0,phiX_SI_SPRI,,,,,,,41944,,,,,,9829,phiX_SI_SPRI,,,,,,0,0,,,,,,,, -,5,51021,SRS000147,C:\Illumina\MiSeq Reporter\Genomes\WTSI_references\Homo_sapiens\1000Genomes\all\fasta\,,,,jws@sanger.ac.uk rd@sanger.ac.uk thomas.keane@sanger.ac.uk,jws@sanger.ac.uk thomas.keane@sanger.ac.uk,jws@sanger.ac.uk,rd@sanger.ac.uk,0,0,66209,0,NA18907-YRI-1,human,9606,S0706,185,1000Genomes-A1-YRI,pending,2412,from:150 to:200,SRS000147,Homo sapiens,,,766,NA18907-YRI-1,NA18907,,,SRP000031,1,0,0,1000Genomes Project Pilot 1,185,1000Genomes-A1-YRI,,,1000Genomes Project Pilot 1,, -,6,51021,SRS000147,C:\Illumina\MiSeq Reporter\Genomes\WTSI_references\Homo_sapiens\1000Genomes\all\fasta\,,,,jws@sanger.ac.uk rd@sanger.ac.uk thomas.keane@sanger.ac.uk,jws@sanger.ac.uk thomas.keane@sanger.ac.uk,jws@sanger.ac.uk,rd@sanger.ac.uk,0,0,66210,0,NA18907-YRI-1,human,9606,S0706,185,1000Genomes-A1-YRI,pending,2413,from:150 to:200,SRS000147,Homo sapiens,,,766,NA18907-YRI-1,NA18907,,,SRP000031,1,0,0,1000Genomes Project Pilot 1,185,1000Genomes-A1-YRI,,,1000Genomes Project Pilot 1,, 
-,7,51021,SRS000147,C:\Illumina\MiSeq Reporter\Genomes\WTSI_references\Homo_sapiens\1000Genomes\all\fasta\,,,,jws@sanger.ac.uk rd@sanger.ac.uk thomas.keane@sanger.ac.uk,jws@sanger.ac.uk thomas.keane@sanger.ac.uk,jws@sanger.ac.uk,rd@sanger.ac.uk,0,0,66211,0,NA18907-YRI-1,human,9606,S0706,185,1000Genomes-A1-YRI,pending,2414,from:150 to:200,SRS000147,Homo sapiens,,,766,NA18907-YRI-1,NA18907,,,SRP000031,1,0,0,1000Genomes Project Pilot 1,185,1000Genomes-A1-YRI,,,1000Genomes Project Pilot 1,, -ATCAACCG,8,51021,SRS000147,C:\Illumina\MiSeq Reporter\Genomes\WTSI_references\Homo_sapiens\1000Genomes\all\fasta\,,,ATCAACCG,jws@sanger.ac.uk rd@sanger.ac.uk thomas.keane@sanger.ac.uk,jws@sanger.ac.uk thomas.keane@sanger.ac.uk,jws@sanger.ac.uk,rd@sanger.ac.uk,0,1,66212,0,NA18907-YRI-1,human,9606,S0706,185,1000Genomes-A1-YRI,pending,2415,from:150 to:200,SRS000147,Homo sapiens,,,766,NA18907-YRI-1,NA18907,,,SRP000031,1,0,0,1000Genomes Project Pilot 1,185,1000Genomes-A1-YRI,,,1000Genomes Project Pilot 1,154, -TAGTTGGC,8,51022,SRS000148,C:\Illumina\MiSeq Reporter\Genomes\WTSI_references\Homo_sapiens\1000Genomes\all\fasta\,,,TAGTTGGC,jws@sanger.ac.uk rd@sanger.ac.uk thomas.keane@sanger.ac.uk,jws@sanger.ac.uk thomas.keane@sanger.ac.uk,jws@sanger.ac.uk,rd@sanger.ac.uk,0,1,66212,0,NA18908-YRI-1,human,9606,S0706,185,1000Genomes-A1-YRI,pending,2415,from:150 to:200,SRS000148,Homo sapiens,,,767,NA18908-YRI-1,NA18908,,,SRP000031,1,0,0,1000Genomes Project Pilot 1,185,1000Genomes-A1-YRI,,,1000Genomes Project Pilot 1,155, - diff --git a/t/data/p4_stage1_analysis/samplesheet.csv b/t/data/p4_stage1_analysis/samplesheet.csv new file mode 100644 index 000000000..41a6d28f8 --- /dev/null +++ b/t/data/p4_stage1_analysis/samplesheet.csv @@ -0,0 +1,4 @@ +[Data],,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, 
+Index,Lane,Sample_ID,Sample_Name,GenomeFolder,bait_name,default_library_type,default_tag_sequence,email_addresses,email_addresses_of_followers,email_addresses_of_managers,email_addresses_of_owners,is_control,is_pool,lane_id,lane_priority,library_name,organism,organism_taxon_id,project_cost_code,project_id,project_name,qc_state,request_id,required_insert_size_range,sample_accession_number,sample_common_name,sample_consent_withdrawn,sample_description,sample_id,sample_name,sample_public_name,sample_reference_genome,spiked_phix_tag_index,study_accession_number,study_alignments_in_bam,study_contains_nonconsented_human,study_contains_nonconsented_xahuman,study_description,study_id,study_name,study_reference_genome,study_separate_y_chromosome_data,study_title,tag_index, +,1,51021,SRS000147,C:\Illumina\MiSeq Reporter\Genomes\WTSI_references\Homo_sapiens\1000Genomes\all\fasta\,,,,jws@sanger.ac.uk rd@sanger.ac.uk thomas.keane@sanger.ac.uk,jws@sanger.ac.uk thomas.keane@sanger.ac.uk,jws@sanger.ac.uk,rd@sanger.ac.uk,0,0,66206,0,NA18907-YRI-1,human,9606,S0706,185,1000Genomes-A1-YRI,pending,2409,from:150 to:200,SRS000147,Homo sapiens,,,766,NA18907-YRI-1,NA18907,,,SRP000031,1,0,0,1000Genomes Project Pilot 1,185,1000Genomes-A1-YRI,,,1000Genomes Project Pilot 1,, + diff --git a/t/data/runfolder/Data/RunInfo.xml b/t/data/runfolder/Data/RunInfo.xml deleted file mode 100644 index c4d34c914..000000000 --- a/t/data/runfolder/Data/RunInfo.xml +++ /dev/null @@ -1,17 +0,0 @@ - - - - D0W73ACXX - SN510 - 120611 - - - - - - - - 4 - - - \ No newline at end of file diff --git a/t/data/runfolder/archive/1234_2.bam b/t/data/runfolder/archive/1234_2.bam deleted file mode 100644 index e69de29bb..000000000 diff --git a/t/data/runfolder/archive/1234_2_human.bam b/t/data/runfolder/archive/1234_2_human.bam deleted file mode 100644 index e69de29bb..000000000 diff --git a/t/data/runfolder/archive/1234_3.bam b/t/data/runfolder/archive/1234_3.bam deleted file mode 100644 index e69de29bb..000000000 diff 
--git a/t/data/runfolder/archive/lane1/1234_1#15.bam b/t/data/runfolder/archive/lane1/1234_1#15.bam deleted file mode 100644 index e69de29bb..000000000 diff --git a/t/data/runfolder/archive/lane1/1234_1#15.cram b/t/data/runfolder/archive/lane1/1234_1#15.cram deleted file mode 100644 index 7d3d1131b..000000000 Binary files a/t/data/runfolder/archive/lane1/1234_1#15.cram and /dev/null differ diff --git a/t/data/runfolder/archive/lane1/1234_1#15.seqchksum b/t/data/runfolder/archive/lane1/1234_1#15.seqchksum deleted file mode 100644 index 8a269c7e6..000000000 --- a/t/data/runfolder/archive/lane1/1234_1#15.seqchksum +++ /dev/null @@ -1,3 +0,0 @@ -### set count b_seq name_b_seq b_seq_qual b_seq_tags(BC,FI,QT,RT,TC) -all all 0 1 1 1 1 -all pass 0 1 1 1 1 diff --git a/t/data/runfolder/archive/lane1/1234_1#15_human.bam b/t/data/runfolder/archive/lane1/1234_1#15_human.bam deleted file mode 100644 index e69de29bb..000000000 diff --git a/t/data/runfolder/archive/lane1/1234_1#15_phix.bam b/t/data/runfolder/archive/lane1/1234_1#15_phix.bam deleted file mode 100644 index e69de29bb..000000000 diff --git a/t/data/runfolder/archive/lane4/1234_4#16.bam b/t/data/runfolder/archive/lane4/1234_4#16.bam deleted file mode 100644 index e69de29bb..000000000 diff --git a/t/data/runfolder/archive/lane4/1234_4#32.bam b/t/data/runfolder/archive/lane4/1234_4#32.bam deleted file mode 100644 index e69de29bb..000000000 diff --git a/t/util.pm b/t/util.pm index 0f688bf55..2f7755b9c 100644 --- a/t/util.pm +++ b/t/util.pm @@ -1,12 +1,10 @@ package t::util; use Moose; -use File::Temp qw{ tempdir }; +use File::Temp qw(tempdir); use Readonly; use File::Path qw(make_path); -Readonly::Scalar my $NFS_STAGING_DISK => q{/nfs/sf45}; - has q{temp_directory} => ( isa => q{Str}, is => q{ro}, @@ -24,49 +22,6 @@ has q{clean_temp_directory} => ( default => 1, ); -############### -# path setups - -Readonly::Scalar our $DEFAULT_RUNFOLDER => q{123456_IL2_1234}; -Readonly::Scalar our $ANALYSIS_RUNFOLDER_PATH => 
$NFS_STAGING_DISK . q{/IL2/analysis/} . $DEFAULT_RUNFOLDER; -Readonly::Scalar our $BBCALLS_PATH => qq{$ANALYSIS_RUNFOLDER_PATH/Data/Intensities/BAM_basecalls_09-07-2009}; -Readonly::Scalar our $RECALIBRATED_PATH => qq{$BBCALLS_PATH/no_cal}; - -sub analysis_runfolder_path { - my ( $self ) = @_; - return $self->temp_directory() . $ANALYSIS_RUNFOLDER_PATH; -} - -sub standard_bam_basecall_path { - my ( $self ) = @_; - return $self->temp_directory() . $BBCALLS_PATH; -} - -sub standard_analysis_recalibrated_path { - my ( $self ) = @_; - return $self->temp_directory() . $RECALIBRATED_PATH; -} - -sub create_analysis { - my ($self) = @_; - $self->remove_staging(); - my $analysis_runfolder_path = $self->temp_directory() . $ANALYSIS_RUNFOLDER_PATH; - my $recalibrated_path = $self->temp_directory() . $RECALIBRATED_PATH; - `mkdir -p $recalibrated_path`; - `ln -s Data/Intensities/BAM_basecalls_09-07-2009/no_cal $analysis_runfolder_path/Latest_Summary`; - `mkdir -p $analysis_runfolder_path/InterOp`; - `cp t/data/p4_stage1_analysis/TileMetricsOut.bin $analysis_runfolder_path/InterOp`; - `cp t/data/run_params/runParameters.miseq.xml $analysis_runfolder_path/runParameters.xml`; - return; -} - -sub remove_staging { - my ($self) = @_; - my $staging = $self->temp_directory() . $NFS_STAGING_DISK; - `rm -rf $staging`; - return 1; -} - sub create_runfolder { my ($self, $dir, $names) = @_; @@ -90,29 +45,4 @@ sub create_runfolder { return $paths; } -sub create_run_info { - my ($self, $reads_wanted) = @_; - - my $default_reads_wanted = q[ ]; - - my $reads = ( defined $reads_wanted ) ? $reads_wanted : $default_reads_wanted; - - my $fh; - my $runinfofile = $self->analysis_runfolder_path() . q[/RunInfo.xml]; - open($fh, '>', $runinfofile) or die "Could not open file '$runinfofile' $!"; - print $fh <<"ENDXML"; - - - - -$reads - - - - - -ENDXML - close $fh; -} - 1;