From f52636ef43897c1a987bf884da0745c4819dbe40 Mon Sep 17 00:00:00 2001 From: Marina Gourtovaia Date: Fri, 10 May 2024 09:57:10 +0100 Subject: [PATCH 01/10] Consolidated code for creating test runfolder --- t/10-pluggable-central.t | 1 - t/15-pipeline_launcher_scripts.t | 2 -- t/util.pm | 24 +++++++++--------------- 3 files changed, 9 insertions(+), 18 deletions(-) diff --git a/t/10-pluggable-central.t b/t/10-pluggable-central.t index 8d097660..eab28009 100644 --- a/t/10-pluggable-central.t +++ b/t/10-pluggable-central.t @@ -65,7 +65,6 @@ my $runfolder_path = $util->analysis_runfolder_path(); cp 't/data/run_params/runParameters.hiseq.xml', join(q[/], $runfolder_path, 'runParameters.xml'); - $util->create_run_info(); my $config_dir = 'data/config_files'; my $init = { function_order => [qw{qc_qX_yield qc_adapter update_ml_warehouse qc_insert_size}], diff --git a/t/15-pipeline_launcher_scripts.t b/t/15-pipeline_launcher_scripts.t index 24a89c8c..4c77fcb2 100644 --- a/t/15-pipeline_launcher_scripts.t +++ b/t/15-pipeline_launcher_scripts.t @@ -40,7 +40,6 @@ my $product_config = q[t/data/release/config/archive_on/product_release.yml]; { local $ENV{NPG_CACHED_SAMPLESHEET_FILE} = q{t/data/samplesheet_1234.csv}; $util->create_analysis(); - $util->create_run_info(); my $out = `$bin/npg_pipeline_central --product_conf_file_path $product_config --no-spider --no_bsub --no_sf_resource --runfolder_path $rf --bam_basecall_path $bbp --function_order dodo 2>&1`; like($out, @@ -71,7 +70,6 @@ my $product_config = q[t/data/release/config/archive_on/product_release.yml]; { $util->create_analysis(); - $util->create_run_info(); lives_ok { qx{$bin/npg_pipeline_seqchksum_comparator --id_run=1234 --archive_path=$rf/Data/Intensities/BAM_basecalls_20140815-114817/no_cal/archive --bam_basecall_path=$rf/Data/Intensities/BAM_basecalls_20140815-114817 --lanes=1 };} q{ran bin/npg_pipeline_seqchksum_comparator with analysis and bam_basecall_path}; ok($CHILD_ERROR, qq{Return code of $CHILD_ERROR as no 
files found}); diff --git a/t/util.pm b/t/util.pm index 0f688bf5..a6e20827 100644 --- a/t/util.pm +++ b/t/util.pm @@ -49,7 +49,10 @@ sub standard_analysis_recalibrated_path { sub create_analysis { my ($self) = @_; - $self->remove_staging(); + + my $staging = $self->temp_directory() . $NFS_STAGING_DISK; + `rm -rf $staging`; + my $analysis_runfolder_path = $self->temp_directory() . $ANALYSIS_RUNFOLDER_PATH; my $recalibrated_path = $self->temp_directory() . $RECALIBRATED_PATH; `mkdir -p $recalibrated_path`; @@ -57,14 +60,9 @@ sub create_analysis { `mkdir -p $analysis_runfolder_path/InterOp`; `cp t/data/p4_stage1_analysis/TileMetricsOut.bin $analysis_runfolder_path/InterOp`; `cp t/data/run_params/runParameters.miseq.xml $analysis_runfolder_path/runParameters.xml`; - return; -} + $self->_create_run_info(); -sub remove_staging { - my ($self) = @_; - my $staging = $self->temp_directory() . $NFS_STAGING_DISK; - `rm -rf $staging`; - return 1; + return; } sub create_runfolder { @@ -90,12 +88,8 @@ sub create_runfolder { return $paths; } -sub create_run_info { - my ($self, $reads_wanted) = @_; - - my $default_reads_wanted = q[ ]; - - my $reads = ( defined $reads_wanted ) ? $reads_wanted : $default_reads_wanted; +sub _create_run_info { + my ($self) = @_; my $fh; my $runinfofile = $self->analysis_runfolder_path() . q[/RunInfo.xml]; @@ -105,7 +99,7 @@ sub create_run_info { -$reads + From 1eeade8ac52ac0eea35409011e785af30f6deff0 Mon Sep 17 00:00:00 2001 From: Marina Gourtovaia Date: Fri, 10 May 2024 15:10:20 +0100 Subject: [PATCH 02/10] Reset HOME to prevent silent access to databases. 
--- t/10-pluggable-central.t | 1 + t/10-pluggable.t | 1 + t/15-pipeline_launcher_scripts.t | 1 + t/20-function-warehouse_archiver.t | 2 ++ t/50-npg_pipeline-daemon-archival.t | 2 ++ 5 files changed, 7 insertions(+) diff --git a/t/10-pluggable-central.t b/t/10-pluggable-central.t index eab28009..93119c8f 100644 --- a/t/10-pluggable-central.t +++ b/t/10-pluggable-central.t @@ -18,6 +18,7 @@ foreach my $tool (@tools) { } chmod 0755, @tools; local $ENV{'PATH'} = join q[:], $tdir, $ENV{'PATH'}; +local $ENV{'HOME'} = 't'; my $product_config = q[t/data/release/config/archive_on/product_release.yml]; diff --git a/t/10-pluggable.t b/t/10-pluggable.t index f85967a9..c6b38330 100644 --- a/t/10-pluggable.t +++ b/t/10-pluggable.t @@ -23,6 +23,7 @@ foreach my $tool (@tools) { } chmod 0755, @tools; local $ENV{'PATH'} = join q[:], $test_bin, $ENV{'PATH'}; +local $ENV{'HOME'} = 't'; Log::Log4perl->easy_init({layout => '%d %-5p %c - %m%n', level => $DEBUG, diff --git a/t/15-pipeline_launcher_scripts.t b/t/15-pipeline_launcher_scripts.t index 4c77fcb2..27ef986a 100644 --- a/t/15-pipeline_launcher_scripts.t +++ b/t/15-pipeline_launcher_scripts.t @@ -23,6 +23,7 @@ chmod 0755, @tools; local $ENV{'PATH'} = join q[:], $tmp_dir, $bin, $ENV{'PATH'}; local $ENV{'http_proxy'} = q[http://wibble]; local $ENV{'no_proxy'} = q[]; +local $ENV{'HOME'} = 't'; my $rf = $util->analysis_runfolder_path; my $bbp = "$rf/bam_basecall_path"; diff --git a/t/20-function-warehouse_archiver.t b/t/20-function-warehouse_archiver.t index 9f34f526..e605ae62 100644 --- a/t/20-function-warehouse_archiver.t +++ b/t/20-function-warehouse_archiver.t @@ -40,6 +40,8 @@ for my $file (qw(RunInfo.xml RunParameters.xml)) { fmove($source, $target); } +local $ENV{'HOME'} = 't'; + use_ok('npg_pipeline::function::warehouse_archiver'); subtest 'warehouse updates' => sub { diff --git a/t/50-npg_pipeline-daemon-archival.t b/t/50-npg_pipeline-daemon-archival.t index d1b64341..d4b36550 100644 --- a/t/50-npg_pipeline-daemon-archival.t 
+++ b/t/50-npg_pipeline-daemon-archival.t @@ -13,6 +13,8 @@ use t::util; my $util = t::util->new(); my $temp_directory = $util->temp_directory(); +local $ENV{'HOME'} = 't'; + Log::Log4perl->easy_init({layout => '%d %-5p %c - %m%n', level => $DEBUG, file => join(q[/], $temp_directory, 'logfile'), From d6c9a0cac09ca2dd7b533aab4a98c58ad97e2cce Mon Sep 17 00:00:00 2001 From: Marina Gourtovaia Date: Fri, 10 May 2024 17:08:30 +0100 Subject: [PATCH 03/10] Test for scripts now use NovaSeqX runfolder. --- t/15-pipeline_launcher_scripts.t | 124 +++++++++++++++++++------------ 1 file changed, 78 insertions(+), 46 deletions(-) diff --git a/t/15-pipeline_launcher_scripts.t b/t/15-pipeline_launcher_scripts.t index 27ef986a..5e6dabeb 100644 --- a/t/15-pipeline_launcher_scripts.t +++ b/t/15-pipeline_launcher_scripts.t @@ -1,10 +1,12 @@ use strict; use warnings; -use English qw{-no_match_vars}; -use Test::More tests => 10; +use English qw(-no_match_vars); +use Test::More tests => 4; use Test::Exception; use File::Copy; use Cwd; +use File::Temp qw(tempdir); +use File::Path qw(make_path); use t::util; @@ -20,68 +22,98 @@ foreach my $tool (@tools) { } chmod 0755, @tools; -local $ENV{'PATH'} = join q[:], $tmp_dir, $bin, $ENV{'PATH'}; -local $ENV{'http_proxy'} = q[http://wibble]; -local $ENV{'no_proxy'} = q[]; -local $ENV{'HOME'} = 't'; +local $ENV{'PATH'} = join q[:], $tmp_dir, $bin, $ENV{'PATH'}; +local $ENV{'HOME'} = 't'; -my $rf = $util->analysis_runfolder_path; -my $bbp = "$rf/bam_basecall_path"; my $product_config = q[t/data/release/config/archive_on/product_release.yml]; +my $test_data_dir_47995 = 't/data/novaseqx/20231017_LH00210_0012_B22FCNFLT3'; + +sub _setup_runfolder_47995 { + my $tmp_dir = tempdir(CLEANUP => 1); + my @dirs = split q[/], $test_data_dir_47995; + my $rf_name = pop @dirs; + my $rf_info = $util->create_runfolder($tmp_dir, {'runfolder_name' => $rf_name}); + my $rf = $rf_info->{'runfolder_path'}; + for my $file (qw(RunInfo.xml RunParameters.xml)) { + if 
(copy("$test_data_dir_47995/$file", "$rf/$file") == 0) { + die "Failed to copy $file"; + } + } + return $rf_info; +} { + my $rf_info = _setup_runfolder_47995(); + my $rf = $rf_info->{'runfolder_path'}; local $ENV{NPG_CACHED_SAMPLESHEET_FILE} = q{/does/not/exist.csv}; - $util->create_analysis(); - my $out = `$bin/npg_pipeline_central --product_conf_file_path $product_config --spider --no_bsub --no_sf_resource --runfolder_path $rf --function_order dodo 2>&1`; - like($out, - qr/Error initializing pipeline: Error while spidering/, - 'error in spidering when pre-set samplesheet does not exist'); + my $command = "$bin/npg_pipeline_central " . + "--product_conf_file_path $product_config --spider --no_bsub " . + "--runfolder_path $rf --function_order dodo 2>&1"; + note "Executing $command"; + like(`$command`, qr/Error initializing pipeline: Error while spidering/, + 'error in spidering when pre-set samplesheet does not exist'); } -{ - local $ENV{NPG_CACHED_SAMPLESHEET_FILE} = q{t/data/samplesheet_1234.csv}; - $util->create_analysis(); +subtest 'test analysis and archival pipeline scripts' => sub { + plan tests => 5; - my $out = `$bin/npg_pipeline_central --product_conf_file_path $product_config --no-spider --no_bsub --no_sf_resource --runfolder_path $rf --bam_basecall_path $bbp --function_order dodo 2>&1`; - like($out, - qr/Function dodo cannot be found in the graph/, - 'error when function does not exist'); -} + # A full run folder is scaffolded by the analysis pipeline. + # The archival pipeline is using the same run folder. + + my $rf_info = _setup_runfolder_47995(); + my $rf = $rf_info->{'runfolder_path'}; + local $ENV{NPG_CACHED_SAMPLESHEET_FILE} = "$test_data_dir_47995/samplesheet_47995.csv"; + my $command = "$bin/npg_pipeline_central " . + "--product_conf_file_path $product_config --no-spider --no_bsub " . + "--runfolder_path $rf --function_order create_summary_link_analysis " . 
+ "--function_order dodo 2>&1"; + note "Executing $command"; + like(`$command`, qr/Function dodo cannot be found in the graph/, + 'error when function does not exist'); -{ - local $ENV{NPG_CACHED_SAMPLESHEET_FILE} = q{t/data/samplesheet_1234.csv}; my $config_dir = join q[/], $tmp_dir, 'config'; - mkdir $config_dir; + make_path($config_dir); my @files = glob 'data/config_files/*.{json,ini}'; push @files, 't/data/release/config/archive_on/product_release.yml'; for (@files) { copy $_, $config_dir; } - lives_ok { qx{ - $bin/npg_pipeline_post_qc_review --no_bsub --no_sf_resource --runfolder_path $rf --bam_basecall_path $bbp --conf_path $config_dir};} - q{ran bin/npg_pipeline_post_qc_review}; - ok(!$CHILD_ERROR, qq{Return code of $CHILD_ERROR}); - - lives_ok { qx{ - $bin/npg_pipeline_post_qc_review --no_bsub --no_sf_resource --runfolder_path $rf --bam_basecall_path $bbp --function_list some --conf_path $config_dir}; } - q{ran bin/npg_pipeline_post_qc_review with non-exisiting function list}; - ok($CHILD_ERROR, qq{Child error $CHILD_ERROR}); -} - -{ - $util->create_analysis(); - - lives_ok { qx{$bin/npg_pipeline_seqchksum_comparator --id_run=1234 --archive_path=$rf/Data/Intensities/BAM_basecalls_20140815-114817/no_cal/archive --bam_basecall_path=$rf/Data/Intensities/BAM_basecalls_20140815-114817 --lanes=1 };} q{ran bin/npg_pipeline_seqchksum_comparator with analysis and bam_basecall_path}; + $command = "$bin/npg_pipeline_post_qc_review --no_bsub --runfolder_path $rf " . + "--conf_path $config_dir"; + note "Executing $command"; + lives_ok { `$command` } 'ran bin/npg_pipeline_post_qc_review'; + ok(!$CHILD_ERROR, 'No error running command'); + + $command = "$bin/npg_pipeline_post_qc_review --no_bsub --runfolder_path $rf " . 
+ "--conf_path $config_dir --function_list some"; + note "Executing $command"; + lives_ok { `$command` } + 'ran bin/npg_pipeline_post_qc_review with non-exisiting function list'; + ok($CHILD_ERROR, "Child error $CHILD_ERROR"); +}; + +subtest 'test npg_pipeline_seqchksum_comparator script' => sub { + plan tests => 2; + my $rf_info = _setup_runfolder_47995(); + my $rf = $rf_info->{'runfolder_path'}; + my $bbc = "$rf/Data/Intensities/BAM_basecalls"; + my $apath = "$bbc/no_cal/archive"; + make_path($apath); + my $command = "$bin/npg_pipeline_seqchksum_comparator --id_run=1234 " . + "--archive_path=$apath --bam_basecall_path=$bbc --lanes=1"; + note "Executing $command"; + lives_ok { `$command` } + 'ran npg_pipeline_seqchksum_comparator with analysis and bam_basecall_path'; ok($CHILD_ERROR, qq{Return code of $CHILD_ERROR as no files found}); -} +}; -{ +subtest 'test npg_pipeline_preexec_references script' => sub { + plan tests => 2; `bin/npg_pipeline_preexec_references --repository t/data/sequence/refs 2>/dev/null`; - ok( $CHILD_ERROR, qq{failed as could not locate references directory - $CHILD_ERROR} ); - - qx{bin/npg_pipeline_preexec_references --repository t/data/sequence}; - ok( ! $CHILD_ERROR, q{script runs OK} ); -} + ok($CHILD_ERROR, "failed as could not locate references directory - $CHILD_ERROR"); + `bin/npg_pipeline_preexec_references --repository t/data/sequence`; + ok(! $CHILD_ERROR, 'script runs OK'); +}; 1; From abb4bf2660d361337bd7b6a21854640ccf78e87a Mon Sep 17 00:00:00 2001 From: Marina Gourtovaia Date: Fri, 10 May 2024 22:42:59 +0100 Subject: [PATCH 04/10] Replaced old-style runfolders in all tests. 
--- MANIFEST | 10 +- t/10-pluggable-central.t | 114 +++---- t/20-function-cluster_count.t | 227 +++++++------ t/20-function-p4_stage1_analysis.t | 303 ++++++++---------- t/20-function-seqchksum_comparator.t | 127 +++----- t/20-function-start_stop.t | 7 +- .../bam_flagstats/1234_1_bam_flagstats.json | 1 - .../bam_flagstats/1234_3_bam_flagstats.json | 1 - .../1234_3_phix_bam_flagstats.json | 1 - .../bam_flagstats/1234_4_bam_flagstats.json | 1 - .../bam_flagstats/47995_3_bam_flagstats.json | 21 ++ .../47995_3_phix_bam_flagstats.json | 21 ++ t/data/miseq/24347_RunInfo.xml | 14 + .../p4_stage1_analysis/1234_samplesheet.csv | 27 -- t/data/p4_stage1_analysis/samplesheet.csv | 4 + t/data/runfolder/Data/RunInfo.xml | 17 - t/util.pm | 66 +--- 17 files changed, 430 insertions(+), 532 deletions(-) delete mode 100644 t/data/bam_flagstats/1234_1_bam_flagstats.json delete mode 100644 t/data/bam_flagstats/1234_3_bam_flagstats.json delete mode 100644 t/data/bam_flagstats/1234_3_phix_bam_flagstats.json delete mode 100644 t/data/bam_flagstats/1234_4_bam_flagstats.json create mode 100644 t/data/bam_flagstats/47995_3_bam_flagstats.json create mode 100644 t/data/bam_flagstats/47995_3_phix_bam_flagstats.json create mode 100755 t/data/miseq/24347_RunInfo.xml delete mode 100644 t/data/p4_stage1_analysis/1234_samplesheet.csv create mode 100644 t/data/p4_stage1_analysis/samplesheet.csv delete mode 100644 t/data/runfolder/Data/RunInfo.xml diff --git a/MANIFEST b/MANIFEST index 7b130523..40045723 100644 --- a/MANIFEST +++ b/MANIFEST @@ -124,10 +124,8 @@ t/50-npg_pipeline-daemon-archival.t t/bin/bkill t/bin/bresume t/bin/bsub -t/data/bam_flagstats/1234_1_bam_flagstats.json -t/data/bam_flagstats/1234_3_bam_flagstats.json -t/data/bam_flagstats/1234_3_phix_bam_flagstats.json -t/data/bam_flagstats/1234_4_bam_flagstats.json +t/data/bam_flagstats/47995_3_bam_flagstats.json +t/data/bam_flagstats/47995_3_phix_bam_flagstats.json t/data/barcodes/samplesheet_batch2015.csv 
t/data/barcodes/samplesheet_batch42225.csv t/data/barcodes/samplesheet_batch42225_amended1.csv @@ -229,6 +227,7 @@ t/data/miseq/16850_runParameters.xml t/data/miseq/16866_RunInfo.xml t/data/miseq/20990_RunInfo.xml t/data/miseq/24135_RunInfo.xml +t/data/miseq/24347_RunInfo.xml t/data/miseq/samplesheet_16850.csv t/data/miseq/samplesheet_16866.csv t/data/miseq/samplesheet_20990.csv @@ -1011,7 +1010,7 @@ t/data/novaseqx/47539/samplesheet_47539.csv t/data/novaseqx/20231017_LH00210_0012_B22FCNFLT3/RunInfo.xml t/data/novaseqx/20231017_LH00210_0012_B22FCNFLT3/RunParameters.xml t/data/novaseqx/20231017_LH00210_0012_B22FCNFLT3/samplesheet_47995.csv -t/data/p4_stage1_analysis/1234_samplesheet.csv +t/data/p4_stage1_analysis/samplesheet.csv t/data/p4_stage1_analysis/TileMetricsOut.bin t/data/portable_pipelines/ncov2019-artic-nf/cf01166c42a/product_release.yml t/data/portable_pipelines/ncov2019-artic-nf/cf01166c42a/product_release_no_pp.yml @@ -1071,7 +1070,6 @@ t/data/runfolder/archive/lane1/1234_1#15_human.bam t/data/runfolder/archive/lane1/1234_1#15_phix.bam t/data/runfolder/archive/lane4/1234_4#16.bam t/data/runfolder/archive/lane4/1234_4#32.bam -t/data/runfolder/Data/RunInfo.xml t/data/samplesheet_1234.csv t/data/samplesheet_8747.csv t/data/samplesheet_33990.csv diff --git a/t/10-pluggable-central.t b/t/10-pluggable-central.t index 93119c8f..2afdb079 100644 --- a/t/10-pluggable-central.t +++ b/t/10-pluggable-central.t @@ -1,10 +1,11 @@ use strict; use warnings; -use Test::More tests => 22; +use Test::More tests => 4; use Test::Exception; use Log::Log4perl qw(:levels); use File::Copy qw(cp); use File::Path qw(make_path); +use File::Temp qw(tempdir); use t::util; @@ -27,73 +28,82 @@ Log::Log4perl->easy_init({layout => '%d %-5p %c - %m%n', file => join(q[/], $tdir, 'logfile'), utf8 => 1}); +my $test_data_dir_47995 = 't/data/novaseqx/20231017_LH00210_0012_B22FCNFLT3'; +sub _setup_runfolder_47995 { + my $tmp_dir = tempdir(CLEANUP => 1); + my @dirs = split q[/], 
$test_data_dir_47995; + my $rf_name = pop @dirs; + my $rf_info = $util->create_runfolder($tmp_dir, {'runfolder_name' => $rf_name}); + my $rf = $rf_info->{'runfolder_path'}; + for my $file (qw(RunInfo.xml RunParameters.xml)) { + if (cp("$test_data_dir_47995/$file", "$rf/$file") == 0) { + die "Failed to copy $file"; + } + } + return $rf_info; +} + + my $central = q{npg_pipeline::pluggable::central}; use_ok($central); -my $runfolder_path = $util->analysis_runfolder_path(); +subtest 'test object creation' => sub { + plan tests => 4; -{ - local $ENV{NPG_CACHED_SAMPLESHEET_FILE} = q[t/data/samplesheet_1234.csv]; - $util->create_analysis(); + local $ENV{NPG_CACHED_SAMPLESHEET_FILE} = "$test_data_dir_47995/samplesheet_47995.csv"; my $pipeline; lives_ok { $pipeline = $central->new( - runfolder_path => $runfolder_path, + runfolder_path => $tdir, ); - } q{no croak creating new object}; + } q{no error creating new object}; isa_ok($pipeline, $central); -} -{ - local $ENV{NPG_CACHED_SAMPLESHEET_FILE} = q[t/data/samplesheet_1234.csv]; - my $pb; lives_ok { - $pb = $central->new( + $pipeline = $central->new( function_order => [qw(qc_qX_yield qc_insert_size)], - runfolder_path => $runfolder_path, + runfolder_path => $tdir, ); - } q{no croak on creation}; - $util->create_analysis(); - is(join(q[ ], @{$pb->function_order()}), 'qc_qX_yield qc_insert_size', + } q{no error on creation}; + is(join(q[ ], @{$pipeline->function_order()}), 'qc_qX_yield qc_insert_size', 'function_order set on creation'); -} +}; -{ - local $ENV{CLASSPATH} = undef; - local $ENV{NPG_CACHED_SAMPLESHEET_FILE} = q[t/data/samplesheet_1234.csv]; - my $pb; - $util->create_analysis(); - cp 't/data/run_params/runParameters.hiseq.xml', - join(q[/], $runfolder_path, 'runParameters.xml'); +subtest 'execute main()' => sub { + plan tests => 2; + local $ENV{CLASSPATH} = undef; + local $ENV{NPG_CACHED_SAMPLESHEET_FILE} = "$test_data_dir_47995/samplesheet_47995.csv"; + my $rf_info = _setup_runfolder_47995(); my $config_dir = 
'data/config_files'; - my $init = { - function_order => [qw{qc_qX_yield qc_adapter update_ml_warehouse qc_insert_size}], - lanes => [4], - runfolder_path => $runfolder_path, - function_list => "$config_dir/function_list_central.json", - id_flowcell_lims => 2015, - no_bsub => 1, - repository => 't/data/sequence', - spider => 0, - no_sf_resource => 1, - product_conf_file_path => $product_config, - }; - lives_ok { $pb = $central->new($init); } q{no croak on new creation}; - mkdir $pb->archive_path; + my $pb; + lives_ok { $pb = $central->new( + id_run => 47995, + function_order => [qw{qc_qX_yield qc_adapter update_ml_warehouse qc_insert_size}], + lanes => [4], + run_folder => $rf_info->{'runfolder_name'}, + runfolder_path => $rf_info->{'runfolder_path'}, + function_list => "$config_dir/function_list_central.json", + id_flowcell_lims => 17089, + no_bsub => 1, + repository => 't/data/sequence', + spider => 0, + product_conf_file_path => $product_config, + ); } q{no croak on new creation}; + lives_ok { $pb->main() } q{no croak running qc->main()}; -} +}; + +subtest 'execute prepare()' => sub { + plan tests => 12; -{ - local $ENV{NPG_CACHED_SAMPLESHEET_FILE} = q[t/data/samplesheet_1234.csv]; - my $rf = join q[/], $tdir, 'myfolder'; - mkdir $rf; - cp 't/data/run_params/runParameters.hiseq.xml', - join(q[/], $rf, 'runParameters.xml'); + local $ENV{NPG_CACHED_SAMPLESHEET_FILE} = "$test_data_dir_47995/samplesheet_47995.csv"; + my $rf_info = _setup_runfolder_47995(); + my $rf = $rf_info->{'runfolder_path'}; my $init = { - id_run => 1234, - run_folder => 'myfolder', + id_run => 47995, + run_folder => $rf_info->{'runfolder_name'}, runfolder_path => $rf, timestamp => '22-May', spider => 0, @@ -103,14 +113,6 @@ my $runfolder_path = $util->analysis_runfolder_path(); my $pb = $central->new($init); is ($pb->intensity_path, "$rf/Data/Intensities", 'intensities path'); is ($pb->basecall_path, "$rf/Data/Intensities/BaseCalls", 'basecalls path'); - throws_ok { $pb->prepare() } - qr/does 
not exist, either bam_basecall_path or analysis_path should be given/, - q{error scaffolding the run folder}; - - make_path "$rf/Data/Intensities"; - $pb = $central->new($init); - is ($pb->intensity_path, "$rf/Data/Intensities", 'intensities path'); - is ($pb->basecall_path, "$rf/Data/Intensities/BaseCalls", 'basecalls path'); lives_ok { $pb->prepare() } 'prepare runs fine'; my $expected_pb_cal = join q[/],$rf,q{Data/Intensities/BAM_basecalls_22-May}; is ($pb->bam_basecall_path, $expected_pb_cal, 'bam basecall path is set'); @@ -137,6 +139,6 @@ my $runfolder_path = $util->analysis_runfolder_path(); $pb = $central->new($init); $pb->prepare(); is ($pb->bam_basecall_path, $expected_pb_cal, 'bam basecall path is set'); -} +}; 1; diff --git a/t/20-function-cluster_count.t b/t/20-function-cluster_count.t index 51131d11..2423e1a4 100644 --- a/t/20-function-cluster_count.t +++ b/t/20-function-cluster_count.t @@ -1,10 +1,13 @@ use strict; use warnings; -use English qw{-no_match_vars}; -use Test::More tests => 27; +use English qw(-no_match_vars); +use Test::More tests => 23; use Test::Exception; use Log::Log4perl qw(:levels); use File::Copy qw(cp); +use File::Temp qw(tempdir); +use File::Path qw(make_path); + use t::util; use_ok( q{npg_pipeline::function::cluster_count} ); @@ -17,16 +20,33 @@ Log::Log4perl->easy_init({layout => '%d %-5p %c - %m%n', file => join(q[/], $dir, 'logfile'), utf8 => 1}); -$util->create_analysis(); -my $analysis_runfolder_path = $util->analysis_runfolder_path(); -my $bam_basecall_path = $util->standard_bam_basecall_path(); -my $recalibrated_path = $util->standard_analysis_recalibrated_path(); -my $archive_path = $recalibrated_path . 
q{/archive}; +my $test_data_dir_47995 = 't/data/novaseqx/20231017_LH00210_0012_B22FCNFLT3'; + +sub _setup_runfolder_47995 { + + my $tmp_dir = tempdir(CLEANUP => 1); + my @dirs = split q[/], $test_data_dir_47995; + my $rf_name = pop @dirs; + my $rf_info = $util->create_runfolder($tmp_dir, + {'runfolder_name' => $rf_name, 'analysis_path' => 'BAM_basecalls_20240508-204057'}); + my $rf = $rf_info->{'runfolder_path'}; + for my $file (qw(RunInfo.xml RunParameters.xml)) { + if (cp("$test_data_dir_47995/$file", "$rf/$file") == 0) { + die "Failed to copy $file"; + } + } + $rf_info->{'bam_basecall_path'} = $rf_info->{'analysis_path'}; + + my $archive_path = $rf_info->{'archive_path'}; + my @paths = map { "$archive_path/lane$_/qc" } (1 .. 8); + make_path(@paths); -cp 't/data/run_params/runParameters.miseq.xml', "$analysis_runfolder_path/runParameters.xml"; + my $nocall_path = $rf_info->{'nocal_path'}; + `touch $nocall_path/47995_bfs_fofn.txt`; + `touch $nocall_path/47995_sf_fofn.txt`; -`touch $recalibrated_path/1234_bfs_fofn.txt`; -`touch $recalibrated_path/1234_sf_fofn.txt`; + return $rf_info; +} my $default = { default => { @@ -36,152 +56,155 @@ my $default = { }; { - local $ENV{NPG_CACHED_SAMPLESHEET_FILE} = q[t/data/samplesheet_1234.csv]; + my $rf_info = _setup_runfolder_47995(); + my $bam_basecall_path = $rf_info->{'bam_basecall_path'}; + my $runfolder_path = $rf_info->{'runfolder_path'}; + my $archive_path = $rf_info->{'archive_path'}; + my $id_run = 47995; + local $ENV{NPG_CACHED_SAMPLESHEET_FILE} = "$test_data_dir_47995/samplesheet_47995.csv"; my $object; lives_ok { $object = npg_pipeline::function::cluster_count->new( - run_folder => q{123456_IL2_1234}, - runfolder_path => $analysis_runfolder_path, - bam_basecall_path => $bam_basecall_path, - id_run => 1234, - timestamp => q{20100907-142417}, + runfolder_path => $rf_info->{'runfolder_path'}, + bam_basecall_path => $rf_info->{'bam_basecall_path'}, + id_run => $id_run, is_indexed => 0, bfs_fofp_name => q{}, 
sf_fofp_name => q{}, - resource => $default + resource => $default, ); } q{obtain object ok}; - isa_ok( $object, q{npg_pipeline::function::cluster_count}); my $da = $object->create(); - ok ($da && @{$da} == 1, 'an array with one definition is returned for eight lanes (run-level check)'); - + ok ($da && @{$da} == 1, + 'an array with one definition is returned for eight lanes (run-level check)'); my $d = $da->[0]; is ($d->created_by, 'npg_pipeline::function::cluster_count', 'created_by is correct'); is ($d->created_on, $object->timestamp, 'created_on is correct'); - is ($d->identifier, 1234, 'identifier is set correctly'); + is ($d->identifier, $id_run, 'identifier is set correctly'); ok (!$d->excluded, 'step not excluded'); ok (!$d->has_composition, 'composition is not set'); lives_ok {$d->freeze()} 'definition can be serialized to JSON'; - - my $values = {}; - map {$values->{ref $_} += 1} @{$da}; - is ($values->{'npg_pipeline::function::definition'}, 1, - 'one definition object returned'); - - map {$values->{$_->job_name} += 1} @{$da}; - is ($values->{'npg_pipeline_check_cluster_count_1234_20100907-142417'}, 1, + like ($d->job_name, qr/\Anpg_pipeline_check_cluster_count_$id_run/, 'the job is named correctly'); - - map {$values->{$_->queue} += 1} @{$da}; - is ($values->{'default'}, 1, 'the queue is set to default for the definition'); - - my $command = sprintf q[npg_pipeline_check_cluster_count --id_run=1234 --lanes=1 --lanes=2 --lanes=3 --lanes=4 --lanes=5 --lanes=6 --lanes=7 --lanes=8 --bam_basecall_path=%s --runfolder_path=%s %s %s], $bam_basecall_path, $analysis_runfolder_path, join(q{ }, (map {qq[--bfs_paths=$archive_path/lane$_/qc]} (1..8))), join(q{ }, (map {qq[--sf_paths=$archive_path/lane$_/qc]} (1..8))); -# my $command = sprintf q[npg_pipeline_check_cluster_count --id_run=1234 --lanes=1 --lanes=2 --lanes=3 --lanes=4 --lanes=5 --lanes=6 --lanes=7 --lanes=8 --bam_basecall_path=%s --runfolder_path=%s --bfs_fofp_name=%s/1234_bfs_fofn.txt 
--sf_fofp_name=%s/1234_sf_fofn.txt], $bam_basecall_path, $analysis_runfolder_path, $recalibrated_path, $recalibrated_path; - + is ($d->queue, 'default', 'the queue is set to default for the definition'); + + my $bfs_paths = join q{ }, (map {qq[--bfs_paths=$archive_path/lane$_/qc]} (1..8)); + my $sf_paths = join q{ }, (map {qq[--sf_paths=$archive_path/lane$_/qc]} (1..8)); + my $command = sprintf q[npg_pipeline_check_cluster_count --id_run=%i ] . + q[--lanes=1 --lanes=2 --lanes=3 --lanes=4 --lanes=5 --lanes=6 --lanes=7 --lanes=8 ] . + q[--bam_basecall_path=%s --runfolder_path=%s %s %s], + $id_run, $bam_basecall_path, $runfolder_path, $bfs_paths, $sf_paths; is ($da->[0]->command, $command, 'correct command'); } { local $ENV{NPG_CACHED_SAMPLESHEET_FILE} = q[t/data/samplesheet_8747.csv]; - my $analysis_runfolder_path = 't/data/example_runfolder/121103_HS29_08747_B_C1BV5ACXX'; - my $bam_basecall_path = "$analysis_runfolder_path/Data/Intensities/BAM_basecalls_20130122-085552"; + my $runfolder_path = 't/data/example_runfolder/121103_HS29_08747_B_C1BV5ACXX'; + my $bam_basecall_path = "$runfolder_path/Data/Intensities/BAM_basecalls_20130122-085552"; my $archive_path = "$bam_basecall_path/no_cal/archive"; - my $object; - lives_ok{ - $object = npg_pipeline::function::cluster_count->new( - id_run => 8747, - lanes => [1], - runfolder_path => $analysis_runfolder_path, - bam_basecall_path => $bam_basecall_path, - archive_path => $archive_path, - bfs_paths => [ qq[$archive_path/lane1/qc] ], - bfs_fofp_name => q{}, - sf_fofp_name => q{}, - resource => $default - ); - } q{obtain object ok}; - + my $object = npg_pipeline::function::cluster_count->new( + id_run => 8747, + lanes => [1], + runfolder_path => $runfolder_path, + bam_basecall_path => $bam_basecall_path, + archive_path => $archive_path, + bfs_paths => [ qq[$archive_path/lane1/qc] ], + bfs_fofp_name => q{}, + sf_fofp_name => q{}, + resource => $default, + ); lives_ok { $object->run_cluster_count_check(); - } q{check returns 
ok}; + } q{check runs ok}; } { - local $ENV{NPG_CACHED_SAMPLESHEET_FILE} = q[t/data/samplesheet_1234.csv]; - my $object; - lives_ok{ - $object = npg_pipeline::function::cluster_count->new( - run_folder => q{123456_IL2_1234}, - runfolder_path => $analysis_runfolder_path, - bam_basecall_path => $bam_basecall_path, - archive_path => $archive_path, - bfs_paths => [ qq{$archive_path/lane3/qc} ], - bfs_fofp_name => q{}, - sf_fofp_name => q{}, - resource => $default - ); - } q{obtain object ok}; - + my $rf_info = _setup_runfolder_47995(); + my $bam_basecall_path = $rf_info->{'bam_basecall_path'}; + my $runfolder_path = $rf_info->{'runfolder_path'}; + my $archive_path = $rf_info->{'archive_path'}; + my $id_run = 47995; + local $ENV{NPG_CACHED_SAMPLESHEET_FILE} = "$test_data_dir_47995/samplesheet_47995.csv"; + + my $object = npg_pipeline::function::cluster_count->new( + id_run => 47995, + runfolder_path => $runfolder_path, + bam_basecall_path => $bam_basecall_path, + archive_path => $archive_path, + bfs_paths => [ qq{$archive_path/lane3/qc} ], + bfs_fofp_name => q{}, + sf_fofp_name => q{}, + resource => $default + ); ok( !$object->_bam_cluster_count_total({}), 'no bam cluster count total returned'); my $is_indexed = 1; - qx{mkdir -p $archive_path/lane3/qc}; - qx{cp t/data/bam_flagstats/1234_3_bam_flagstats.json $archive_path/lane3/qc/1234_3#0_bam_flagstats.json}; - qx{cp t/data/bam_flagstats/1234_3_bam_flagstats.json $archive_path/lane3/qc/1234_3#1_bam_flagstats.json}; - - is( $object->_bam_cluster_count_total( {plex=>$is_indexed} ), 32, 'correct bam cluster count total for plexes'); - - qx{cp t/data/bam_flagstats/1234_3_phix_bam_flagstats.json $archive_path/lane3/qc/1234_3#0_phix_bam_flagstats.json}; - - is( $object->_bam_cluster_count_total( {plex=>$is_indexed} ), 46, 'correct bam cluster count total for plexes'); + cp("t/data/bam_flagstats/${id_run}_3_bam_flagstats.json", + "$archive_path/lane3/qc/${id_run}_3#0_bam_flagstats.json"); + 
cp("t/data/bam_flagstats/${id_run}_3_bam_flagstats.json", + "$archive_path/lane3/qc/${id_run}_3#1_bam_flagstats.json"); + is( $object->_bam_cluster_count_total( {plex=>$is_indexed} ), 32, + 'correct bam cluster count total for plexes'); + + cp("t/data/bam_flagstats/${id_run}_3_phix_bam_flagstats.json", + "$archive_path/lane3/qc/${id_run}_3#0_phix_bam_flagstats.json"); + is( $object->_bam_cluster_count_total( {plex=>$is_indexed} ), 46, + 'correct bam cluster count total for plexes'); } { local $ENV{NPG_CACHED_SAMPLESHEET_FILE} = q[t/data/samplesheet_8747.csv]; - my $analysis_runfolder_path = 't/data/example_runfolder/121103_HS29_08747_B_C1BV5ACXX'; - my $bam_basecall_path = "$analysis_runfolder_path/Data/Intensities/BAM_basecalls_20130122-085552"; + my $runfolder_path = 't/data/example_runfolder/121103_HS29_08747_B_C1BV5ACXX'; + my $bam_basecall_path = "$runfolder_path/Data/Intensities/BAM_basecalls_20130122-085552"; my $archive_path = "$bam_basecall_path/no_cal/archive"; - my $object; - lives_ok{ - $object = npg_pipeline::function::cluster_count->new( - id_run => 8747, - lanes => [1], - runfolder_path => $analysis_runfolder_path, - bam_basecall_path => $bam_basecall_path, - archive_path => $archive_path, - bfs_paths => [ qq{$archive_path/lane1/qc} ], - sf_paths => [ qq{$archive_path/lane1/qc} ], - bfs_fofp_name => q{}, - sf_fofp_name => q{}, - resource => $default - ); - } q{obtain object ok}; + my $object = npg_pipeline::function::cluster_count->new( + id_run => 8747, + lanes => [1], + runfolder_path => $runfolder_path, + bam_basecall_path => $bam_basecall_path, + archive_path => $archive_path, + bfs_paths => [ qq{$archive_path/lane1/qc} ], + sf_paths => [ qq{$archive_path/lane1/qc} ], + bfs_fofp_name => q{}, + sf_fofp_name => q{}, + resource => $default + ); - is( $object->_bam_cluster_count_total({plex=>1}), 301389338, 'correct bam cluster count total'); - rename "$archive_path/lane1/qc/8747_1#0_bam_flagstats.json", 
"$archive_path/lane1/qc/8747_1#0_bam_flagstats.json.RENAMED"; - throws_ok {$object->run_cluster_count_check()} qr{Cluster count in bam files not as expected}, 'Cluster count in bam files not as expected'; - rename "$archive_path/lane1/qc/8747_1#0_bam_flagstats.json.RENAMED", "$archive_path/lane1/qc/8747_1#0_bam_flagstats.json"; + is( $object->_bam_cluster_count_total({plex=>1}), 301389338, + 'correct bam cluster count total'); + rename "$archive_path/lane1/qc/8747_1#0_bam_flagstats.json", + "$archive_path/lane1/qc/8747_1#0_bam_flagstats.json.RENAMED"; + throws_ok {$object->run_cluster_count_check()} + qr{Cluster count in bam files not as expected}, + 'Cluster count in bam files not as expected'; + rename "$archive_path/lane1/qc/8747_1#0_bam_flagstats.json.RENAMED", + "$archive_path/lane1/qc/8747_1#0_bam_flagstats.json"; ok($object->run_cluster_count_check(), 'Cluster count in bam files as expected'); } { local $ENV{NPG_CACHED_SAMPLESHEET_FILE} = q[t/data/samplesheet_8747.csv]; - my $analysis_runfolder_path = 't/data/example_runfolder/121103_HS29_08747_B_C1BV5ACXX'; - my $bam_basecall_path = "$analysis_runfolder_path/Data/Intensities/BAM_basecalls_20130122-085552"; + my $runfolder_path = 't/data/example_runfolder/121103_HS29_08747_B_C1BV5ACXX'; + my $bam_basecall_path = "$runfolder_path/Data/Intensities/BAM_basecalls_20130122-085552"; my $archive_path = "$bam_basecall_path/no_cal/archive"; my $recalibrated_path = "$bam_basecall_path/no_cal"; my $common_command = sub { my $p = shift; - return sprintf q{$EXECUTABLE_NAME bin/npg_pipeline_check_cluster_count --bfs_fofp_name %s/lane%d/8747_bfs_fofn.txt --sf_fofp_name %s/lane%d/8747_sf_fofn.txt --id_run 8747 --bam_basecall_path %s --lanes %d --runfolder_path %s}, $archive_path, $p, $archive_path, $p, $bam_basecall_path, $p, $analysis_runfolder_path; + return sprintf q{$EXECUTABLE_NAME bin/npg_pipeline_check_cluster_count } . + q{--bfs_fofp_name %s/lane%d/8747_bfs_fofn.txt } . 
+ q{--sf_fofp_name %s/lane%d/8747_sf_fofn.txt --id_run 8747 } . + q{--bam_basecall_path %s --lanes %d --runfolder_path %s}, + $archive_path, $p, $archive_path, $p, $bam_basecall_path, $p, + $runfolder_path; }; my $c; diff --git a/t/20-function-p4_stage1_analysis.t b/t/20-function-p4_stage1_analysis.t index 12ac4976..1a87a296 100644 --- a/t/20-function-p4_stage1_analysis.t +++ b/t/20-function-p4_stage1_analysis.t @@ -1,12 +1,12 @@ use strict; use warnings; -use Test::More tests => 5; +use Test::More tests => 4; use Test::Exception; -use Cwd qw(getcwd abs_path); use File::Copy qw(cp); use File::Copy::Recursive qw(dircopy); use Perl6::Slurp; use JSON; +use File::Temp qw(tempdir); use t::util; @@ -14,15 +14,6 @@ my $util = t::util->new(clean_temp_directory => 1); my $dir = $util->temp_directory(); use_ok('npg_pipeline::function::p4_stage1_analysis'); -my $current = abs_path(getcwd()); - -# Copy cache dir to a temp location since a tag file will -# be created there. -my $new = "$dir/1234_samplesheet.csv"; -`cp -r t/data/p4_stage1_analysis/* $dir`; -local $ENV{NPG_CACHED_SAMPLESHEET_FILE} = $new; -local $ENV{'http_proxy'} = 'http://wibble.com'; -local $ENV{'no_proxy'} = q{}; my $default = { default => { @@ -44,59 +35,71 @@ my $repos_root = $dir . q{/srpipe_references}; `touch $repos_root/references/PhiX/default/all/fasta/phix_unsnipped_short_no_N.fa`; `touch $repos_root/references/PhiX/default/all/minimap2/phix_unsnipped_short_no_N.fa.mmi`; -$util->create_analysis(); -my $runfolder = $util->analysis_runfolder_path() . '/'; -cp('t/data/runfolder/Data/RunInfo.xml', $runfolder) or die 'Failed to copy run info'; -cp('t/data/run_params/runParameters.miseq.xml', $runfolder . 
'runParameters.xml') or - die 'Failed to copy run params'; - -my $bc_path = q{/nfs/sf45/IL2/analysis/123456_IL2_1234/Data/Intensities/BaseCalls}; - -my $bam_generator = npg_pipeline::function::p4_stage1_analysis->new( - run_folder => q{123456_IL2_1234}, +sub _create_runfolder { + my $rf_name = shift; + + my $tdir = tempdir(CLEANUP => 1); + my $id_run = 24347; + my $rf_info = $util->create_runfolder($tdir, + {'runfolder_name' => $rf_name, analysis_path => 'BAM_basecalls'}); + my $runfolder = $rf_info->{'runfolder_path'}; + my $bam_basecall_path = $rf_info->{'analysis_path'}; + + cp('t/data/miseq/24347_RunInfo.xml', "$runfolder/RunInfo.xml") + or die 'Failed to copy run info'; + cp('t/data/run_params/runParameters.miseq.xml', "$runfolder/runParameters.xml") + or die 'Failed to copy run params'; + + my $interop_dir = join q[/], $runfolder, 'InterOp'; + mkdir $interop_dir; + my $tm_file = 'TileMetricsOut.bin'; + cp("t/data/p4_stage1_analysis/$tm_file", "$interop_dir/$tm_file") + or die 'Failed to copy the InterOp file'; + mkdir join(q[/], $bam_basecall_path , "metadata_cache_${id_run}") + or die 'Failed to create directory'; + + return $rf_info; +} + +subtest 'check saving arguments' => sub { + plan tests => 25; + + my $id_run = 24347; + my $rf_name = '171114_MS6_24347_A_MS5534842-300V2'; + my $run_info = _create_runfolder($rf_name); + my $runfolder = $run_info->{'runfolder_path'}; + my $bam_basecall_path = $run_info->{'analysis_path'}; + my $no_cal_path = join q[/], $bam_basecall_path, 'no_cal'; + my $intensities_dir = $run_info->{'intensity_path'}; + my $timestamp = '20240514'; + + local $ENV{NPG_CACHED_SAMPLESHEET_FILE} = 't/data/p4_stage1_analysis/samplesheet.csv'; + + my $bam_generator = npg_pipeline::function::p4_stage1_analysis->new( repository => $repos_root, - runfolder_path => $util->analysis_runfolder_path(), - timestamp => q{20090709-123456}, + runfolder_path => $runfolder, verbose => 0, - id_run => 1234, + id_run => $id_run, _extra_tradis_transposon_read 
=> 1, - bam_basecall_path => $util->standard_bam_basecall_path(), - resource => $default -); - -mkdir join(q[/], $bam_generator->bam_basecall_path(), 'metadata_cache_1234') - or die 'Failed to create directory'; - -subtest 'basics' => sub { - plan tests => 5; - + bam_basecall_path => $bam_basecall_path, + resource => $default, + timestamp => $timestamp, + ); isa_ok($bam_generator, q{npg_pipeline::function::p4_stage1_analysis}, q{$bam_generator}); is($bam_generator->_extra_tradis_transposon_read, 1, 'TraDIS set'); $bam_generator->_extra_tradis_transposon_read(0); is($bam_generator->_extra_tradis_transposon_read, 0, 'TraDIS not set'); isa_ok($bam_generator->lims, 'st::api::lims', 'cached lims object'); - my $alims = $bam_generator->lims->children_ia; - my $position = 8; - is($bam_generator->_get_number_of_plexes_excluding_control($alims->{$position}), - 2, 'correct number of plexes'); -}; - -subtest 'check_save_arguments' => sub { - plan tests => 29; - - my $bbp = $bam_generator->bam_basecall_path; - my $unique = $bam_generator->_job_id(); my $da = $bam_generator->generate(); - ok ($da && @{$da}==8, 'eight definitions returned'); + ok ($da && @{$da}==1, 'one definition is returned'); my $d = $da->[0]; isa_ok ($d, 'npg_pipeline::function::definition'); is ($d->created_by, 'npg_pipeline::function::p4_stage1_analysis', 'created by'); - is ($d->created_on, q{20090709-123456}, 'created on'); - is ($d->identifier, 1234, 'identifier'); + is ($d->identifier, $id_run, 'identifier'); ok (!$d->excluded, 'step is not excluded'); is ($d->queue, 'p4stage1', 'special queue'); - is ($d->job_name, 'p4_stage1_analysis_1234_20090709-123456', 'job name'); + like ($d->job_name, qr/\Ap4_stage1_analysis_$id_run/, 'job name'); is ($d->fs_slots_num, 4, '4 sf slots'); is ($d->num_hosts, 1, 'one host'); is_deeply ($d->num_cpus, [8], 'num cpus as an array'); @@ -109,73 +112,70 @@ subtest 'check_save_arguments' => sub { isa_ok ($composition, 'npg_tracking::glossary::composition'); is 
($composition->num_components, 1, 'one component'); my $component = $composition->get_component(0); - is ($component->id_run, 1234, 'run id correct'); + is ($component->id_run, $id_run, 'run id correct'); is ($component->position, 1, 'position correct'); ok (!defined $component->tag_index, 'tag index undefined'); - my $intensities_dir = $dir . '/nfs/sf45/IL2/analysis/123456_IL2_1234/Data/Intensities'; - my $expected = { - '1' => 'bash -c \' cd ' . $intensities_dir . '/BAM_basecalls_09-07-2009/p4_stage1_analysis/lane1/log && vtfp.pl -template_path $(dirname $(readlink -f $(which vtfp.pl)))/../data/vtlib -o run_1234_1.json -param_vals ' . $intensities_dir . '/BAM_basecalls_09-07-2009/p4_stage1_analysis/lane1/param_files/1234_1_p4s1_pv_in.json -export_param_vals 1234_1_p4s1_pv_out_' . $unique . '.json -keys cfgdatadir -vals $(dirname $(readlink -f $(which vtfp.pl)))/../data/vtlib/ -keys aligner_numthreads -vals `npg_pipeline_job_env_to_threads --num_threads 8 --exclude -2 --divide 3` -keys s2b_mt_val -vals `npg_pipeline_job_env_to_threads --num_threads 8 --exclude -1 --divide 3` -keys bamsormadup_numthreads -vals `npg_pipeline_job_env_to_threads --num_threads 8 --divide 3` -keys br_numthreads_val -vals `npg_pipeline_job_env_to_threads --num_threads 8` $(dirname $(dirname $(readlink -f $(which vtfp.pl))))/data/vtlib/bcl2bam_phix_deplex_wtsi_stage1_template.json && viv.pl -s -x -v 3 -o viv_1234_1.log run_1234_1.json \'', - '2' => 'bash -c \' cd ' . $intensities_dir . '/BAM_basecalls_09-07-2009/p4_stage1_analysis/lane2/log && vtfp.pl -template_path $(dirname $(readlink -f $(which vtfp.pl)))/../data/vtlib -o run_1234_2.json -param_vals ' . $intensities_dir . '/BAM_basecalls_09-07-2009/p4_stage1_analysis/lane2/param_files/1234_2_p4s1_pv_in.json -export_param_vals 1234_2_p4s1_pv_out_' . $unique . 
'.json -keys cfgdatadir -vals $(dirname $(readlink -f $(which vtfp.pl)))/../data/vtlib/ -keys aligner_numthreads -vals `npg_pipeline_job_env_to_threads --num_threads 8 --exclude -2 --divide 3` -keys s2b_mt_val -vals `npg_pipeline_job_env_to_threads --num_threads 8 --exclude -1 --divide 3` -keys bamsormadup_numthreads -vals `npg_pipeline_job_env_to_threads --num_threads 8 --divide 3` -keys br_numthreads_val -vals `npg_pipeline_job_env_to_threads --num_threads 8` $(dirname $(dirname $(readlink -f $(which vtfp.pl))))/data/vtlib/bcl2bam_phix_deplex_wtsi_stage1_template.json && viv.pl -s -x -v 3 -o viv_1234_2.log run_1234_2.json \'', - '3' => 'bash -c \' cd ' . $intensities_dir . '/BAM_basecalls_09-07-2009/p4_stage1_analysis/lane3/log && vtfp.pl -template_path $(dirname $(readlink -f $(which vtfp.pl)))/../data/vtlib -o run_1234_3.json -param_vals ' . $intensities_dir . '/BAM_basecalls_09-07-2009/p4_stage1_analysis/lane3/param_files/1234_3_p4s1_pv_in.json -export_param_vals 1234_3_p4s1_pv_out_' . $unique . '.json -keys cfgdatadir -vals $(dirname $(readlink -f $(which vtfp.pl)))/../data/vtlib/ -keys aligner_numthreads -vals `npg_pipeline_job_env_to_threads --num_threads 8 --exclude -2 --divide 3` -keys s2b_mt_val -vals `npg_pipeline_job_env_to_threads --num_threads 8 --exclude -1 --divide 3` -keys bamsormadup_numthreads -vals `npg_pipeline_job_env_to_threads --num_threads 8 --divide 3` -keys br_numthreads_val -vals `npg_pipeline_job_env_to_threads --num_threads 8` $(dirname $(dirname $(readlink -f $(which vtfp.pl))))/data/vtlib/bcl2bam_phix_deplex_wtsi_stage1_template.json && viv.pl -s -x -v 3 -o viv_1234_3.log run_1234_3.json \'', - '4' => 'bash -c \' cd ' . $intensities_dir . '/BAM_basecalls_09-07-2009/p4_stage1_analysis/lane4/log && vtfp.pl -template_path $(dirname $(readlink -f $(which vtfp.pl)))/../data/vtlib -o run_1234_4.json -param_vals ' . $intensities_dir . 
'/BAM_basecalls_09-07-2009/p4_stage1_analysis/lane4/param_files/1234_4_p4s1_pv_in.json -export_param_vals 1234_4_p4s1_pv_out_' . $unique . '.json -keys cfgdatadir -vals $(dirname $(readlink -f $(which vtfp.pl)))/../data/vtlib/ -keys aligner_numthreads -vals `npg_pipeline_job_env_to_threads --num_threads 8 --exclude -2 --divide 3` -keys s2b_mt_val -vals `npg_pipeline_job_env_to_threads --num_threads 8 --exclude -1 --divide 3` -keys bamsormadup_numthreads -vals `npg_pipeline_job_env_to_threads --num_threads 8 --divide 3` -keys br_numthreads_val -vals `npg_pipeline_job_env_to_threads --num_threads 8` $(dirname $(dirname $(readlink -f $(which vtfp.pl))))/data/vtlib/bcl2bam_phix_deplex_wtsi_stage1_template.json && viv.pl -s -x -v 3 -o viv_1234_4.log run_1234_4.json \'', - '5' => 'bash -c \' cd ' . $intensities_dir . '/BAM_basecalls_09-07-2009/p4_stage1_analysis/lane5/log && vtfp.pl -template_path $(dirname $(readlink -f $(which vtfp.pl)))/../data/vtlib -o run_1234_5.json -param_vals ' . $intensities_dir . '/BAM_basecalls_09-07-2009/p4_stage1_analysis/lane5/param_files/1234_5_p4s1_pv_in.json -export_param_vals 1234_5_p4s1_pv_out_' . $unique . '.json -keys cfgdatadir -vals $(dirname $(readlink -f $(which vtfp.pl)))/../data/vtlib/ -keys aligner_numthreads -vals `npg_pipeline_job_env_to_threads --num_threads 8 --exclude -2 --divide 3` -keys s2b_mt_val -vals `npg_pipeline_job_env_to_threads --num_threads 8 --exclude -1 --divide 3` -keys bamsormadup_numthreads -vals `npg_pipeline_job_env_to_threads --num_threads 8 --divide 3` -keys br_numthreads_val -vals `npg_pipeline_job_env_to_threads --num_threads 8` $(dirname $(dirname $(readlink -f $(which vtfp.pl))))/data/vtlib/bcl2bam_phix_deplex_wtsi_stage1_template.json && viv.pl -s -x -v 3 -o viv_1234_5.log run_1234_5.json \'', - '6' => 'bash -c \' cd ' . $intensities_dir . 
'/BAM_basecalls_09-07-2009/p4_stage1_analysis/lane6/log && vtfp.pl -template_path $(dirname $(readlink -f $(which vtfp.pl)))/../data/vtlib -o run_1234_6.json -param_vals ' . $intensities_dir . '/BAM_basecalls_09-07-2009/p4_stage1_analysis/lane6/param_files/1234_6_p4s1_pv_in.json -export_param_vals 1234_6_p4s1_pv_out_' . $unique . '.json -keys cfgdatadir -vals $(dirname $(readlink -f $(which vtfp.pl)))/../data/vtlib/ -keys aligner_numthreads -vals `npg_pipeline_job_env_to_threads --num_threads 8 --exclude -2 --divide 3` -keys s2b_mt_val -vals `npg_pipeline_job_env_to_threads --num_threads 8 --exclude -1 --divide 3` -keys bamsormadup_numthreads -vals `npg_pipeline_job_env_to_threads --num_threads 8 --divide 3` -keys br_numthreads_val -vals `npg_pipeline_job_env_to_threads --num_threads 8` $(dirname $(dirname $(readlink -f $(which vtfp.pl))))/data/vtlib/bcl2bam_phix_deplex_wtsi_stage1_template.json && viv.pl -s -x -v 3 -o viv_1234_6.log run_1234_6.json \'', - '7' => 'bash -c \' cd ' . $intensities_dir . '/BAM_basecalls_09-07-2009/p4_stage1_analysis/lane7/log && vtfp.pl -template_path $(dirname $(readlink -f $(which vtfp.pl)))/../data/vtlib -o run_1234_7.json -param_vals ' . $intensities_dir . '/BAM_basecalls_09-07-2009/p4_stage1_analysis/lane7/param_files/1234_7_p4s1_pv_in.json -export_param_vals 1234_7_p4s1_pv_out_' . $unique . 
'.json -keys cfgdatadir -vals $(dirname $(readlink -f $(which vtfp.pl)))/../data/vtlib/ -keys aligner_numthreads -vals `npg_pipeline_job_env_to_threads --num_threads 8 --exclude -2 --divide 3` -keys s2b_mt_val -vals `npg_pipeline_job_env_to_threads --num_threads 8 --exclude -1 --divide 3` -keys bamsormadup_numthreads -vals `npg_pipeline_job_env_to_threads --num_threads 8 --divide 3` -keys br_numthreads_val -vals `npg_pipeline_job_env_to_threads --num_threads 8` $(dirname $(dirname $(readlink -f $(which vtfp.pl))))/data/vtlib/bcl2bam_phix_deplex_wtsi_stage1_template.json && viv.pl -s -x -v 3 -o viv_1234_7.log run_1234_7.json \'', - '8' => 'bash -c \' cd ' . $intensities_dir . '/BAM_basecalls_09-07-2009/p4_stage1_analysis/lane8/log && vtfp.pl -template_path $(dirname $(readlink -f $(which vtfp.pl)))/../data/vtlib -o run_1234_8.json -param_vals ' . $intensities_dir . '/BAM_basecalls_09-07-2009/p4_stage1_analysis/lane8/param_files/1234_8_p4s1_pv_in.json -export_param_vals 1234_8_p4s1_pv_out_' . $unique . '.json -keys cfgdatadir -vals $(dirname $(readlink -f $(which vtfp.pl)))/../data/vtlib/ -keys aligner_numthreads -vals `npg_pipeline_job_env_to_threads --num_threads 8 --exclude -2 --divide 3` -keys s2b_mt_val -vals `npg_pipeline_job_env_to_threads --num_threads 8 --exclude -1 --divide 3` -keys bamsormadup_numthreads -vals `npg_pipeline_job_env_to_threads --num_threads 8 --divide 3` -keys br_numthreads_val -vals `npg_pipeline_job_env_to_threads --num_threads 8` $(dirname $(dirname $(readlink -f $(which vtfp.pl))))/data/vtlib/bcl2bam_phix_deplex_wtsi_stage1_template.json && viv.pl -s -x -v 3 -o viv_1234_8.log run_1234_8.json \'', - }; - - foreach my $d (@{$da}) { - my $p = $d->composition()->get_component(0)->position(); - is ($d->command, $expected->{$p}, "command correct for lane $p"); - } - - my $pfname = $intensities_dir . 
q[/BAM_basecalls_09-07-2009/p4_stage1_analysis/lane1/param_files/1234_1_p4s1_pv_in.json]; + my $p4stage1_dir = "$bam_basecall_path/p4_stage1_analysis"; + my $unique = $bam_generator->_job_id(); + my $expected = q(bash -c ' cd ) . $p4stage1_dir . '/lane1/log && vtfp.pl -template_path ' . + '$(dirname $(readlink -f $(which vtfp.pl)))/../data/vtlib ' . + "-o run_${id_run}_1.json -param_vals " . $p4stage1_dir . + "/lane1/param_files/${id_run}_1_p4s1_pv_in.json -export_param_vals ${id_run}_1_p4s1_pv_out_${unique}.json " . + '-keys cfgdatadir -vals $(dirname $(readlink -f $(which vtfp.pl)))/../data/vtlib/ ' . + '-keys aligner_numthreads -vals `npg_pipeline_job_env_to_threads --num_threads 8 --exclude -2 --divide 3` ' . + '-keys s2b_mt_val -vals `npg_pipeline_job_env_to_threads --num_threads 8 --exclude -1 --divide 3` ' . + '-keys bamsormadup_numthreads -vals `npg_pipeline_job_env_to_threads --num_threads 8 --divide 3` ' . + '-keys br_numthreads_val -vals `npg_pipeline_job_env_to_threads --num_threads 8` ' . + '$(dirname $(dirname $(readlink -f $(which vtfp.pl))))/data/vtlib/bcl2bam_phix_deplex_wtsi_stage1_template.json ' . + "&& viv.pl -s -x -v 3 -o viv_${id_run}_1.log run_${id_run}_1.json " . q('); + + is ($da->[0]->command, $expected, 'command for lane 1'); + + my $pfname = $p4stage1_dir . "/lane1/param_files/${id_run}_1_p4s1_pv_in.json"; ok (-e $pfname, 'params file exists'); my $h = from_json(slurp($pfname)); - my $no_cal_path = $intensities_dir . '/BAM_basecalls_09-07-2009/no_cal'; - $expected = { 'assign' => [ { 'i2b_thread_count' => 8, - 'seqchksum_file' => $intensities_dir . '/BAM_basecalls_09-07-2009/1234_1.post_i2b.seqchksum', - 'scramble_reference_fasta' => $dir . '/srpipe_references/references/PhiX/default/all/fasta/phix_unsnipped_short_no_N.fa', - 'i2b_rg' => '1234_1', - 'i2b_pu' => '123456_IL2_1234_1', + 'seqchksum_file' => $bam_basecall_path . "/${id_run}_1.post_i2b.seqchksum", + 'scramble_reference_fasta' => $repos_root . 
'/references/PhiX/default/all/fasta/phix_unsnipped_short_no_N.fa', + 'i2b_rg' => "${id_run}_1", + 'i2b_pu' => "${rf_name}_1", 'tileviz_dir' => $no_cal_path . '/archive/lane1/tileviz', - 'reference_phix' => $dir . "/srpipe_references/references/PhiX/default/all/bwa0_6/phix_unsnipped_short_no_N.fa", - 'unfiltered_cram_file' => $no_cal_path . '/1234_1.unfiltered.cram', + 'reference_phix' => $repos_root . "/references/PhiX/default/all/bwa0_6/phix_unsnipped_short_no_N.fa", + 'unfiltered_cram_file' => $no_cal_path . "/${id_run}_1.unfiltered.cram", 'qc_check_qc_out_dir' => $no_cal_path . '/archive/lane1/qc', 'i2b_lane' => '1', 'bwa_executable' => 'bwa0_6', - 'filtered_bam' => $no_cal_path . '/1234_1.bam', + 'filtered_bam' => "${no_cal_path}/${id_run}_1.bam", 'samtools_executable' => 'samtools', 'i2b_library_name' => '51021', 'outdatadir' => $no_cal_path, 'subsetsubpath' => $no_cal_path . '/archive/lane1/.npg_cache_10000', - 'i2b_run_path' => $dir . q[/nfs/sf45/IL2/analysis/123456_IL2_1234], + 'i2b_run_path' => $runfolder, 'teepot_tempdir' => '.', 'split_prefix' => $no_cal_path, 'i2b_intensity_dir' => $intensities_dir, 'i2b_sample_aliases' => 'SRS000147', 'phix_alignment_method' => 'bwa_aln_se', - 'md5filename' => $no_cal_path . '/1234_1.bam.md5', + 'md5filename' => "${no_cal_path}/${id_run}_1.bam.md5", 'teepot_mval' => '2G', - 'i2b_runfolder' => '123456_IL2_1234', + 'i2b_runfolder' => $rf_name, 'i2b_study_name' => '"SRP000031: 1000Genomes Project Pilot 1"', 'i2b_basecalls_dir' => $intensities_dir . '/BaseCalls', 'teepot_wval' => '500', - 'qc_check_qc_in_dir' => $intensities_dir . 
'/BAM_basecalls_09-07-2009', - 'qc_check_id_run' => '1234', + 'qc_check_qc_in_dir' => $bam_basecall_path, + 'qc_check_id_run' => $id_run, 'cluster_count' => '500077065', - 'seed_frac' => '1234.00002000', + 'seed_frac' => "${id_run}.00002000", 'split_threads_val' => 4, 'aln_filter_value' => '0x900', 's1_se_pe' => 'se', 's1_output_format' => 'cram', - 'rpt_list' => '1234:1', + 'rpt_list' => "${id_run}:1", 'lane_archive_path' => $no_cal_path . '/archive/lane1', }, ], @@ -187,118 +187,96 @@ subtest 'check_save_arguments' => sub { }; is_deeply($h, $expected, 'correct json file content (for p4 stage1 params file)'); - }; -# check_save_arguments_minimap2 test duplicates check_save_arguments, but forces phix_aligment_method to minimap2 -$bam_generator = npg_pipeline::function::p4_stage1_analysis->new( - run_folder => q{123456_IL2_1234}, +subtest 'check_save_arguments_minimap2' => sub { + plan tests => 3; + + my $id_run = 24347; + my $rf_name = '171114_MS6_24347_A_MS5534842-300V2'; + my $run_info = _create_runfolder($rf_name); + my $runfolder = $run_info->{'runfolder_path'}; + my $bam_basecall_path = $run_info->{'analysis_path'}; + my $no_cal_path = "${bam_basecall_path}/no_cal"; + my $intensities_dir = $run_info->{'intensity_path'}; + my $p4stage1_dir = "${bam_basecall_path}/p4_stage1_analysis"; + + local $ENV{NPG_CACHED_SAMPLESHEET_FILE} = 't/data/p4_stage1_analysis/samplesheet.csv'; + + # check_save_arguments_minimap2 test duplicates check_save_arguments, + # but forces phix_aligment_method to minimap2 + my $bam_generator = npg_pipeline::function::p4_stage1_analysis->new( + run_folder => $rf_name, repository => $repos_root, - runfolder_path => $util->analysis_runfolder_path(), - timestamp => q{20090709-123456}, + runfolder_path => $runfolder, verbose => 0, - id_run => 1234, - bam_basecall_path => $util->standard_bam_basecall_path(), + id_run => $id_run, + bam_basecall_path => $bam_basecall_path, p4s1_phix_alignment_method => q{minimap2}, resource => $default ); -subtest 
'check_save_arguments_minimap2' => sub { - plan tests => 29; - - my $bbp = $bam_generator->bam_basecall_path; my $unique = $bam_generator->_job_id(); + my $expected = q(bash -c ' cd ) . $p4stage1_dir . '/lane1/log && vtfp.pl ' . + '-template_path $(dirname $(readlink -f $(which vtfp.pl)))/../data/vtlib ' . + "-o run_${id_run}_1.json -param_vals $p4stage1_dir/lane1/param_files/${id_run}_1_p4s1_pv_in.json " . + "-export_param_vals ${id_run}_1_p4s1_pv_out_${unique}.json " . + '-keys cfgdatadir -vals $(dirname $(readlink -f $(which vtfp.pl)))/../data/vtlib/ ' . + '-keys aligner_numthreads -vals `npg_pipeline_job_env_to_threads --num_threads 8 --exclude -2 --divide 3` ' . + '-keys s2b_mt_val -vals `npg_pipeline_job_env_to_threads --num_threads 8 --exclude -1 --divide 3` ' . + '-keys bamsormadup_numthreads -vals `npg_pipeline_job_env_to_threads --num_threads 8 --divide 3` ' . + '-keys br_numthreads_val -vals `npg_pipeline_job_env_to_threads --num_threads 8` ' . + '$(dirname $(dirname $(readlink -f $(which vtfp.pl))))/data/vtlib/bcl2bam_phix_deplex_wtsi_stage1_template.json ' . + "&& viv.pl -s -x -v 3 -o viv_${id_run}_1.log run_${id_run}_1.json" . 
q( '); my $da = $bam_generator->generate(); - ok ($da && @{$da}==8, 'eight definitions returned'); - my $d = $da->[0]; - isa_ok ($d, 'npg_pipeline::function::definition'); - is ($d->created_by, 'npg_pipeline::function::p4_stage1_analysis', 'created by'); - is ($d->created_on, q{20090709-123456}, 'created on'); - is ($d->identifier, 1234, 'identifier'); - ok (!$d->excluded, 'step is not excluded'); - is ($d->queue, 'p4stage1', 'special queue'); - is ($d->job_name, 'p4_stage1_analysis_1234_20090709-123456', 'job name'); - is ($d->fs_slots_num, 4, '4 sf slots'); - is ($d->num_hosts, 1, 'one host'); - is_deeply ($d->num_cpus, [8], 'num cpus as an array'); - is ($d->memory, 20000, 'memory'); - is ($d->command_preexec, - "npg_pipeline_preexec_references --repository $repos_root", - 'preexec command'); - ok ($d->has_composition, 'composition object is set'); - my $composition = $d->composition; - isa_ok ($composition, 'npg_tracking::glossary::composition'); - is ($composition->num_components, 1, 'one component'); - my $component = $composition->get_component(0); - is ($component->id_run, 1234, 'run id correct'); - is ($component->position, 1, 'position correct'); - ok (!defined $component->tag_index, 'tag index undefined'); + is ($da->[0]->command, $expected, "command correct for lane 1"); - my $intensities_dir = $dir . '/nfs/sf45/IL2/analysis/123456_IL2_1234/Data/Intensities'; - my $expected = { - '1' => 'bash -c \' cd ' . $intensities_dir . '/BAM_basecalls_09-07-2009/p4_stage1_analysis/lane1/log && vtfp.pl -template_path $(dirname $(readlink -f $(which vtfp.pl)))/../data/vtlib -o run_1234_1.json -param_vals ' . $intensities_dir . '/BAM_basecalls_09-07-2009/p4_stage1_analysis/lane1/param_files/1234_1_p4s1_pv_in.json -export_param_vals 1234_1_p4s1_pv_out_' . $unique . 
'.json -keys cfgdatadir -vals $(dirname $(readlink -f $(which vtfp.pl)))/../data/vtlib/ -keys aligner_numthreads -vals `npg_pipeline_job_env_to_threads --num_threads 8 --exclude -2 --divide 3` -keys s2b_mt_val -vals `npg_pipeline_job_env_to_threads --num_threads 8 --exclude -1 --divide 3` -keys bamsormadup_numthreads -vals `npg_pipeline_job_env_to_threads --num_threads 8 --divide 3` -keys br_numthreads_val -vals `npg_pipeline_job_env_to_threads --num_threads 8` $(dirname $(dirname $(readlink -f $(which vtfp.pl))))/data/vtlib/bcl2bam_phix_deplex_wtsi_stage1_template.json && viv.pl -s -x -v 3 -o viv_1234_1.log run_1234_1.json \'', - '2' => 'bash -c \' cd ' . $intensities_dir . '/BAM_basecalls_09-07-2009/p4_stage1_analysis/lane2/log && vtfp.pl -template_path $(dirname $(readlink -f $(which vtfp.pl)))/../data/vtlib -o run_1234_2.json -param_vals ' . $intensities_dir . '/BAM_basecalls_09-07-2009/p4_stage1_analysis/lane2/param_files/1234_2_p4s1_pv_in.json -export_param_vals 1234_2_p4s1_pv_out_' . $unique . '.json -keys cfgdatadir -vals $(dirname $(readlink -f $(which vtfp.pl)))/../data/vtlib/ -keys aligner_numthreads -vals `npg_pipeline_job_env_to_threads --num_threads 8 --exclude -2 --divide 3` -keys s2b_mt_val -vals `npg_pipeline_job_env_to_threads --num_threads 8 --exclude -1 --divide 3` -keys bamsormadup_numthreads -vals `npg_pipeline_job_env_to_threads --num_threads 8 --divide 3` -keys br_numthreads_val -vals `npg_pipeline_job_env_to_threads --num_threads 8` $(dirname $(dirname $(readlink -f $(which vtfp.pl))))/data/vtlib/bcl2bam_phix_deplex_wtsi_stage1_template.json && viv.pl -s -x -v 3 -o viv_1234_2.log run_1234_2.json \'', - '3' => 'bash -c \' cd ' . $intensities_dir . '/BAM_basecalls_09-07-2009/p4_stage1_analysis/lane3/log && vtfp.pl -template_path $(dirname $(readlink -f $(which vtfp.pl)))/../data/vtlib -o run_1234_3.json -param_vals ' . $intensities_dir . 
'/BAM_basecalls_09-07-2009/p4_stage1_analysis/lane3/param_files/1234_3_p4s1_pv_in.json -export_param_vals 1234_3_p4s1_pv_out_' . $unique . '.json -keys cfgdatadir -vals $(dirname $(readlink -f $(which vtfp.pl)))/../data/vtlib/ -keys aligner_numthreads -vals `npg_pipeline_job_env_to_threads --num_threads 8 --exclude -2 --divide 3` -keys s2b_mt_val -vals `npg_pipeline_job_env_to_threads --num_threads 8 --exclude -1 --divide 3` -keys bamsormadup_numthreads -vals `npg_pipeline_job_env_to_threads --num_threads 8 --divide 3` -keys br_numthreads_val -vals `npg_pipeline_job_env_to_threads --num_threads 8` $(dirname $(dirname $(readlink -f $(which vtfp.pl))))/data/vtlib/bcl2bam_phix_deplex_wtsi_stage1_template.json && viv.pl -s -x -v 3 -o viv_1234_3.log run_1234_3.json \'', - '4' => 'bash -c \' cd ' . $intensities_dir . '/BAM_basecalls_09-07-2009/p4_stage1_analysis/lane4/log && vtfp.pl -template_path $(dirname $(readlink -f $(which vtfp.pl)))/../data/vtlib -o run_1234_4.json -param_vals ' . $intensities_dir . '/BAM_basecalls_09-07-2009/p4_stage1_analysis/lane4/param_files/1234_4_p4s1_pv_in.json -export_param_vals 1234_4_p4s1_pv_out_' . $unique . '.json -keys cfgdatadir -vals $(dirname $(readlink -f $(which vtfp.pl)))/../data/vtlib/ -keys aligner_numthreads -vals `npg_pipeline_job_env_to_threads --num_threads 8 --exclude -2 --divide 3` -keys s2b_mt_val -vals `npg_pipeline_job_env_to_threads --num_threads 8 --exclude -1 --divide 3` -keys bamsormadup_numthreads -vals `npg_pipeline_job_env_to_threads --num_threads 8 --divide 3` -keys br_numthreads_val -vals `npg_pipeline_job_env_to_threads --num_threads 8` $(dirname $(dirname $(readlink -f $(which vtfp.pl))))/data/vtlib/bcl2bam_phix_deplex_wtsi_stage1_template.json && viv.pl -s -x -v 3 -o viv_1234_4.log run_1234_4.json \'', - '5' => 'bash -c \' cd ' . $intensities_dir . 
'/BAM_basecalls_09-07-2009/p4_stage1_analysis/lane5/log && vtfp.pl -template_path $(dirname $(readlink -f $(which vtfp.pl)))/../data/vtlib -o run_1234_5.json -param_vals ' . $intensities_dir . '/BAM_basecalls_09-07-2009/p4_stage1_analysis/lane5/param_files/1234_5_p4s1_pv_in.json -export_param_vals 1234_5_p4s1_pv_out_' . $unique . '.json -keys cfgdatadir -vals $(dirname $(readlink -f $(which vtfp.pl)))/../data/vtlib/ -keys aligner_numthreads -vals `npg_pipeline_job_env_to_threads --num_threads 8 --exclude -2 --divide 3` -keys s2b_mt_val -vals `npg_pipeline_job_env_to_threads --num_threads 8 --exclude -1 --divide 3` -keys bamsormadup_numthreads -vals `npg_pipeline_job_env_to_threads --num_threads 8 --divide 3` -keys br_numthreads_val -vals `npg_pipeline_job_env_to_threads --num_threads 8` $(dirname $(dirname $(readlink -f $(which vtfp.pl))))/data/vtlib/bcl2bam_phix_deplex_wtsi_stage1_template.json && viv.pl -s -x -v 3 -o viv_1234_5.log run_1234_5.json \'', - '6' => 'bash -c \' cd ' . $intensities_dir . '/BAM_basecalls_09-07-2009/p4_stage1_analysis/lane6/log && vtfp.pl -template_path $(dirname $(readlink -f $(which vtfp.pl)))/../data/vtlib -o run_1234_6.json -param_vals ' . $intensities_dir . '/BAM_basecalls_09-07-2009/p4_stage1_analysis/lane6/param_files/1234_6_p4s1_pv_in.json -export_param_vals 1234_6_p4s1_pv_out_' . $unique . 
'.json -keys cfgdatadir -vals $(dirname $(readlink -f $(which vtfp.pl)))/../data/vtlib/ -keys aligner_numthreads -vals `npg_pipeline_job_env_to_threads --num_threads 8 --exclude -2 --divide 3` -keys s2b_mt_val -vals `npg_pipeline_job_env_to_threads --num_threads 8 --exclude -1 --divide 3` -keys bamsormadup_numthreads -vals `npg_pipeline_job_env_to_threads --num_threads 8 --divide 3` -keys br_numthreads_val -vals `npg_pipeline_job_env_to_threads --num_threads 8` $(dirname $(dirname $(readlink -f $(which vtfp.pl))))/data/vtlib/bcl2bam_phix_deplex_wtsi_stage1_template.json && viv.pl -s -x -v 3 -o viv_1234_6.log run_1234_6.json \'', - '7' => 'bash -c \' cd ' . $intensities_dir . '/BAM_basecalls_09-07-2009/p4_stage1_analysis/lane7/log && vtfp.pl -template_path $(dirname $(readlink -f $(which vtfp.pl)))/../data/vtlib -o run_1234_7.json -param_vals ' . $intensities_dir . '/BAM_basecalls_09-07-2009/p4_stage1_analysis/lane7/param_files/1234_7_p4s1_pv_in.json -export_param_vals 1234_7_p4s1_pv_out_' . $unique . '.json -keys cfgdatadir -vals $(dirname $(readlink -f $(which vtfp.pl)))/../data/vtlib/ -keys aligner_numthreads -vals `npg_pipeline_job_env_to_threads --num_threads 8 --exclude -2 --divide 3` -keys s2b_mt_val -vals `npg_pipeline_job_env_to_threads --num_threads 8 --exclude -1 --divide 3` -keys bamsormadup_numthreads -vals `npg_pipeline_job_env_to_threads --num_threads 8 --divide 3` -keys br_numthreads_val -vals `npg_pipeline_job_env_to_threads --num_threads 8` $(dirname $(dirname $(readlink -f $(which vtfp.pl))))/data/vtlib/bcl2bam_phix_deplex_wtsi_stage1_template.json && viv.pl -s -x -v 3 -o viv_1234_7.log run_1234_7.json \'', - '8' => 'bash -c \' cd ' . $intensities_dir . '/BAM_basecalls_09-07-2009/p4_stage1_analysis/lane8/log && vtfp.pl -template_path $(dirname $(readlink -f $(which vtfp.pl)))/../data/vtlib -o run_1234_8.json -param_vals ' . $intensities_dir . 
'/BAM_basecalls_09-07-2009/p4_stage1_analysis/lane8/param_files/1234_8_p4s1_pv_in.json -export_param_vals 1234_8_p4s1_pv_out_' . $unique . '.json -keys cfgdatadir -vals $(dirname $(readlink -f $(which vtfp.pl)))/../data/vtlib/ -keys aligner_numthreads -vals `npg_pipeline_job_env_to_threads --num_threads 8 --exclude -2 --divide 3` -keys s2b_mt_val -vals `npg_pipeline_job_env_to_threads --num_threads 8 --exclude -1 --divide 3` -keys bamsormadup_numthreads -vals `npg_pipeline_job_env_to_threads --num_threads 8 --divide 3` -keys br_numthreads_val -vals `npg_pipeline_job_env_to_threads --num_threads 8` $(dirname $(dirname $(readlink -f $(which vtfp.pl))))/data/vtlib/bcl2bam_phix_deplex_wtsi_stage1_template.json && viv.pl -s -x -v 3 -o viv_1234_8.log run_1234_8.json \'', - }; - - foreach my $d (@{$da}) { - my $p = $d->composition()->get_component(0)->position(); - is ($d->command, $expected->{$p}, "command correct for lane $p"); - } - - my $pfname = $bbp . q[/p4_stage1_analysis/lane1/param_files/1234_1_p4s1_pv_in.json]; + my $pfname = "${p4stage1_dir}/lane1/param_files/${id_run}_1_p4s1_pv_in.json"; ok (-e $pfname, 'params file exists'); my $h = from_json(slurp($pfname)); - my $no_cal_path = $intensities_dir . '/BAM_basecalls_09-07-2009/no_cal'; - $expected = { 'assign' => [ { 'i2b_thread_count' => 8, - 'seqchksum_file' => $intensities_dir . '/BAM_basecalls_09-07-2009/1234_1.post_i2b.seqchksum', - 'scramble_reference_fasta' => $dir . '/srpipe_references/references/PhiX/default/all/fasta/phix_unsnipped_short_no_N.fa', - 'i2b_rg' => '1234_1', - 'i2b_pu' => '123456_IL2_1234_1', + 'seqchksum_file' => "$bam_basecall_path/${id_run}_1.post_i2b.seqchksum", + 'scramble_reference_fasta' => $repos_root . '/references/PhiX/default/all/fasta/phix_unsnipped_short_no_N.fa', + 'i2b_rg' => "${id_run}_1", + 'i2b_pu' => "${rf_name}_1", 'tileviz_dir' => $no_cal_path . '/archive/lane1/tileviz', - 'reference_phix' => $dir . 
'/srpipe_references/references/PhiX/default/all/minimap2/phix_unsnipped_short_no_N.fa.mmi', - 'unfiltered_cram_file' => $no_cal_path . '/1234_1.unfiltered.cram', + 'reference_phix' => $repos_root . '/references/PhiX/default/all/minimap2/phix_unsnipped_short_no_N.fa.mmi', + 'unfiltered_cram_file' => "${no_cal_path}/${id_run}_1.unfiltered.cram", 'qc_check_qc_out_dir' => $no_cal_path . '/archive/lane1/qc', 'i2b_lane' => '1', 'bwa_executable' => 'bwa0_6', - 'filtered_bam' => $no_cal_path . '/1234_1.bam', + 'filtered_bam' => "${no_cal_path}/${id_run}_1.bam", 'samtools_executable' => 'samtools', 'i2b_library_name' => '51021', 'outdatadir' => $no_cal_path, 'subsetsubpath' => $no_cal_path . '/archive/lane1/.npg_cache_10000', - 'i2b_run_path' => $dir . q[/nfs/sf45/IL2/analysis/123456_IL2_1234], + 'i2b_run_path' => $runfolder, 'teepot_tempdir' => '.', 'split_prefix' => $no_cal_path, 'i2b_intensity_dir' => $intensities_dir, 'i2b_sample_aliases' => 'SRS000147', 'phix_alignment_method' => 'minimap2', - 'md5filename' => $no_cal_path . '/1234_1.bam.md5', + 'md5filename' => "${no_cal_path}/${id_run}_1.bam.md5", 'teepot_mval' => '2G', - 'i2b_runfolder' => '123456_IL2_1234', + 'i2b_runfolder' => $rf_name, 'i2b_study_name' => '"SRP000031: 1000Genomes Project Pilot 1"', 'i2b_basecalls_dir' => $intensities_dir . '/BaseCalls', 'teepot_wval' => '500', - 'qc_check_qc_in_dir' => $intensities_dir . '/BAM_basecalls_09-07-2009', - 'qc_check_id_run' => '1234', + 'qc_check_qc_in_dir' => $bam_basecall_path, + 'qc_check_id_run' => $id_run, 'cluster_count' => '500077065', - 'seed_frac' => '1234.00002000', + 'seed_frac' => "${id_run}.00002000", 'split_threads_val' => 4, 'aln_filter_value' => '0x900', 's1_se_pe' => 'se', 's1_output_format' => 'cram', 'lane_archive_path' => $no_cal_path . 
'/archive/lane1', - 'rpt_list' => '1234:1', + 'rpt_list' => "${id_run}:1", }, ], 'ops' => { @@ -309,26 +287,21 @@ subtest 'check_save_arguments_minimap2' => sub { }; is_deeply($h, $expected, 'correct json file content (for p4 stage1 params file)'); - }; - -# check_duplex-seq test +}; subtest 'check_duplex-seq' => sub { plan tests => 29; - my $rf_name = '210111_A00513_0447_AHJ55JDSXY'; - my $rfpath = abs_path(getcwd) . qq{/t/data/novaseq/$rf_name}; - my $copy = join q[/], $dir, $rf_name; - dircopy $rfpath, $copy or die 'Failed to copy run folder'; - $rfpath = $copy; - my $id_run = 36062; + my $rf_name = '210111_A00513_0447_AHJ55JDSXY'; + my $rfpath = join q[/], $dir, $rf_name; + dircopy qq{t/data/novaseq/$rf_name}, $rfpath or die 'Failed to copy run folder'; my $bbp = qq{$rfpath/Data/Intensities/BAM_basecalls_20210113-092146}; local $ENV{NPG_CACHED_SAMPLESHEET_FILE} = qq{$bbp/metadata_cache_36062/samplesheet_36062.csv}; - $bam_generator = npg_pipeline::function::p4_stage1_analysis->new( + my $bam_generator = npg_pipeline::function::p4_stage1_analysis->new( run_folder => $rf_name, repository => $repos_root, runfolder_path => $rfpath, @@ -572,7 +545,7 @@ subtest 'check_duplex-seq' => sub { }; is_deeply($h, $expected, 'correct json file content (for p4 stage1 params file)'); - }; +}; 1; diff --git a/t/20-function-seqchksum_comparator.t b/t/20-function-seqchksum_comparator.t index 645c4d9a..7ff617a8 100644 --- a/t/20-function-seqchksum_comparator.t +++ b/t/20-function-seqchksum_comparator.t @@ -1,39 +1,57 @@ use strict; use warnings; -use Test::More tests => 17; +use Test::More tests => 15; use Test::Exception; use Log::Log4perl qw(:levels); +use File::Path qw(make_path); +use File::Copy; + use t::util; my $util = t::util->new(); my $tmp_dir = $util->temp_directory(); -local $ENV{NPG_CACHED_SAMPLESHEET_FILE} = q[t/data/samplesheet_1234.csv]; -# if REF_PATH is not set, force using ref defined in the header -local $ENV{REF_PATH} = $ENV{REF_PATH} ? 
$ENV{REF_PATH} : 'DUMMY'; - Log::Log4perl->easy_init({layout => '%d %-5p %c - %m%n', level => $DEBUG, file => join(q[/], $tmp_dir, 'logfile'), utf8 => 1}); -use_ok( q{npg_pipeline::function::seqchksum_comparator} ); +my $test_data_dir_47995 = 't/data/novaseqx/20231017_LH00210_0012_B22FCNFLT3'; + +sub _setup_runfolder_47995 { + my $timestamp = shift; + my @dirs = split q[/], $test_data_dir_47995; + my $rf_name = pop @dirs; + my $rf_info = $util->create_runfolder($tmp_dir, {'runfolder_name' => $rf_name}); + my $rf = $rf_info->{'runfolder_path'}; + for my $file (qw(RunInfo.xml RunParameters.xml)) { + if (copy("$test_data_dir_47995/$file", "$rf/$file") == 0) { + die "Failed to copy $file"; + } + } + my $bam_basecall_path = $rf . "/Data/Intensities/BAM_basecalls_$timestamp"; + my $archive_path = $bam_basecall_path . q{/no_cal/archive}; + make_path($archive_path); + $rf_info->{'bam_basecall_path'} = $bam_basecall_path; + $rf_info->{'archive_path'} = $archive_path; + return $rf_info; +} -$util->create_analysis(); +use_ok( q{npg_pipeline::function::seqchksum_comparator} ); my $timestamp = q{09-07-2009}; -my $analysis_runfolder_path = $util->analysis_runfolder_path(); -my $bam_basecall_path = $analysis_runfolder_path . "/Data/Intensities/BAM_basecalls_$timestamp/"; -my $recalibrated_path = $analysis_runfolder_path. "/Data/Intensities/BAM_basecalls_$timestamp/no_cal"; -my $archive_path = $recalibrated_path . 
q{/archive}; +my $rf_info = _setup_runfolder_47995($timestamp); +my $archive_path = $rf_info->{'archive_path'}; +my $bam_basecall_path = $rf_info->{'bam_basecall_path'}; my %init = ( - run_folder => q{123456_IL2_1234}, - runfolder_path => $analysis_runfolder_path, + runfolder_path => $rf_info->{'runfolder_path'}, archive_path => $archive_path, bam_basecall_path => $bam_basecall_path, - id_run => 1234, + id_run => 47995, is_indexed => 0, + timestamp => $timestamp, + lanes => [1,2], resource => { default => { minimum_cpu => 1, @@ -43,13 +61,11 @@ my %init = ( ); { + local $ENV{NPG_CACHED_SAMPLESHEET_FILE} = "$test_data_dir_47995/samplesheet_47995.csv"; + my $object; lives_ok { - $object = npg_pipeline::function::seqchksum_comparator->new( - %init, - timestamp => $timestamp, - lanes => [1,2], - ); + $object = npg_pipeline::function::seqchksum_comparator->new(%init); } q{object ok}; isa_ok( $object, q{npg_pipeline::function::seqchksum_comparator}); @@ -60,83 +76,22 @@ my %init = ( is ($d->created_by, q{npg_pipeline::function::seqchksum_comparator}, 'created_by is correct'); is ($d->created_on, $object->timestamp, 'created_on is correct'); - is ($d->identifier, 1234, 'identifier is set correctly'); - ok (!$d->has_composition, 'no composition is not set'); - is ($d->job_name, q{seqchksum_comparator_1234_09-07-2009}, + is ($d->identifier, 47995, 'identifier is set correctly'); + ok (!$d->has_composition, 'composition is not set'); + is ($d->job_name, q{seqchksum_comparator_47995_09-07-2009}, 'job_name is correct'); my $rp = $object->recalibrated_path; is ($d->command, - q{npg_pipeline_seqchksum_comparator --id_run=1234 --lanes=1 --lanes=2} . + q{npg_pipeline_seqchksum_comparator --id_run=47995 --lanes=1 --lanes=2} . qq{ --archive_path=$archive_path --bam_basecall_path=$bam_basecall_path} . 
- qq{ --input_fofg_name=$rp/1234_input_fofn.txt}, + qq{ --input_fofg_name=$rp/47995_input_fofn.txt}, 'command is correct'); ok (!$d->excluded, 'step not excluded'); is ($d->queue, 'default', 'default queue'); lives_ok {$d->freeze()} 'definition can be serialized to JSON'; - throws_ok{$object->do_comparison()} qr/Failed to change directory/, + throws_ok{$object->do_comparison()} qr/Failed to run command seqchksum_merge.pl/, q{Doing a comparison with no files throws an exception}; - -############# -############# -############# -############# -## my $seqchksum_contents1 = <<'END1'; -## ### set count b_seq name_b_seq b_seq_qual b_seq_tags(BC,FI,QT,RT,TC) -## all all 19821774 3a58186f 29528f13 7bf272c0 30e0b9ef -## all pass 19821774 3a58186f 29528f13 7bf272c0 30e0b9ef -## all 0 1 1 1 1 -## pass 0 1 1 1 1 -## 1#0 all 3865560 4aebf9cb 63f4ad67 3d54f814 5c3f971f -## 1#0 pass 3865560 4aebf9cb 63f4ad67 3d54f814 5c3f971f -## 1#2 all 15956214 504ab7d8 28428e9b 643c096e 3cbf1e96 -## 1#2 pass 15956214 504ab7d8 28428e9b 643c096e 3cbf1e96}; -## END1 -## -## system "mkdir -p $archive_path/lane1"; -## system "cp -p t/data/runfolder/archive/lane1/1234_1#15.cram $archive_path/lane1"; -## -## system "mkdir -p $archive_path/lane2"; -## system "cp -p t/data/runfolder/archive/lane1/1234_1#15.cram $archive_path/lane2/1234_2#15.cram"; -## system "cp -p t/data/runfolder/archive/lane1/1234_1#15.seqchksum $archive_path/lane2/1234_2#15.seqchksum"; -## -## open my $seqchksum_fh1, '>', "$bam_basecall_path/1234_1.post_i2b.seqchksum" or die "Cannot open file for writing"; -## print $seqchksum_fh1 $seqchksum_contents1 or die $!; -## close $seqchksum_fh1 or die $!; -## -## SKIP: { -## skip 'no tools', 2 if ((not $ENV{TOOLS_INSTALLED}) and (system(q(which bamseqchksum)) or system(q(which scramble)))); -## TODO: { local $TODO= q(scramble doesn't through an exception when converting an empty bam file to cram it just writes a cram files with a @PG ID:scramble .. 
line); -## throws_ok{$object->do_comparison()} qr/Failed to run command bamcat /, q{Doing a comparison with empty bam files throws an exception}; -## } -## -## system "cp -p t/data/seqchksum/sorted.cram $archive_path/lane1/1234_1#15.cram"; -## system "cp -p t/data/seqchksum/sorted.cram $archive_path/lane2/1234_2#15.cram"; -## -## throws_ok { $object->do_comparison() } -## qr/seqchksum for post_i2b and product are different/, -## q{Doing a comparison with different bam files throws an exception}; -## } -############# -############# -############# -############# -} - -{ - my $object = npg_pipeline::function::seqchksum_comparator->new( - %init, - lanes => [1], - ); - my $da = $object->create(); - ok ($da && @{$da} == 1, 'an array with one definitions is returned'); - - $object = npg_pipeline::function::seqchksum_comparator->new( - %init - ); - $da = $object->create(); - # seqchksum_comparator is now a run-level function, so only one definition returned - ok ($da && @{$da} == 1, 'an array with one definition is returned for eight lanes'); } 1; diff --git a/t/20-function-start_stop.t b/t/20-function-start_stop.t index 62a8bbaa..abe0f577 100644 --- a/t/20-function-start_stop.t +++ b/t/20-function-start_stop.t @@ -1,14 +1,13 @@ use strict; use warnings; use Test::More tests => 3; -use Test::Exception; -use File::Copy; -use File::Basename; use t::util; my $util = t::util->new(); -my $runfolder_path = $util->analysis_runfolder_path(); +my $temp_dir = join q[/], $util->temp_directory(), 'analysis'; +my $rf_info = $util->create_runfolder($temp_dir); +my $runfolder_path = $rf_info->{'runfolder_path'}; use_ok('npg_pipeline::function::start_stop'); diff --git a/t/data/bam_flagstats/1234_1_bam_flagstats.json b/t/data/bam_flagstats/1234_1_bam_flagstats.json deleted file mode 100644 index 649bfbad..00000000 --- a/t/data/bam_flagstats/1234_1_bam_flagstats.json +++ /dev/null @@ -1 +0,0 @@ 
-{"paired_mapped_reads":10,"unpaired_read_duplicates":0,"position":"1","library":"170111_A1_kapaoriginal 1671118","histogram":{},"paired_read_duplicates":0,"info":{"Samtools":"0.1.11 (r851)","Picard-tools":"1.36"},"mate_mapped_defferent_chr":0,"unmapped_reads":7,"unpaired_mapped_reads":3,"id_run":"1234","__CLASS__":"npg_qc::autoqc::results::bam_flagstats-9824","proper_mapped_pair":0,"mate_mapped_defferent_chr_5":0,"read_pair_optical_duplicates":0,"human_split":"all"} diff --git a/t/data/bam_flagstats/1234_3_bam_flagstats.json b/t/data/bam_flagstats/1234_3_bam_flagstats.json deleted file mode 100644 index f7818f12..00000000 --- a/t/data/bam_flagstats/1234_3_bam_flagstats.json +++ /dev/null @@ -1 +0,0 @@ -{"paired_mapped_reads":4,"unpaired_read_duplicates":0,"position":"3","library":"170111_A1_kapaoriginal 1671118","histogram":{},"paired_read_duplicates":0,"info":{"Samtools":"0.1.11 (r851)","Picard-tools":"1.36"},"mate_mapped_defferent_chr":0,"unmapped_reads":6,"unpaired_mapped_reads":2,"id_run":"1234","__CLASS__":"npg_qc::autoqc::results::bam_flagstats-9824","proper_mapped_pair":0,"mate_mapped_defferent_chr_5":0,"read_pair_optical_duplicates":0,"human_split":"all"} diff --git a/t/data/bam_flagstats/1234_3_phix_bam_flagstats.json b/t/data/bam_flagstats/1234_3_phix_bam_flagstats.json deleted file mode 100644 index 64cf1a78..00000000 --- a/t/data/bam_flagstats/1234_3_phix_bam_flagstats.json +++ /dev/null @@ -1 +0,0 @@ -{"paired_mapped_reads":4,"unpaired_read_duplicates":0,"position":"3","library":"170111_A1_kapaoriginal 1671118","histogram":{},"paired_read_duplicates":0,"info":{"Samtools":"0.1.11 (r851)","Picard-tools":"1.36"},"mate_mapped_defferent_chr":0,"unmapped_reads":2,"unpaired_mapped_reads":4,"id_run":"1234","__CLASS__":"npg_qc::autoqc::results::bam_flagstats-9824","proper_mapped_pair":0,"mate_mapped_defferent_chr_5":0,"read_pair_optical_duplicates":0,"human_split":"phix"} diff --git a/t/data/bam_flagstats/1234_4_bam_flagstats.json 
b/t/data/bam_flagstats/1234_4_bam_flagstats.json deleted file mode 100644 index cca799d1..00000000 --- a/t/data/bam_flagstats/1234_4_bam_flagstats.json +++ /dev/null @@ -1 +0,0 @@ -{"paired_mapped_reads":10,"unpaired_read_duplicates":0,"position":"4","library":"170111_A1_kapaoriginal 1671118","histogram":{},"paired_read_duplicates":0,"info":{"Samtools":"0.1.11 (r851)","Picard-tools":"1.36"},"mate_mapped_defferent_chr":0,"unmapped_reads":7,"unpaired_mapped_reads":3,"id_run":"1234","__CLASS__":"npg_qc::autoqc::results::bam_flagstats-9824","proper_mapped_pair":0,"mate_mapped_defferent_chr_5":0,"read_pair_optical_duplicates":0,"human_split":"all"} diff --git a/t/data/bam_flagstats/47995_3_bam_flagstats.json b/t/data/bam_flagstats/47995_3_bam_flagstats.json new file mode 100644 index 00000000..7eba2ac9 --- /dev/null +++ b/t/data/bam_flagstats/47995_3_bam_flagstats.json @@ -0,0 +1,21 @@ +{ + "paired_mapped_reads": 4, + "unpaired_read_duplicates": 0, + "position": "3", + "library": "170111_A1_kapaoriginal 1671118", + "histogram": {}, + "paired_read_duplicates": 0, + "info": { + "Samtools": "0.1.11 (r851)", + "Picard-tools": "1.36" + }, + "mate_mapped_defferent_chr": 0, + "unmapped_reads": 6, + "unpaired_mapped_reads": 2, + "id_run": "47995", + "__CLASS__": "npg_qc::autoqc::results::bam_flagstats-9824", + "proper_mapped_pair": 0, + "mate_mapped_defferent_chr_5": 0, + "read_pair_optical_duplicates": 0, + "human_split": "all" +} diff --git a/t/data/bam_flagstats/47995_3_phix_bam_flagstats.json b/t/data/bam_flagstats/47995_3_phix_bam_flagstats.json new file mode 100644 index 00000000..229174a2 --- /dev/null +++ b/t/data/bam_flagstats/47995_3_phix_bam_flagstats.json @@ -0,0 +1,21 @@ +{ + "paired_mapped_reads": 4, + "unpaired_read_duplicates": 0, + "position": "3", + "library": "170111_A1_kapaoriginal 1671118", + "histogram": {}, + "paired_read_duplicates": 0, + "info": { + "Samtools": "0.1.11 (r851)", + "Picard-tools": "1.36" + }, + "mate_mapped_defferent_chr": 0, + 
"unmapped_reads": 2, + "unpaired_mapped_reads": 4, + "id_run": "47995", + "__CLASS__": "npg_qc::autoqc::results::bam_flagstats-9824", + "proper_mapped_pair": 0, + "mate_mapped_defferent_chr_5": 0, + "read_pair_optical_duplicates": 0, + "human_split": "phix" +} diff --git a/t/data/miseq/24347_RunInfo.xml b/t/data/miseq/24347_RunInfo.xml new file mode 100755 index 00000000..00f9a978 --- /dev/null +++ b/t/data/miseq/24347_RunInfo.xml @@ -0,0 +1,14 @@ + + + + 000000000-BF5NJ + M02069 + 171114 + + + + + + + + diff --git a/t/data/p4_stage1_analysis/1234_samplesheet.csv b/t/data/p4_stage1_analysis/1234_samplesheet.csv deleted file mode 100644 index 6069d6ad..00000000 --- a/t/data/p4_stage1_analysis/1234_samplesheet.csv +++ /dev/null @@ -1,27 +0,0 @@ -[Header],,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, -Investigator Name,pav,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, -Project Name,1000Genomes-A1-YRI,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, -Experiment Name,1234,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, -Date,2008-08-17T13:18:30,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, -Workflow,LibraryQC,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, -Chemistry,Default,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, -,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, -[Reads],,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, -37,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, -37,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, -[Settings],,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, -,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, -[Manifests],,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, -,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, -[Data],,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, 
-Index,Lane,Sample_ID,Sample_Name,GenomeFolder,bait_name,default_library_type,default_tag_sequence,email_addresses,email_addresses_of_followers,email_addresses_of_managers,email_addresses_of_owners,is_control,is_pool,lane_id,lane_priority,library_name,organism,organism_taxon_id,project_cost_code,project_id,project_name,qc_state,request_id,required_insert_size_range,sample_accession_number,sample_common_name,sample_consent_withdrawn,sample_description,sample_id,sample_name,sample_public_name,sample_reference_genome,spiked_phix_tag_index,study_accession_number,study_alignments_in_bam,study_contains_nonconsented_human,study_contains_nonconsented_xahuman,study_description,study_id,study_name,study_reference_genome,study_separate_y_chromosome_data,study_title,tag_index, -,1,51021,SRS000147,C:\Illumina\MiSeq Reporter\Genomes\WTSI_references\Homo_sapiens\1000Genomes\all\fasta\,,,,jws@sanger.ac.uk rd@sanger.ac.uk thomas.keane@sanger.ac.uk,jws@sanger.ac.uk thomas.keane@sanger.ac.uk,jws@sanger.ac.uk,rd@sanger.ac.uk,0,0,66206,0,NA18907-YRI-1,human,9606,S0706,185,1000Genomes-A1-YRI,pending,2409,from:150 to:200,SRS000147,Homo sapiens,,,766,NA18907-YRI-1,NA18907,,,SRP000031,1,0,0,1000Genomes Project Pilot 1,185,1000Genomes-A1-YRI,,,1000Genomes Project Pilot 1,, -,2,51021,SRS000147,C:\Illumina\MiSeq Reporter\Genomes\WTSI_references\Homo_sapiens\1000Genomes\all\fasta\,,,,jws@sanger.ac.uk rd@sanger.ac.uk thomas.keane@sanger.ac.uk,jws@sanger.ac.uk thomas.keane@sanger.ac.uk,jws@sanger.ac.uk,rd@sanger.ac.uk,0,0,66207,0,NA18907-YRI-1,human,9606,S0706,185,1000Genomes-A1-YRI,pending,2410,from:150 to:200,SRS000147,Homo sapiens,,,766,NA18907-YRI-1,NA18907,,,SRP000031,1,0,0,1000Genomes Project Pilot 1,185,1000Genomes-A1-YRI,,,1000Genomes Project Pilot 1,, -,3,51021,SRS000147,C:\Illumina\MiSeq Reporter\Genomes\WTSI_references\Homo_sapiens\1000Genomes\all\fasta\,,,,jws@sanger.ac.uk rd@sanger.ac.uk thomas.keane@sanger.ac.uk,jws@sanger.ac.uk 
thomas.keane@sanger.ac.uk,jws@sanger.ac.uk,rd@sanger.ac.uk,0,0,66208,0,NA18907-YRI-1,human,9606,S0706,185,1000Genomes-A1-YRI,pending,2411,from:150 to:200,SRS000147,Homo sapiens,,,766,NA18907-YRI-1,NA18907,,,SRP000031,1,0,0,1000Genomes Project Pilot 1,185,1000Genomes-A1-YRI,,,1000Genomes Project Pilot 1,, -,4,79570,phiX_SI_SPRI,C:\Illumina\MiSeq Reporter\Genomes\WTSI_references\PhiX\Sanger-SNPs\all\fasta\,,,,,,,,1,0,80723,0,phiX_SI_SPRI,,,,,,,41944,,,,,,9829,phiX_SI_SPRI,,,,,,0,0,,,,,,,, -,5,51021,SRS000147,C:\Illumina\MiSeq Reporter\Genomes\WTSI_references\Homo_sapiens\1000Genomes\all\fasta\,,,,jws@sanger.ac.uk rd@sanger.ac.uk thomas.keane@sanger.ac.uk,jws@sanger.ac.uk thomas.keane@sanger.ac.uk,jws@sanger.ac.uk,rd@sanger.ac.uk,0,0,66209,0,NA18907-YRI-1,human,9606,S0706,185,1000Genomes-A1-YRI,pending,2412,from:150 to:200,SRS000147,Homo sapiens,,,766,NA18907-YRI-1,NA18907,,,SRP000031,1,0,0,1000Genomes Project Pilot 1,185,1000Genomes-A1-YRI,,,1000Genomes Project Pilot 1,, -,6,51021,SRS000147,C:\Illumina\MiSeq Reporter\Genomes\WTSI_references\Homo_sapiens\1000Genomes\all\fasta\,,,,jws@sanger.ac.uk rd@sanger.ac.uk thomas.keane@sanger.ac.uk,jws@sanger.ac.uk thomas.keane@sanger.ac.uk,jws@sanger.ac.uk,rd@sanger.ac.uk,0,0,66210,0,NA18907-YRI-1,human,9606,S0706,185,1000Genomes-A1-YRI,pending,2413,from:150 to:200,SRS000147,Homo sapiens,,,766,NA18907-YRI-1,NA18907,,,SRP000031,1,0,0,1000Genomes Project Pilot 1,185,1000Genomes-A1-YRI,,,1000Genomes Project Pilot 1,, -,7,51021,SRS000147,C:\Illumina\MiSeq Reporter\Genomes\WTSI_references\Homo_sapiens\1000Genomes\all\fasta\,,,,jws@sanger.ac.uk rd@sanger.ac.uk thomas.keane@sanger.ac.uk,jws@sanger.ac.uk thomas.keane@sanger.ac.uk,jws@sanger.ac.uk,rd@sanger.ac.uk,0,0,66211,0,NA18907-YRI-1,human,9606,S0706,185,1000Genomes-A1-YRI,pending,2414,from:150 to:200,SRS000147,Homo sapiens,,,766,NA18907-YRI-1,NA18907,,,SRP000031,1,0,0,1000Genomes Project Pilot 1,185,1000Genomes-A1-YRI,,,1000Genomes Project Pilot 1,, 
-ATCAACCG,8,51021,SRS000147,C:\Illumina\MiSeq Reporter\Genomes\WTSI_references\Homo_sapiens\1000Genomes\all\fasta\,,,ATCAACCG,jws@sanger.ac.uk rd@sanger.ac.uk thomas.keane@sanger.ac.uk,jws@sanger.ac.uk thomas.keane@sanger.ac.uk,jws@sanger.ac.uk,rd@sanger.ac.uk,0,1,66212,0,NA18907-YRI-1,human,9606,S0706,185,1000Genomes-A1-YRI,pending,2415,from:150 to:200,SRS000147,Homo sapiens,,,766,NA18907-YRI-1,NA18907,,,SRP000031,1,0,0,1000Genomes Project Pilot 1,185,1000Genomes-A1-YRI,,,1000Genomes Project Pilot 1,154, -TAGTTGGC,8,51022,SRS000148,C:\Illumina\MiSeq Reporter\Genomes\WTSI_references\Homo_sapiens\1000Genomes\all\fasta\,,,TAGTTGGC,jws@sanger.ac.uk rd@sanger.ac.uk thomas.keane@sanger.ac.uk,jws@sanger.ac.uk thomas.keane@sanger.ac.uk,jws@sanger.ac.uk,rd@sanger.ac.uk,0,1,66212,0,NA18908-YRI-1,human,9606,S0706,185,1000Genomes-A1-YRI,pending,2415,from:150 to:200,SRS000148,Homo sapiens,,,767,NA18908-YRI-1,NA18908,,,SRP000031,1,0,0,1000Genomes Project Pilot 1,185,1000Genomes-A1-YRI,,,1000Genomes Project Pilot 1,155, - diff --git a/t/data/p4_stage1_analysis/samplesheet.csv b/t/data/p4_stage1_analysis/samplesheet.csv new file mode 100644 index 00000000..41a6d28f --- /dev/null +++ b/t/data/p4_stage1_analysis/samplesheet.csv @@ -0,0 +1,4 @@ +[Data],,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, 
+Index,Lane,Sample_ID,Sample_Name,GenomeFolder,bait_name,default_library_type,default_tag_sequence,email_addresses,email_addresses_of_followers,email_addresses_of_managers,email_addresses_of_owners,is_control,is_pool,lane_id,lane_priority,library_name,organism,organism_taxon_id,project_cost_code,project_id,project_name,qc_state,request_id,required_insert_size_range,sample_accession_number,sample_common_name,sample_consent_withdrawn,sample_description,sample_id,sample_name,sample_public_name,sample_reference_genome,spiked_phix_tag_index,study_accession_number,study_alignments_in_bam,study_contains_nonconsented_human,study_contains_nonconsented_xahuman,study_description,study_id,study_name,study_reference_genome,study_separate_y_chromosome_data,study_title,tag_index, +,1,51021,SRS000147,C:\Illumina\MiSeq Reporter\Genomes\WTSI_references\Homo_sapiens\1000Genomes\all\fasta\,,,,jws@sanger.ac.uk rd@sanger.ac.uk thomas.keane@sanger.ac.uk,jws@sanger.ac.uk thomas.keane@sanger.ac.uk,jws@sanger.ac.uk,rd@sanger.ac.uk,0,0,66206,0,NA18907-YRI-1,human,9606,S0706,185,1000Genomes-A1-YRI,pending,2409,from:150 to:200,SRS000147,Homo sapiens,,,766,NA18907-YRI-1,NA18907,,,SRP000031,1,0,0,1000Genomes Project Pilot 1,185,1000Genomes-A1-YRI,,,1000Genomes Project Pilot 1,, + diff --git a/t/data/runfolder/Data/RunInfo.xml b/t/data/runfolder/Data/RunInfo.xml deleted file mode 100644 index c4d34c91..00000000 --- a/t/data/runfolder/Data/RunInfo.xml +++ /dev/null @@ -1,17 +0,0 @@ - - - - D0W73ACXX - SN510 - 120611 - - - - - - - - 4 - - - \ No newline at end of file diff --git a/t/util.pm b/t/util.pm index a6e20827..2f7755b9 100644 --- a/t/util.pm +++ b/t/util.pm @@ -1,12 +1,10 @@ package t::util; use Moose; -use File::Temp qw{ tempdir }; +use File::Temp qw(tempdir); use Readonly; use File::Path qw(make_path); -Readonly::Scalar my $NFS_STAGING_DISK => q{/nfs/sf45}; - has q{temp_directory} => ( isa => q{Str}, is => q{ro}, @@ -24,47 +22,6 @@ has q{clean_temp_directory} => ( default => 1, ); 
-############### -# path setups - -Readonly::Scalar our $DEFAULT_RUNFOLDER => q{123456_IL2_1234}; -Readonly::Scalar our $ANALYSIS_RUNFOLDER_PATH => $NFS_STAGING_DISK . q{/IL2/analysis/} . $DEFAULT_RUNFOLDER; -Readonly::Scalar our $BBCALLS_PATH => qq{$ANALYSIS_RUNFOLDER_PATH/Data/Intensities/BAM_basecalls_09-07-2009}; -Readonly::Scalar our $RECALIBRATED_PATH => qq{$BBCALLS_PATH/no_cal}; - -sub analysis_runfolder_path { - my ( $self ) = @_; - return $self->temp_directory() . $ANALYSIS_RUNFOLDER_PATH; -} - -sub standard_bam_basecall_path { - my ( $self ) = @_; - return $self->temp_directory() . $BBCALLS_PATH; -} - -sub standard_analysis_recalibrated_path { - my ( $self ) = @_; - return $self->temp_directory() . $RECALIBRATED_PATH; -} - -sub create_analysis { - my ($self) = @_; - - my $staging = $self->temp_directory() . $NFS_STAGING_DISK; - `rm -rf $staging`; - - my $analysis_runfolder_path = $self->temp_directory() . $ANALYSIS_RUNFOLDER_PATH; - my $recalibrated_path = $self->temp_directory() . $RECALIBRATED_PATH; - `mkdir -p $recalibrated_path`; - `ln -s Data/Intensities/BAM_basecalls_09-07-2009/no_cal $analysis_runfolder_path/Latest_Summary`; - `mkdir -p $analysis_runfolder_path/InterOp`; - `cp t/data/p4_stage1_analysis/TileMetricsOut.bin $analysis_runfolder_path/InterOp`; - `cp t/data/run_params/runParameters.miseq.xml $analysis_runfolder_path/runParameters.xml`; - $self->_create_run_info(); - - return; -} - sub create_runfolder { my ($self, $dir, $names) = @_; @@ -88,25 +45,4 @@ sub create_runfolder { return $paths; } -sub _create_run_info { - my ($self) = @_; - - my $fh; - my $runinfofile = $self->analysis_runfolder_path() . 
q[/RunInfo.xml]; - open($fh, '>', $runinfofile) or die "Could not open file '$runinfofile' $!"; - print $fh <<"ENDXML"; - - - - - - - - - - -ENDXML - close $fh; -} - 1; From 1297dc0008877f1e4bc07ee8a8f3d58f053632d8 Mon Sep 17 00:00:00 2001 From: Marina Gourtovaia Date: Wed, 15 May 2024 12:38:12 +0100 Subject: [PATCH 05/10] Removed unused test data --- MANIFEST | 10 ---------- t/data/runfolder/archive/1234_2.bam | 0 t/data/runfolder/archive/1234_2_human.bam | 0 t/data/runfolder/archive/1234_3.bam | 0 t/data/runfolder/archive/lane1/1234_1#15.bam | 0 t/data/runfolder/archive/lane1/1234_1#15.cram | Bin 209 -> 0 bytes .../runfolder/archive/lane1/1234_1#15.seqchksum | 3 --- .../runfolder/archive/lane1/1234_1#15_human.bam | 0 t/data/runfolder/archive/lane1/1234_1#15_phix.bam | 0 t/data/runfolder/archive/lane4/1234_4#16.bam | 0 t/data/runfolder/archive/lane4/1234_4#32.bam | 0 11 files changed, 13 deletions(-) delete mode 100644 t/data/runfolder/archive/1234_2.bam delete mode 100644 t/data/runfolder/archive/1234_2_human.bam delete mode 100644 t/data/runfolder/archive/1234_3.bam delete mode 100644 t/data/runfolder/archive/lane1/1234_1#15.bam delete mode 100644 t/data/runfolder/archive/lane1/1234_1#15.cram delete mode 100644 t/data/runfolder/archive/lane1/1234_1#15.seqchksum delete mode 100644 t/data/runfolder/archive/lane1/1234_1#15_human.bam delete mode 100644 t/data/runfolder/archive/lane1/1234_1#15_phix.bam delete mode 100644 t/data/runfolder/archive/lane4/1234_4#16.bam delete mode 100644 t/data/runfolder/archive/lane4/1234_4#32.bam diff --git a/MANIFEST b/MANIFEST index 40045723..c1334cdb 100644 --- a/MANIFEST +++ b/MANIFEST @@ -1060,16 +1060,6 @@ t/data/run_params/runParameters.hiseqx.upgraded.xml t/data/run_params/runParameters.miseq.xml t/data/run_params/runParameters.novaseq.xml t/data/run_params/RunParameters.novaseqx.xml -t/data/runfolder/archive/1234_2.bam -t/data/runfolder/archive/1234_2_human.bam -t/data/runfolder/archive/1234_3.bam 
-t/data/runfolder/archive/lane1/1234_1#15.bam -t/data/runfolder/archive/lane1/1234_1#15.cram -t/data/runfolder/archive/lane1/1234_1#15.seqchksum -t/data/runfolder/archive/lane1/1234_1#15_human.bam -t/data/runfolder/archive/lane1/1234_1#15_phix.bam -t/data/runfolder/archive/lane4/1234_4#16.bam -t/data/runfolder/archive/lane4/1234_4#32.bam t/data/samplesheet_1234.csv t/data/samplesheet_8747.csv t/data/samplesheet_33990.csv diff --git a/t/data/runfolder/archive/1234_2.bam b/t/data/runfolder/archive/1234_2.bam deleted file mode 100644 index e69de29b..00000000 diff --git a/t/data/runfolder/archive/1234_2_human.bam b/t/data/runfolder/archive/1234_2_human.bam deleted file mode 100644 index e69de29b..00000000 diff --git a/t/data/runfolder/archive/1234_3.bam b/t/data/runfolder/archive/1234_3.bam deleted file mode 100644 index e69de29b..00000000 diff --git a/t/data/runfolder/archive/lane1/1234_1#15.bam b/t/data/runfolder/archive/lane1/1234_1#15.bam deleted file mode 100644 index e69de29b..00000000 diff --git a/t/data/runfolder/archive/lane1/1234_1#15.cram b/t/data/runfolder/archive/lane1/1234_1#15.cram deleted file mode 100644 index 7d3d1131bc4c19ca0fc7cbc5b484990be54d9349..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 209 zcmZ<`a`a_p&}F~`^>863CI Date: Fri, 10 May 2024 09:15:44 +0100 Subject: [PATCH 06/10] Cache process_separately_lanes value and reuse. If the value of this attribute is set (in real life scenario via command line arguments to pipeline scripts), it is cached by the analysis pipeline only. If the value is not set, an attempt to retrieve a cached value is made. This ensures that the archival pipeline and the npg_run_is_deletable script run correctly.
--- lib/npg_pipeline/base.pm | 67 ++++++++- lib/npg_pipeline/pluggable.pm | 5 +- lib/npg_pipeline/pluggable/central.pm | 48 +++++- lib/npg_pipeline/runfolder_scaffold.pm | 13 +- t/10-base.t | 194 +++++++++++++++++++++---- t/10-pluggable-central.t | 111 +++++++++++++- t/10-runfolder_scaffold.t | 11 +- 7 files changed, 400 insertions(+), 49 deletions(-) diff --git a/lib/npg_pipeline/base.pm b/lib/npg_pipeline/base.pm index cd2841a4..a9aae784 100644 --- a/lib/npg_pipeline/base.pm +++ b/lib/npg_pipeline/base.pm @@ -7,7 +7,10 @@ use POSIX qw(strftime); use Math::Random::Secure qw{irand}; use List::MoreUtils qw{any uniq}; use File::Basename; +use JSON; +use Perl6::Slurp; use Readonly; +use Try::Tiny; use npg_tracking::glossary::rpt; use npg_tracking::glossary::composition::factory::rpt_list; @@ -23,6 +26,7 @@ with qw{ WTSI::DNAP::Utilities::Loggable npg_tracking::util::pipeline_config npg_pipeline::base::options + npg_pipeline::runfolder_scaffold }; Readonly::Array my @NO_SCRIPT_ARG_ATTRS => qw/ @@ -234,12 +238,13 @@ sub _build_merge_by_library { =head2 process_separately_lanes -An array of lane (position) numbers, which should not be merged with anyother +An array of lane (position) numbers, which should not be merged with any other lanes. To be used in conjunction with C or C -attributes. Does not have any impact if both of these attributes are false. +attributes. A consistency check is triggered when the value is set in order +to prevent this setting to be cached if no merge is intended. Defaults to an empty array value, meaning that all possible entities will be -merged. +merged. =cut @@ -247,9 +252,18 @@ has q{process_separately_lanes} => ( isa => q{ArrayRef}, is => q{ro}, default => sub { return []; }, + trigger => \&_validate_process_separately_lanes, documentation => q{Array of lane numbers, which have to be excluded from } . 
q{a merge}, ); +sub _validate_process_separately_lanes { + my ($self, $new_value) = @_; + if (!$self->merge_lanes && !$self->merge_by_library && (@{$new_value} != 0)) { + $self->logcroak('One of merge options should be enabled if ' . + 'process_separately_lanes is set'); + } + return; +} =head2 lims @@ -348,6 +362,21 @@ zero products, hashed under the 'data_products' key. If product_rpt_list attribute is set, the 'lanes' key maps to an empty array. +While computing the lists of data products, we examine whether data in any +of the lanes can be merged across lanes. Some of the lanes might be explicitly +excluded from the merge by setting the `process_separately_lanes` attribute +from the command line. This is likely to be done when the analysis pipeline +is run manually. Then the same lanes have to be excluded from the merge by +the archival pipeline and by the script that evaluates whether the run folder +can be deleted. To enable this, the value of the `process_separately_lanes` +attribute is saved to the metadata_cache_ directory immediately after +the pipeline establishes the location of the samplesheet file or generates a +new samplesheet. + +This method looks at the `process_separately_lanes` attribute first. If the +`process_separately_lanes` array is empty, an attempt to retrieve the cached +value is made. + =cut has q{products} => ( @@ -373,9 +402,14 @@ sub _build_products { } if ($self->merge_lanes || $self->merge_by_library) { + my $attr_name = 'process_separately_lanes'; + my $separate_lanes = $self->$attr_name; + if (@{$separate_lanes} == 0) { + $separate_lanes = $self->_cached_process_separately_lanes($attr_name); + } my $all_lims = $self->lims->aggregate_libraries( - \@lane_lims, $self->process_separately_lanes); + \@lane_lims, $separate_lanes); @data_lims = @{$all_lims->{'singles'}}; # Might be empty. # merge_lanes option implies a merge across all lanes.
@@ -483,6 +517,27 @@ sub _check_lane_merge_is_viable { return 1; } +sub _cached_process_separately_lanes { + my ($self, $key) = @_; + $key or $self->logcroak('Key should be defined'); + + my $path = $self->analysis_options_file_path(); + if (-f $path) { + my $options; + try { + $options = decode_json(slurp($path)); + } catch { + $self->logcroak("Error reading or parsing ${path} : $_"); + }; + if ($options->{$key}) { + $self->info("Found $key analysis option in $path: " . + join q[, ], @{$options->{$key}}); + return $options->{$key}; + } + } + return []; +} + __PACKAGE__->meta->make_immutable; 1; @@ -511,6 +566,8 @@ __END__ =item File::Basename +=item JSON + =item Readonly =item npg_tracking::glossary::rpt @@ -538,7 +595,7 @@ Marina Gourtovaia =head1 LICENSE AND COPYRIGHT -Copyright (C) 2014,2015,2016,2017,2018,2019,2020,2023 Genome Research Ltd. +Copyright (C) 2014,2015,2016,2017,2018,2019,2020,2023,2024 Genome Research Ltd. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by diff --git a/lib/npg_pipeline/pluggable.pm b/lib/npg_pipeline/pluggable.pm index bbb88915..6d4ebd3e 100644 --- a/lib/npg_pipeline/pluggable.pm +++ b/lib/npg_pipeline/pluggable.pm @@ -22,8 +22,7 @@ use npg_pipeline::pluggable::registry; extends q{npg_pipeline::base}; with qw{ MooseX::AttributeCloner - npg_pipeline::executor::options - npg_pipeline::runfolder_scaffold }; + npg_pipeline::executor::options }; our $VERSION = '0'; @@ -883,7 +882,7 @@ __END__ =head1 LICENSE AND COPYRIGHT -Copyright (C) 2014,2015,2016,2017,2018,2019,2020,2021 Genome Research Ltd. +Copyright (C) 2014,2015,2016,2017,2018,2019,2020,2021,2024 Genome Research Ltd. 
This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by diff --git a/lib/npg_pipeline/pluggable/central.pm b/lib/npg_pipeline/pluggable/central.pm index 5945502f..ad5d512b 100644 --- a/lib/npg_pipeline/pluggable/central.pm +++ b/lib/npg_pipeline/pluggable/central.pm @@ -3,6 +3,8 @@ package npg_pipeline::pluggable::central; use Moose; use MooseX::StrictConstructor; use namespace::autoclean; +use JSON; +use File::Slurp qw(read_file write_file); extends 'npg_pipeline::pluggable'; @@ -29,13 +31,17 @@ Pipeline runner for the analysis pipeline. Inherits from parent's method. Sets all paths needed during the lifetime of the analysis runfolder. Creates any of the paths that do not exist. +Saves lane numbers given by the `process_separately_lanes` option to a +JSON file. + =cut override 'prepare' => sub { my $self = shift; $self->_scaffold('create_top_level'); - super(); # Corect order + super(); # Correct order, sets up a samplesheet. + $self->_save_merge_options(); $self->_scaffold('create_product_level'); return; @@ -56,6 +62,40 @@ sub _scaffold { return; } +sub _save_merge_options { + my $self = shift; + + my $attr_name = 'process_separately_lanes'; + my @given_lanes = sort {$a <=> $b} @{$self->$attr_name}; + if (@given_lanes) { + my $cached_options = {}; + my $found = 0; + my $path = $self->analysis_options_file_path(); + if (-f $path) { + $cached_options = decode_json(read_file($path)); + if ($cached_options->{$attr_name} && @{$cached_options->{$attr_name}}) { + my $sep = q[, ]; + my $cached_lanes = join $sep, @{$cached_options->{$attr_name}}; + $self->info("Found cached merge options in $path: " . + "lanes $cached_lanes should not be merged."); + if ($cached_lanes ne join $sep, @given_lanes) { + $self->logcroak('Lane list from process_separately_lanes attribute ' . 
+ 'is inconsistent with cached value'); + } + $found = 1; + } + } + + if (!$found) { + $cached_options->{$attr_name} = \@given_lanes; + write_file($path, encode_json($cached_options)) or + $self->logcroak("Failed to write to $path"); + } + } + + return; +} + __PACKAGE__->meta->make_immutable; 1; @@ -76,6 +116,10 @@ __END__ =item namespace::autoclean +=item JSON + +=item File::Slurp + =back =head1 INCOMPATIBILITIES @@ -89,7 +133,7 @@ Marina Gourtovaia =head1 LICENSE AND COPYRIGHT -Copyright (C) 2018 Genome Research Limited +Copyright (C) 2018,2024 Genome Research Ltd. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by diff --git a/lib/npg_pipeline/runfolder_scaffold.pm b/lib/npg_pipeline/runfolder_scaffold.pm index d6bd451c..710576cf 100644 --- a/lib/npg_pipeline/runfolder_scaffold.pm +++ b/lib/npg_pipeline/runfolder_scaffold.pm @@ -15,6 +15,7 @@ Readonly::Scalar my $ANALYSIS_PATH_COMPONENT => q[/analysis/]; Readonly::Scalar my $LOG_DIR_NAME => q[log]; Readonly::Scalar my $STATUS_FILES_DIR_NAME => q[status]; Readonly::Scalar my $METADATA_CACHE_DIR_NAME => q[metadata_cache_]; +Readonly::Scalar my $ANALYSIS_OPTIONS_FILE_NAME => q[analysis_options.json]; Readonly::Scalar my $TILEVIZ_INDEX_DIR_NAME => q[tileviz]; Readonly::Scalar my $TILEVIZ_INDEX_FILE_NAME => q[index.html]; Readonly::Scalar my $IRODS_PUBLISHER_RSART_DIR_NAME => q[irods_publisher_restart_files]; @@ -149,6 +150,11 @@ sub metadata_cache_dir_path { return catdir($apath, $METADATA_CACHE_DIR_NAME . $self->id_run()); } +sub analysis_options_file_path { + my $self = shift; + return catfile($self->metadata_cache_dir_path, $ANALYSIS_OPTIONS_FILE_NAME); +} + sub irods_publisher_rstart_dir_path { my $self = shift; my $apath = $self->analysis_path; @@ -303,6 +309,11 @@ is empty. Can be called both as an instance and a class method. 
=head2 metadata_cache_dir_path +=head2 analysis_options_file_path + +A full path for a JSON file, which captures line numbers given by the +C pipeline attribute and other analysis options. + =head2 irods_publisher_rstart_dir_path =head2 irods_locations_dir_path @@ -355,7 +366,7 @@ Given a path in analysis directory changes it to outgoing directory. =head1 LICENSE AND COPYRIGHT -Copyright (C) 2018,2019,2020,2022 Genome Research Ltd. +Copyright (C) 2018,2019,2020,2022,2024 Genome Research Ltd. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by diff --git a/t/10-base.t b/t/10-base.t index 5039459d..3431cf80 100644 --- a/t/10-base.t +++ b/t/10-base.t @@ -1,20 +1,24 @@ use strict; use warnings; -use Test::More tests => 8; +use Test::More tests => 10; use Test::Exception; use File::Temp qw(tempdir tempfile); use Cwd qw(getcwd abs_path); use Log::Log4perl qw(:levels); use Moose::Util qw(apply_all_roles); use File::Copy qw(cp); +use File::Slurp qw(read_file write_file); +use JSON; use t::util; my $util = t::util->new(); +my $temp_dir = $util->temp_directory(); +my $log_file = join q[/], $temp_dir, 'logfile'; Log::Log4perl->easy_init({layout => '%d %-5p %c - %m%n', level => $DEBUG, - file => join(q[/], $util->temp_directory(), 'logfile'), + file => $log_file, utf8 => 1}); my $cwd = abs_path(getcwd()); @@ -22,6 +26,11 @@ my $config_dir = $cwd . 
'/data/config_files'; use_ok(q{npg_pipeline::base}); +sub _generate_rpt { + my ($id_run, $lanes, $tag_index) = @_; + return join q[;], map { join q[:], $id_run, $_, $tag_index } @{$lanes}; +} + subtest 'local flag' => sub { plan tests => 3; @@ -63,6 +72,36 @@ subtest 'repository preexec' => sub { q{correct ref_adapter_pre_exec_string} ); }; +subtest 'label' => sub { + plan tests => 4; + + my $base = npg_pipeline::base->new(id_run => 22); + is ($base->label, '22', 'label defaults to run id'); + $base = npg_pipeline::base->new(id_run => 22, label => '33'); + is ($base->label, '33', 'label as set'); + $base = npg_pipeline::base->new(product_rpt_list => '22:1:33'); + throws_ok { $base->label } + qr/cannot build 'label' attribute, it should be pre-set/, + 'error if label is not preset'; + $base = npg_pipeline::base->new(product_rpt_list => '22:1:33', label => '33'); + is ($base->label, '33', 'label as set'); +}; + +subtest 'error on incompatible merge attributes' => sub { + plan tests => 1; + my $error = 'One of merge options should be enabled if ' . 
+ 'process_separately_lanes is set'; + throws_ok { + $b = npg_pipeline::base->new( + runfolder_path => $temp_dir, + id_run => 999, + merge_lanes => 0, + merge_by_library => 0, + process_separately_lanes => [3,8] + ) + } qr/$error/, $error; +}; + subtest 'products - merging (or not) lanes' => sub { plan tests => 22; @@ -77,7 +116,11 @@ subtest 'products - merging (or not) lanes' => sub { local $ENV{NPG_CACHED_SAMPLESHEET_FILE} = 't/data/products/samplesheet_novaseq4lanes.csv'; cp 't/data/run_params/runParameters.novaseq.xml', "$rf_path/runParameters.xml"; cp 't/data/novaseq/210111_A00513_0447_AHJ55JDSXY/RunInfo.xml', "$rf_path/RunInfo.xml"; - $b = npg_pipeline::base->new(runfolder_path => $rf_path, id_run => 999); + $b = npg_pipeline::base->new( + runfolder_path => $rf_path, + analysis_path => $temp_dir, + id_run => 47995 + ); ok ($b->merge_lanes, 'merge_lanes flag is set'); ok (!$b->_selected_lanes, 'selected_lanes flag is not set'); lives_ok {$products = $b->products} 'products hash created for NovaSeq run'; @@ -120,7 +163,7 @@ subtest 'products - merging (or not) lanes' => sub { }; subtest 'products - merging (or not) libraries' => sub { - plan tests => 423; + plan tests => 421; my $rf_info = $util->create_runfolder(); my $rf_path = $rf_info->{'runfolder_path'}; @@ -136,7 +179,11 @@ subtest 'products - merging (or not) libraries' => sub { # lanes 3, 4 - 10 samples # lanes 5, 6 - 22 samples # lanes 7, 8 - 38 samples - $b = npg_pipeline::base->new(runfolder_path => $rf_path, id_run => $id_run); + $b = npg_pipeline::base->new( + runfolder_path => $rf_path, + analysis_path => $temp_dir, + id_run => $id_run + ); ok($b->merge_by_library, 'merge by library is true for NovaSeqX'); my @lane_products = @{$b->products()->{'lanes'}}; is (@lane_products, 8, 'eight lane products'); @@ -202,7 +249,11 @@ subtest 'products - merging (or not) libraries' => sub { # Expect lanes 3 and 4 merged. 
$b = npg_pipeline::base->new( - runfolder_path => $rf_path, id_run => $id_run, lanes => [4,8,3]); + runfolder_path => $rf_path, + analysis_path => $temp_dir, + id_run => $id_run, + lanes => [4,8,3] + ); ok($b->merge_by_library, 'merge by library is true for NovaSeqX'); @lane_products = @{$b->products()->{'lanes'}}; @@ -274,37 +325,120 @@ subtest 'products - merging (or not) libraries' => sub { ); @products = @{$b->products()->{'data_products'}}; is (@products, 64, 'number of data products is 64'); +}; + +subtest 'products - retrieve cached merge options' => sub { + plan tests => 13; - $b = npg_pipeline::base->new( - runfolder_path => $rf_path, + my $id_run = 47995; + my $rf_name = q[20231017_LH00210_0012_B22FCNFLT3]; + my $rf_path_test = qq[t/data/novaseqx/$rf_name]; + local $ENV{NPG_CACHED_SAMPLESHEET_FILE} = + qq[$rf_path_test/samplesheet_${id_run}.csv]; + + my $rf_info = $util->create_runfolder($temp_dir, {'runfolder_name' => $rf_name, + 'analysis_path' => 'BAM_basecalls_20240514-111105'}); + my $runfolder_path = $rf_info->{'runfolder_path'}; + for my $name (qw(RunInfo.xml RunParameters.xml)) { + cp("$rf_path_test/$name", "$runfolder_path/$name") or die "Failed to copy $name"; + } + my $bam_basecall_path = $rf_info->{'basecall_path'}; + my $metadata_cache_path = "$bam_basecall_path/metadata_cache_$id_run"; + mkdir $metadata_cache_path; + + # All lanes are spiked. Possible merges: + # lanes 1, 2 - 17 samples + # lanes 3, 4 - 10 samples + # lanes 5, 6 - 22 samples + # lanes 7, 8 - 38 samples + + # Establish the base cases - an inevitable repetition of some of the tests above. 
+ # No merge + my $num_unmerged = 190; + my $b = npg_pipeline::base->new( + runfolder_path => $runfolder_path, + bam_basecall_path => $bam_basecall_path, id_run => $id_run, - lanes => [4,8,3], + merge_lanes => 0, merge_by_library => 0, - process_separately_lanes => [3,8] ); - lives_ok { @products = @{$b->products()->{'data_products'}} } - 'process_separately_lanes is compatible with suppressed merge'; - is (@products, 64, 'number of data products is 64'); -}; + my @products = @{$b->products()->{'data_products'}}; + is (@products, $num_unmerged, "number of unmerged data products is $num_unmerged"); -sub _generate_rpt { - my ($id_run, $lanes, $tag_index) = @_; - return join q[;], map { join q[:], $id_run, $_, $tag_index } @{$lanes}; -} + # Full merge by library + my $num_merged = 103; # Including unmerged spiked-in PhiX and tag0 + $b = npg_pipeline::base->new( + runfolder_path => $runfolder_path, + bam_basecall_path => $bam_basecall_path, + id_run => $id_run, + ); + ok ($b->merge_by_library, 'merge by library is enabled for NovaSeqX'); + @products = @{$b->products()->{'data_products'}}; + is (@products, $num_merged, "number of data products is $num_merged"); + # End of the base case -subtest 'label' => sub { - plan tests => 4; + # Supress the merge explicitly + $b = npg_pipeline::base->new( + runfolder_path => $runfolder_path, + bam_basecall_path => $bam_basecall_path, + id_run => $id_run, + process_separately_lanes => [(1 .. 
8)], + ); + ok ($b->merge_by_library, 'merge by library is enabled for NovaSeqX'); + @products = @{$b->products()->{'data_products'}}; + is (@products, $num_unmerged, "number of data products is $num_unmerged"); - my $base = npg_pipeline::base->new(id_run => 22); - is ($base->label, '22', 'label defaults to run id'); - $base = npg_pipeline::base->new(id_run => 22, label => '33'); - is ($base->label, '33', 'label as set'); - $base = npg_pipeline::base->new(product_rpt_list => '22:1:33'); - throws_ok { $base->label } - qr/cannot build 'label' attribute, it should be pre-set/, - 'error if label is not preset'; - $base = npg_pipeline::base->new(product_rpt_list => '22:1:33', label => '33'); - is ($base->label, '33', 'label as set'); + my $cached_options_file = "$metadata_cache_path/analysis_options.json"; + write_file($cached_options_file, 'no JSON'); + $b = npg_pipeline::base->new( + runfolder_path => $runfolder_path, + bam_basecall_path => $bam_basecall_path, + id_run => $id_run, + ); + throws_ok { $b->products() } qr/Error reading or parsing $cached_options_file/, + 'Exception is thrown when the cached data cannot be read'; + + write_file($cached_options_file, encode_json({'option' => 'value'})) + or die "Failed writing to $cached_options_file"; + $b = npg_pipeline::base->new( + runfolder_path => $runfolder_path, + bam_basecall_path => $bam_basecall_path, + id_run => $id_run, + ); + lives_ok { @products = @{$b->products()->{'data_products'}} } + 'no error generating products'; + is (@products, $num_merged, "number of data products is $num_merged"); + + write_file($cached_options_file, + encode_json({'option' => 'value', 'process_separately_lanes' => [(1 .. 
8)]})) + or die "Failed writing to $cached_options_file"; + $b = npg_pipeline::base->new( + runfolder_path => $runfolder_path, + bam_basecall_path => $bam_basecall_path, + id_run => $id_run, + ); + lives_ok { @products = @{$b->products()->{'data_products'}} } + 'no error generating products'; + is (@products, $num_unmerged, "number of data products is $num_unmerged"); + my @log_lines = read_file($log_file) or die "Failed to read the log $log_file"; + like ($log_lines[-1], + qr/Found process_separately_lanes analysis option/, 'information is logged'); + + write_file($cached_options_file, + encode_json({'process_separately_lanes' => [1,2,3]})) + or die "Failed writing to $cached_options_file"; + $b = npg_pipeline::base->new( + runfolder_path => $runfolder_path, + bam_basecall_path => $bam_basecall_path, + id_run => $id_run, + ); + my $num_partially_merged = 16 + 22 + 38 + (10+17)*2; + @products = @{$b->products()->{'data_products'}}; + is (@products, $num_partially_merged, + "number of data products is $num_partially_merged"); + @log_lines = read_file($log_file) or die "Failed to read the log $log_file"; + like ($log_lines[-1], qr/option in $cached_options_file: 1, 2, 3/, + 'information is logged'); }; 1; diff --git a/t/10-pluggable-central.t b/t/10-pluggable-central.t index 2afdb079..df016457 100644 --- a/t/10-pluggable-central.t +++ b/t/10-pluggable-central.t @@ -1,11 +1,13 @@ use strict; use warnings; -use Test::More tests => 4; +use Test::More tests => 5; use Test::Exception; use Log::Log4perl qw(:levels); use File::Copy qw(cp); use File::Path qw(make_path); use File::Temp qw(tempdir); +use File::Slurp qw(read_file write_file); +use JSON; use t::util; @@ -43,7 +45,6 @@ sub _setup_runfolder_47995 { return $rf_info; } - my $central = q{npg_pipeline::pluggable::central}; use_ok($central); @@ -69,8 +70,8 @@ subtest 'test object creation' => sub { 'function_order set on creation'); }; -subtest 'execute main()' => sub { - plan tests => 2; +subtest 'execute main() with 
a merge' => sub { + plan tests => 7; local $ENV{CLASSPATH} = undef; local $ENV{NPG_CACHED_SAMPLESHEET_FILE} = "$test_data_dir_47995/samplesheet_47995.csv"; @@ -81,7 +82,7 @@ subtest 'execute main()' => sub { lives_ok { $pb = $central->new( id_run => 47995, function_order => [qw{qc_qX_yield qc_adapter update_ml_warehouse qc_insert_size}], - lanes => [4], + lanes => [3,4], run_folder => $rf_info->{'runfolder_name'}, runfolder_path => $rf_info->{'runfolder_path'}, function_list => "$config_dir/function_list_central.json", @@ -93,6 +94,106 @@ subtest 'execute main()' => sub { ); } q{no croak on new creation}; lives_ok { $pb->main() } q{no croak running qc->main()}; + + my $rf = $rf_info->{'runfolder_path'}; + my %dirs = + map { $_ => 1 } + map { /(lane.+\z)/ } + grep { -d } + glob "$rf/Data/Intensities/BAM_basecalls_*/no_cal/archive/lane*"; + is (scalar keys %dirs, 3, 'three directories for lanes 3 and 4 are created'); + # Presence of lane3-4 dir indicates that data from lanes 3 and 4 will be merged. 
+ for my $name (qw(lane3 lane4 lane3-4)) { + ok (exists $dirs{$name}, "directory '$name' exists"); + } + my @files = grep { -f } + glob "$rf/Data/Intensities/BAM_basecalls_*/metadata_cache_47995/*.json"; + is (@files, 0, 'No JSON files in the metadata cache directory'); +}; + +subtest 'execute main() with merge supressed' => sub { + plan tests => 13; + + local $ENV{CLASSPATH} = undef; + my $samplesheet = "$test_data_dir_47995/samplesheet_47995.csv"; + my $rf_info = _setup_runfolder_47995(); + my $rf = $rf_info->{'runfolder_path'}; + my $config_dir = 'data/config_files'; + + my $init = { + id_run => 47995, + function_order => [qw{qc_qX_yield qc_adapter update_ml_warehouse qc_insert_size}], + lanes => [3,4], + process_separately_lanes => [4,3], + run_folder => $rf_info->{'runfolder_name'}, + runfolder_path => $rf, + function_list => "$config_dir/function_list_central.json", + id_flowcell_lims => 17089, + no_bsub => 1, + repository => 't/data/sequence', + spider => 0, + product_conf_file_path => $product_config, + }; + local $ENV{NPG_CACHED_SAMPLESHEET_FILE} = $samplesheet; + my $pb = $central->new($init); + lives_ok { $pb->main() } 'no error running qc->main()'; + my $bam_basecall_path = $pb->bam_basecall_path(); + + my %dirs = + map { $_ => 1 } + map { /(lane.+\z)/ } + grep { -d } + glob "$rf/Data/Intensities/BAM_basecalls_*/no_cal/archive/lane*"; + is (scalar keys %dirs, 2, 'two directories for lanes 3 and 4 are created'); + # Absence of lane3-4 dir indicates that data from lanes 3 and 4 will + # not be merged. + for my $name (qw(lane3 lane4)) { + ok (exists $dirs{$name}, "directory '$name' exists"); + } + + my $file_with_cache = "$bam_basecall_path/metadata_cache_47995/analysis_options.json"; + ok (-f $file_with_cache, 'A file with cached no-merge options exists'); + is_deeply (decode_json(read_file($file_with_cache))->{'process_separately_lanes'}, + [3,4], 'no-merge options are correctly cached'); + + # Run once more. Reuse bam_basecalls directory. 
Expect no change. + local $ENV{NPG_CACHED_SAMPLESHEET_FILE} = $samplesheet; + $init->{'bam_basecall_path'} = $bam_basecall_path; + $pb = $central->new($init); + lives_ok { $pb->main() } 'no error running qc->main() with the same options'; + ok (-f $file_with_cache, 'A file with cached no-merge options is retained'); + + # Run once more with different lanes not to merge. + my $error = 'Lane list from process_separately_lanes attribute is ' . + 'inconsistent with cached value'; + $init->{'process_separately_lanes'} = [8,7]; + $pb = $central->new($init); + throws_ok { $pb->main() } qr/$error/, + 'error running qc->main() with different no-merge options'; + + # The file exists, but the no-merge option is not captured; + local $ENV{NPG_CACHED_SAMPLESHEET_FILE} = $samplesheet; + $init->{'process_separately_lanes'} = [4,3]; + write_file($file_with_cache, + encode_json({'some option' => 'some option value'})); + $pb = $central->new($init); + lives_ok { $pb->main() } 'no error running qc->main()'; + is_deeply (decode_json(read_file($file_with_cache)), + {'some option' => 'some option value', 'process_separately_lanes' => [3,4]}, + 'the no-merged option has been added to the file' + ); + + # The file exists, the no-merge option is captured as an empty list. 
+ write_file($file_with_cache, + encode_json({'option' => 'value', 'process_separately_lanes' => []})); + local $ENV{NPG_CACHED_SAMPLESHEET_FILE} = $samplesheet; + $init->{'process_separately_lanes'} = [3,4]; + $pb = $central->new($init); + lives_ok { $pb->main() } 'no error running qc->main()'; + is_deeply (decode_json(read_file($file_with_cache)), + {'option' => 'value', 'process_separately_lanes' => [3,4]}, + 'the no-merged option has been added to the file' + ); }; subtest 'execute prepare()' => sub { diff --git a/t/10-runfolder_scaffold.t b/t/10-runfolder_scaffold.t index 880d1abf..de30e890 100644 --- a/t/10-runfolder_scaffold.t +++ b/t/10-runfolder_scaffold.t @@ -33,7 +33,7 @@ subtest 'tests for class methods' => sub { }; subtest 'top level scaffold' => sub { - plan tests => 10; + plan tests => 12; my $util = t::util->new(); my $rfh = $util->create_runfolder(); @@ -59,11 +59,16 @@ subtest 'top level scaffold' => sub { ok (-e $dir, 'archive directory created'); ok (-e "$dir/tileviz", 'tileviz index directory created'); ok (-e "$bbc_path/status", 'status directory created'); - ok (-e "$bbc_path/metadata_cache_999", 'metadata cache directory created'); + is ($rfs->metadata_cache_dir_path, "$bbc_path/metadata_cache_999"); + ok (-e $rfs->metadata_cache_dir_path, 'metadata cache directory created'); ok (-e "$bbc_path/irods_publisher_restart_files", 'directory for iRODS publisher restart files created'); ok (-e "$bbc_path/irods_locations_files", - 'directory for iRODS location json files created') + 'directory for iRODS location json files created'); + + is ($rfs->analysis_options_file_path, + "$bbc_path/metadata_cache_999/analysis_options.json", + 'file path for the analysis options cache') }; subtest 'product level scaffold, NovaSeq all lanes' => sub { From 593d819d70434e4b1e7f7199736143984810f794 Mon Sep 17 00:00:00 2001 From: Marina Gourtovaia Date: Thu, 16 May 2024 12:13:58 +0100 Subject: [PATCH 07/10] Explained merge options in README --- README.md | 41 
+++++++++++++++++++++++++++++++++-------- 1 file changed, 33 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index bb29f5db..d3ec3a7b 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,8 @@ # NPG Pipelines for Processing Illumina Sequencing Data This software provides the Sanger NPG team's automation for analysing and -internally archiving Illumina sequencing on behalf of DNA Pipelines for their -customers. +internally archiving Illumina sequencing data on behalf of DNA Pipelines for +their customers. There are two main pipelines: @@ -18,16 +18,16 @@ sequencing flowcell, or each tagged library (within a pool on the flowcell). ## Batch Processing and Dependency Tracking with LSF or wr With this system, all of a pipeline's jobs for its steps are submitted for -execution to the LSF, or wr, batch/job processing system as the pipeline is +execution to `LSF` or `wr` batch/job processing system as the pipeline is initialised. As such, a _submitted_ pipeline does not have an orchestration script or daemon running: managing the runtime dependencies of jobs within an instance of a pipeline is delegated to the batch/job processing system. How is this done? The job representing the start point of a graph is submitted -to LSF, or wr, in a suspended state and is resumed once all other jobs have been +to `LSF` or `wr` in a suspended state and is resumed once all other jobs have been submitted thus ensuring that the execution starts only if all steps are -successfully submitted to LSF, or wr. If an error occurs at any point during job -submissions, all submitted jobs, apart from the start job, are killed. +successfully submitted. If an error occurs at any point during job submissions, +all submitted jobs, apart from the start job, are killed. ## Pipeline Creation @@ -84,8 +84,8 @@ The input for an instance of the pipeline is the instrument output run folder (BCL and associated files) and LIMS information which drives appropriate processing. 
-The key data products are aligned CRAM files and indexes, or unaligned CRAM -files. However per study (a LIMS datum) pipeline configuration allows for the +The key data products are aligned or unaligned CRAM files and indexes. +However per study (a LIMS datum) pipeline configuration allows for the creation of GATK gVCF files, or the running for external tool/pipeline e.g. ncov2012-artic-nf @@ -135,3 +135,28 @@ flow DAGs. Also, the [npg_irods](https://github.com/wtsi-npg/npg_irods) system is essential for the internal archival of data products. + +## Data Merging across Lanes of a Flowcell + +If the same library is sequenced in different lanes of a flowcell, under certain +conditions the pipeline will automatically merge all data for a library into +a single end product. Spiked-in PhiX libraries data and unassigned to any tags +data (tag zero) are not merged. The following scenarios trigger the merge: + +* NovaSeq Standard flowcell - a merge across all two or four lanes is performed. + +* Any flowcell run on a NovaSeqX instrument - if multiple lanes belong to the + same pool, the data from individual libraries will be merged across those + lanes. Thus the output of a NovaSeqX run might contain a mixture of merged and + unmerged products. + +If the data quality in a lane is poor, the lane should be excluded from the merge. +The `--process_separately_lanes` pipeline option is used to list lanes like this. +Usually this option is used when running the analysis pipeline. The pipeline caches +the supplied lane numbers so that the archival pipeline can generate a consistent +with the analysis pipeline list of data products. The same relates to the +`npg_run_is_deletable` script. The cached value is retrieved only if the +`--process_separately_lanes` argument was not set when any of these scripts are +invoked. 
+ + From ea733ffc4fd5e8b4b3842564b5438b5a0db6fd45 Mon Sep 17 00:00:00 2001 From: Marina Gourtovaia Date: Fri, 31 May 2024 17:45:07 +0100 Subject: [PATCH 08/10] Changed mark duplicate method retrieval. Ensured mark duplicate method can be inferred for a product with multiple studies. --- lib/npg_pipeline/product/release.pm | 24 +++++++++++++-- t/20-function-seq_alignment.t | 47 +++++++++++++++++++++++++++-- 2 files changed, 65 insertions(+), 6 deletions(-) diff --git a/lib/npg_pipeline/product/release.pm b/lib/npg_pipeline/product/release.pm index e240921b..2d5a71ca 100644 --- a/lib/npg_pipeline/product/release.pm +++ b/lib/npg_pipeline/product/release.pm @@ -6,6 +6,7 @@ use Data::Dump qw{pp}; use Moose::Role; use List::Util qw{all any}; use Readonly; +use Try::Tiny; with qw{WTSI::DNAP::Utilities::Loggable npg_tracking::util::pipeline_config}; @@ -300,7 +301,7 @@ sub bwakit_enable { Arg [1] : npg_pipeline::product Example : $obj->markdup_method($product); - Description: Return mark duplicate method, + Description: Returns mark duplicate method, the value might be undefined. Returntype : Str @@ -309,7 +310,22 @@ sub bwakit_enable { sub markdup_method { my ($self, $product) = @_; - return $self->find_study_config($product)->{markdup_method}; + + my $config; + try { + $config = $self->find_study_config($product); + } catch { + my $error = $_; + if ($error =~ /Multiple[ ]study[ ]ids/xms) { + $self->logwarn($error); + $self->logwarn('Falling back to the default section of the product config'); + $config = $self->default_study_config(); + } else { + $self->logcroak($error); + } + }; + + return defined $config ? $config->{markdup_method} : undef; } =head2 staging_deletion_delay @@ -412,6 +428,8 @@ study: =item Readonly +=item Try::Tiny + =item WTSI::DNAP::Utilities::Loggable =item npg_tracking::util::pipeline_config @@ -430,7 +448,7 @@ study: =head1 LICENSE AND COPYRIGHT -Copyright (C) 2018,2019,2020,2021,2022 Genome Research Ltd. 
+Copyright (C) 2018,2019,2020,2021,2022,2024 Genome Research Ltd. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by diff --git a/t/20-function-seq_alignment.t b/t/20-function-seq_alignment.t index 8cf940f3..3bf52866 100644 --- a/t/20-function-seq_alignment.t +++ b/t/20-function-seq_alignment.t @@ -1,6 +1,6 @@ use strict; use warnings; -use Test::More tests => 20; +use Test::More tests => 21; use Test::Exception; use Test::Deep; use Test::Warn; @@ -13,7 +13,7 @@ use Log::Log4perl qw/:levels/; use JSON; use Cwd; use List::Util qw/first/; -use File::Slurp qw/edit_file_lines/; +use File::Slurp qw/edit_file_lines read_file write_file/; use Moose::Util qw(apply_all_roles); @@ -1502,7 +1502,7 @@ subtest 'miseq_primer_panel_only' => sub { is ($d->command(), $command, 'correct command for MiSeq lane 24135_1 tag index 1'); }; -subtest 'product_release_tests' => sub { +subtest 'product_release_tests and mark duplicate method' => sub { plan tests => 269; my %test_runs = ( @@ -1572,6 +1572,47 @@ subtest 'product_release_tests' => sub { } }; +subtest 'mark duplicate method for a product with multiple studies' => sub { + plan tests => 3; + + my $runfolder_path = join q[/], $dir, q[markdups_test]; + mkdir $runfolder_path; + copy('t/data/miseq/46761_RunInfo.xml', "$runfolder_path/RunInfo.xml") or die 'Copy failed'; + copy('t/data/miseq/46761_runParameters.xml', "$runfolder_path/runParameters.xml") + or die 'Copy failed'; + my @lines = read_file(q[t/data/miseq/samplesheet_46761_bwa_mem2.csv]); + my @data = (); + # Change study ID for the first tag. 
+ foreach my $value ((split q[,], $lines[2])) { + $value =~ s/5556/5557/; + push @data, $value; + } + $lines[2] = join q[,], @data; + my $samplesheet = "$runfolder_path/samplesheet_46761.csv"; + write_file($samplesheet, @lines); + local $ENV{NPG_CACHED_SAMPLESHEET_FILE} = $samplesheet; + + my $ms_gen = npg_pipeline::function::seq_alignment->new( + id_run => 46761, + runfolder_path => $runfolder_path, + conf_path => 't/data/release/config/seq_alignment', + resource => $default, + npg_tracking_schema => undef + ); + my $product; + foreach my $p (@{$ms_gen->products->{data_products}}) { + if ($p->rpt_list eq '46761:1:0') { + $product = $p; + last; + } + } + + is ($product->lims->study_ids, 2, 'tag zero product has two study ids'); + my $method; + lives_ok { $method = $ms_gen->markdup_method($product) } + 'no error calling markdup_method'; + is ($method, 'biobambam', 'correct method'); +}; # test overrides of bwa_mem with bwa-mem2 # 1) on sample sheet entry without [bwa_mem2] specified in reference name # 2) on sample sheet entry without [bwa_mem2] specified in reference name, but setting bwa_mem2 attribute From 1b3c555223690f3c948dfafabdd2d4b38427ae28 Mon Sep 17 00:00:00 2001 From: Marina Gourtovaia Date: Thu, 11 Jul 2024 11:01:50 +0100 Subject: [PATCH 09/10] Updated the list of Changes --- Changes | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Changes b/Changes index d2f3d7ae..7ad65ee2 100644 --- a/Changes +++ b/Changes @@ -1,6 +1,9 @@ LIST OF CHANGES --------------- + - Ensured mark duplicate method can be inferred for a product with multiple + studies (tag zero). 
+ release 68.3.0 (2024-05-24) - Removing Tidyp dependency from CI - Added 'SampleSheet.csv' file from the top level of the run folder to From 020d5f4c95e1f14ce5ac1e7ec3e6bc16bda81995 Mon Sep 17 00:00:00 2001 From: jmtcsngr Date: Tue, 6 Aug 2024 10:54:49 +0100 Subject: [PATCH 10/10] prep release 68.4.0 --- Changes | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Changes b/Changes index 7ad65ee2..a8f209dc 100644 --- a/Changes +++ b/Changes @@ -1,8 +1,13 @@ LIST OF CHANGES --------------- +release 68.4.0 (2024-08-06) - Ensured mark duplicate method can be inferred for a product with multiple studies (tag zero). + - Upgrading tests + - Use contemporary run folders for tests (NovaSeqX) + - Clean fixtures + - Prevent tests from accessing live databases (reset HOME) release 68.3.0 (2024-05-24) - Removing Tidyp dependency from CI