Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding CNV scripts #91

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Order of launchers that were run to call CNVs in the crosses

bamfolder=/lustre/scratch118/malaria/team112/pipelines/setups/vo_agam/symlink/vo_agam_indelrealign/bam_fix_mates_v2
indexfolder=/lustre/scratch118/malaria/team112/pipelines/setups/vo_agam/symlink/vo_agam_indelrealign/bam_index_v2
~/scripts/CNV_scripts/launchers/setup_folder.sh ~/personal/phase3_crosses_cnv ~/personal/vo_agam_release/v3/metadata/general/AG1000G-X/samples.meta.csv ~/personal/vo_agam_release/v3/metadata/species_calls_20200422/AG1000G-X/samples.species_aim.csv $bamfolder $indexfolder

~/scripts/CNV_scripts/launchers/run_coverage_vobs.sh ~/personal/phase3_crosses_cnv/data/sample_manifest.txt phase3_crosses_cnv

~/scripts/CNV_scripts/launchers/run_diagnostic_reads_vobs.sh ~/personal/phase3_crosses_cnv/data/sample_manifest.txt phase3_crosses_cnv

~/scripts/CNV_scripts/launchers/run_coverage_stats_by_species_vobs.sh phase3_crosses_cnv

~/scripts/CNV_scripts/launchers/run_HMM_vobs.sh phase3_crosses_cnv

bsub -o temp.log -e temp.error ~/scripts/CNV_scripts/scripts/join_species_coverage_variance_files_vobs.sh phase3_crosses_cnv

~/scripts/CNV_scripts/launchers/run_coverage_CNVs_vobs.sh ~/personal/phase3_crosses_cnv/data/sample_metadata_tabs.tsv phase3_crosses_cnv

~/scripts/CNV_scripts/launchers/run_target_regions_analysis_vobs.sh ~/personal/phase3_crosses_cnv/data/sample_manifest_known_species.txt ~/personal/phase3_crosses_cnv/data/sample_speciescalls_tabs.tsv ~/personal/phase3_crosses_cnv/data/sample_metadata_tabs.tsv phase3_crosses_cnv

bsub -o temp.log -e temp.error -R"select[mem>1000] rusage[mem=1000]" -M1000 ~/scripts/CNV_scripts/scripts/join_CNV_output_files_vobs.sh phase3_crosses_cnv

~/scripts/CNV_scripts/launchers/run_modal_CNVs_vobs.sh ~/personal/phase3_crosses_cnv/data/sample_metadata_tabs.tsv phase3_crosses_cnv

# Note. Later, the run_target_regions_analysis script was rerun in order to fix the Cyp6mz_Dupz labelling issue. Another small change was also made to the target_regions_analysis.r script (changing one the 5 nucleotide sequence used to detect Dup10). This doesn't make any difference to any of the results in the .csv files (just in the number of supporting reads, which is recorded in the Rdata file, but not in any of the presence / absence results which are in the csv). The old scripts were moved to target_regions_anlaysis_old.*. The old results were moved into the folder target_regions_analysis_old.

Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Order of launchers that were run for the first pass of phase3 (run_windowed_accessibility.sh and run_coverage_stats_by_species.sh will not need to be run for future sample sets, as the plan is to use the data from phase3 for all subsequent analyses of gambiae and arabiensis)

~/scripts/CNV_scripts/launchers/run_coverage.sh ~/personal/phase3_data/phase3_manifest.txt phase3_cnv
~/scripts/CNV_scripts/launchers/run_diagnostic_reads.sh ~/personal/phase3_data/phase3_manifest.txt phase3_cnv
~/scripts/CNV_scripts/launchers/run_windowed_accessibility.sh
~/scripts/CNV_scripts/launchers/run_coverage_stats_by_species.sh phase3_cnv arabiensis gambcolu
~/scripts/CNV_scripts/launchers/run_HMM.sh ~/personal/phase3_data/phase3_manifest_arabiensis.txt ~/personal/phase3_data/phase3_manifest_gambcolu.txt phase3_cnv
bsub -o temp.log -e temp.error ~/scripts/CNV_scripts/scripts/join_gambiae_arabiensis_coverage_variance_files.sh phase3_cnv
~/scripts/CNV_scripts/launchers/run_coverage_CNVs.sh ~/personal/phase3_data/phase3_manifest_gambcolu.txt ~/personal/phase3_data/phase3_manifest_arabiensis.txt ~/personal/phase3_data/phase3.samples.meta.csv phase3_cnv
# The following bsub was really confusing to me. If I run it with no memory requirement (or with memory requirement of 500M), it fails because of excess memory usage. But if I run it like this with a 1Gb memory requirement, it runs fine and its maximum memory usage is 11M.
bsub -o temp.log -e temp.error -R"select[mem>1000] rusage[mem=1000]" -M1000 ~/scripts/CNV_scripts/scripts/join_CNV_output_files.sh phase3_cnv
~/scripts/CNV_scripts/launchers/run_target_regions_analysis.sh ~/personal/phase3_data/phase3_manifest_known_species.txt ~/personal/phase3_data/samples.species_aim.csv ~/personal/phase3_data/phase3.samples.meta.csv phase3_cnv
~/scripts/CNV_scripts/launchers/run_modal_CNVs.sh ~/personal/phase3_data/phase3_manifest_gambcolu.txt ~/personal/phase3_data/phase3_manifest_arabiensis.txt ~/personal/phase3_data/phase3.samples.meta.csv phase3_cnv

# Note. Later, the run_target_regions_analysis script was rerun in order to fix the Cyp6mz_Dupz labelling issue. Another small change was also made to the target_regions_analysis.r script (changing one the 5 nucleotide sequence used to detect Dup10). This doesn't make any difference to any of the results in the .csv files (just in the number of supporting reads, which is recorded in the Rdata file, but not in any of the presence / absence results which are in the csv). The old scripts were moved to target_regions_anlaysis_old.*. The old results were moved into the folder target_regions_analysis_old.
55 changes: 55 additions & 0 deletions pipelines/cnv-vector/CNV_scripts/launchers/run_HMM.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
arabiensis_samplelist=$1
gambcolu_samplelist=$2
study=$3

scriptsfolder=~/scripts/CNV_scripts/scripts
rootfolder=/lustre/scratch118/malaria/team112/personal/el10
coveragefolder=$rootfolder/$study/coverage
logfolder=$coveragefolder/logfolders/HMM
errorfolder=$coveragefolder/errorfolders/HMM

Agam_GC_file=$rootfolder/phase3_data/tables/Agam_genome_GC_content.csv
arabiensis_coverage_by_GC_file=median_coverage_by_GC_masked_09_05_arabiensis.csv
arabiensis_coverage_variance_file=coverage_variance_masked_09_05_arabiensis.csv
arabiensis_mapq_prop_file=mapq_proportions_allchrom_arabiensis.csv
gambcolu_coverage_by_GC_file=median_coverage_by_GC_masked_09_05_gambcolu.csv
gambcolu_coverage_variance_file=coverage_variance_masked_09_05_gambcolu.csv
gambcolu_mapq_prop_file=mapq_proportions_allchrom_gambcolu.csv

allchrom=(2L 2R 3L 3R X)

echo "Running HMM for samples from $arabiensis_samplelist and $gamcolu_samplelist on `date`." >> $coveragefolder/HMM_added_samples.log

# Create the outputfolders if necessary
mkdir -p $logfolder
mkdir -p $errorfolder
for chrom in ${allchrom[@]}
do
mkdir -p $coveragefolder/$chrom/HMM_logs_arabiensis
mkdir -p $coveragefolder/$chrom/HMM_logs_gambcolu
mkdir -p $coveragefolder/$chrom/HMM_output

# arabiensis
bsub -o $logfolder/HMM_output_arabiensis_%J.txt \
-e $errorfolder/HMM_error_arabiensis_%J.txt \
${scriptsfolder}/coverage_HMM.sh $coveragefolder \
$arabiensis_samplelist \
$chrom \
arabiensis \
$Agam_GC_file \
$arabiensis_coverage_by_GC_file \
$arabiensis_coverage_variance_file \
$arabiensis_mapq_prop_file

# gambiae coluzzii
bsub -o $logfolder/HMM_output_gambcolu_%J.txt \
-e $errorfolder/HMM_error_gambcolu_%J.txt \
${scriptsfolder}/coverage_HMM.sh $coveragefolder \
$gambcolu_samplelist \
$chrom \
gambcolu \
$Agam_GC_file \
$gambcolu_coverage_by_GC_file \
$gambcolu_coverage_variance_file \
$gambcolu_mapq_prop_file
done
52 changes: 52 additions & 0 deletions pipelines/cnv-vector/CNV_scripts/launchers/run_HMM_vobs.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
study=$1

scriptsfolder=~/scripts/CNV_scripts/scripts
rootfolder=/lustre/scratch118/malaria/team112/personal/el10
coveragefolder=$rootfolder/$study/coverage
manifestfolder=$rootfolder/$study/data
logfolder=$coveragefolder/logfolders/HMM
errorfolder=$coveragefolder/errorfolders/HMM

Agam_GC_file=$rootfolder/phase3_data/tables/Agam_genome_GC_content.csv

allchrom=(2L 2R 3L 3R X)

# Create the outputfolders if necessary
mkdir -p $logfolder
mkdir -p $errorfolder

all_manifests=($(ls $manifestfolder/sample_manifest_*.txt))

for sample_manifest in ${all_manifests[@]}
do
# Pull out the species from the manifest names
s=${sample_manifest#$manifestfolder\/sample_manifest_}
species=${s%.txt}
if [ $species == "known_species" ]; then
continue
fi
echo "Running HMM for samples from $sample_manifest on `date`." >> $coveragefolder/HMM_added_samples.log

# Get some of the necessary input filenames
coverage_by_GC_file=median_coverage_by_GC_masked_09_05_${species}.csv
coverage_variance_file=coverage_variance_masked_09_05_${species}.csv
mapq_prop_file=$rootfolder/phase3_cnv/coverage/mapq_proportions_allchrom_${species}.csv

# Run the HMM script on each species on each chromosome
for chrom in ${allchrom[@]}
do
mkdir -p $coveragefolder/$chrom/HMM_output
mkdir -p $coveragefolder/$chrom/HMM_logs_$species

bsub -o $logfolder/HMM_output_${species}_%J.txt \
-e $errorfolder/HMM_error_${species}_%J.txt \
${scriptsfolder}/coverage_HMM_vobs.sh $coveragefolder \
$sample_manifest \
$chrom \
$species \
$Agam_GC_file \
$coverage_by_GC_file \
$coverage_variance_file \
$mapq_prop_file
done
done
32 changes: 32 additions & 0 deletions pipelines/cnv-vector/CNV_scripts/launchers/run_SSFA.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
samplelist=$1
study=$2

scriptsfolder=~/scripts/CNV_scripts/scripts
rootfolder=/lustre/scratch118/malaria/team112/personal/el10
bamfilefolder=$rootfolder/bamlinks/phase3
outputfolder=$rootfolder/$study/SSFA
logfolder=$outputfolder/logfolders/SSFA_detection
errorfolder=$outputfolder/errorfolders/SSFA_detection

mkdir -p $outputfolder
echo "Identifying SSFA reads for samples from ${samplelist} on `date`." >> $outputfolder/added_samples.log

# Create the outputfolders if necessary
mkdir -p $logfolder
mkdir -p $errorfolder
mkdir -p $outputfolder/2R/Ace1_region/SSFAlogs
mkdir -p $outputfolder/2R/Cyp6_region/SSFAlogs
mkdir -p $outputfolder/3R/Cyp6zm_region/SSFAlogs
mkdir -p $outputfolder/3R/Gste_region/SSFAlogs
mkdir -p $outputfolder/X/Cyp9k1_region/SSFAlogs

# Get the number of bamfiles that need processing
numbams=($(wc -l $samplelist))

bsub -J "coverageArray[1-$numbams]" \
-R"select[mem>300] rusage[mem=300]" \
-M300 \
-o $logfolder/SSFA_output_%J.%I.txt \
-e $errorfolder/SSFA_error_%J.%I.txt \
' '${scriptsfolder}'/SSFA.sh '${bamfilefolder}' '${samplelist}' ${LSB_JOBINDEX} '$outputfolder

31 changes: 31 additions & 0 deletions pipelines/cnv-vector/CNV_scripts/launchers/run_breakpoints.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
samplelist=$1
study=$2

scriptsfolder=~/scripts/CNV_scripts/scripts
rootfolder=/lustre/scratch118/malaria/team112/personal/el10
bamfilefolder=$rootfolder/bamlinks/phase3
outputfolder=$rootfolder/$study/breakpoints
logfolder=$outputfolder/logfolders/breakpoint_detection
errorfolder=$outputfolder/errorfolders/breakpoint_detection

mkdir -p $outputfolder
echo "Identifying breakpoint reads for samples from ${samplelist} on `date`." >> $outputfolder/added_samples.log

# Create the outputfolders if necessary
mkdir -p $logfolder
mkdir -p $errorfolder
mkdir -p $outputfolder/2R/Ace1_region/breakpointlogs
mkdir -p $outputfolder/2R/Cyp6_region/breakpointlogs
mkdir -p $outputfolder/3R/Cyp6zm_region/breakpointlogs
mkdir -p $outputfolder/3R/Gste_region/breakpointlogs
mkdir -p $outputfolder/X/Cyp9k1_region/breakpointlogs

# Get the number of bamfiles that need processing
numbams=($(wc -l $samplelist))

bsub -J "coverageArray[1-$numbams]" \
-q small \
-o $logfolder/breakpoint_output_%J.%I.txt \
-e $errorfolder/breakpoint_error_%J.%I.txt \
' '${scriptsfolder}'/breakpoints.sh '${bamfilefolder}' '${samplelist}' ${LSB_JOBINDEX} '$outputfolder

32 changes: 32 additions & 0 deletions pipelines/cnv-vector/CNV_scripts/launchers/run_coverage.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
samplelist=$1
study=$2

scriptsfolder=~/scripts/CNV_scripts/scripts
rootfolder=/lustre/scratch118/malaria/team112/personal/el10
bamfilefolder=$rootfolder/bamlinks/phase3
outputfolder=$rootfolder/$study/coverage
logfolder=$outputfolder/logfolders/coverage_calculation
errorfolder=$outputfolder/errorfolders/coverage_calculation
allchrom=(2L 2R 3L 3R X)

mkdir -p $outputfolder
echo "Calculating coverage for samples from ${samplelist} on `date`." >> $outputfolder/added_samples.log

# Create the outputfolders if necessary
mkdir -p $logfolder
mkdir -p $errorfolder
for chrom in ${allchrom[@]}
do
mkdir -p $outputfolder/$chrom/coveragelogs
done

# Get the number of bamfiles that need processing
numbams=($(wc -l $samplelist))

bsub -J "coverageArray[1-$numbams]" \
-R"select[mem>300] rusage[mem=300]" \
-M300 \
-o $logfolder/coverage_output_%J.%I.txt \
-e $errorfolder/coverage_error_%J.%I.txt \
' '${scriptsfolder}'/get_windowed_coverage.sh '${bamfilefolder}' '${samplelist}' ${LSB_JOBINDEX} '$outputfolder

49 changes: 49 additions & 0 deletions pipelines/cnv-vector/CNV_scripts/launchers/run_coverage_CNVs.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
gambcolu_samplelist=$1
arabiensis_samplelist=$2
metadata_file=$3
study=$4

scriptsfolder=~/scripts/CNV_scripts/scripts
rootfolder=/lustre/scratch118/malaria/team112/personal/el10
coveragefolder=$rootfolder/$study/coverage
coverage_variance_file=$coveragefolder/coverage_variance_masked_09_05_all.csv
ncores=2
logfolder=$coveragefolder/logfolders/CNV_analysis
errorfolder=$coveragefolder/errorfolders/CNV_analysis

allchrom=(2L 2R 3L 3R X)

mkdir -p $logfolder
mkdir -p $errorfolder
for chrom in ${allchrom[@]}
do
mkdir -p $coveragefolder/$chrom/CNV_analysis_logs

bsub -o $logfolder/CNV_analysis_output_${chrom}_gambcolu_%J.txt \
-e $errorfolder/CNV_analysis_error_${chrom}_gambcolu_%J.txt \
-n $ncores \
-q long \
-R"select[mem>200] rusage[mem=200] span[hosts=1]" \
-M200 \
${scriptsfolder}/coverage_CNVs.sh $coveragefolder \
$gambcolu_samplelist \
$chrom \
gambcolu_CNV \
$coverage_variance_file \
$ncores \
$metadata_file

bsub -o $logfolder/CNV_analysis_output_${chrom}_arabiensis_%J.txt \
-e $errorfolder/CNV_analysis_error_${chrom}_arabiensis_%J.txt \
-n $ncores \
-q long \
-R"select[mem>200] rusage[mem=200] span[hosts=1]" \
-M200 \
${scriptsfolder}/coverage_CNVs.sh $coveragefolder \
$arabiensis_samplelist \
$chrom \
arabiensis_CNV \
$coverage_variance_file \
$ncores \
$metadata_file
done
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
metadata_file=$1
study=$2

scriptsfolder=~/scripts/CNV_scripts/scripts
rootfolder=/lustre/scratch118/malaria/team112/personal/el10
coveragefolder=$rootfolder/$study/coverage
manifestfolder=$rootfolder/$study/data
coverage_variance_file=$coveragefolder/coverage_variance_masked_09_05_all.csv
gene_coordinates_file=$rootfolder/phase3_data/tables/gene_regions.csv
detox_genes_file=$rootfolder/phase3_data/tables/detox_genes.txt
ncores=2
logfolder=$coveragefolder/logfolders/CNV_analysis
errorfolder=$coveragefolder/errorfolders/CNV_analysis

allchrom=(2L 2R 3L 3R X)

mkdir -p $logfolder
mkdir -p $errorfolder

all_manifests=($(ls $manifestfolder/sample_manifest_*.txt))

for sample_manifest in ${all_manifests[@]}
do
# Pull out the species from the manifest names
s=${sample_manifest#$manifestfolder\/sample_manifest_}
species=${s%.txt}
if [ $species == "known_species" ]; then
continue
fi

output_id=${species}_CNV

for chrom in ${allchrom[@]}
do
mkdir -p $coveragefolder/$chrom/CNV_analysis_logs

bsub -o $logfolder/CNV_analysis_output_${chrom}_${species}_%J.txt \
-e $errorfolder/CNV_analysis_error_${chrom}_${species}_%J.txt \
-n $ncores \
-q long \
-R"select[mem>200] rusage[mem=200] span[hosts=1]" \
-M200 \
${scriptsfolder}/coverage_CNVs_vobs.sh $coveragefolder \
$sample_manifest \
$chrom \
$output_id \
$coverage_variance_file \
$ncores \
$metadata_file \
$gene_coordinates_file \
$detox_genes_file
done
done
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
study=$1
specieslist=${@:2}

rootfolder=/lustre/scratch118/malaria/team112/personal/el10
logfolder=$rootfolder/phase3_cnv/coverage/logfolders/stats_by_species
errorfolder=$rootfolder/phase3_cnv/coverage/errorfolders/stats_by_species
workingfolder=$rootfolder/$study/coverage
datafolder=$rootfolder/phase3_data
scriptsfolder=~/scripts/CNV_scripts/scripts

mkdir -p $logfolder
mkdir -p $errorfolder

for species in $specieslist
do
bsub -o $logfolder/${species}_coverage_stats_log.txt \
-e $errorfolder/${species}_coverage_stat_error.txt \
${scriptsfolder}/get_coverage_stats_by_sample_set.sh $workingfolder \
$datafolder/phase3_manifest_${species}.txt \
$datafolder/windowed_accessibility/mean_accessibility_${species}.csv \
$datafolder/tables/Agam_genome_GC_content.csv \
$species
done

Loading