Add post-pipeline check scripts and README #87

Open
wants to merge 1 commit into master
28 changes: 28 additions & 0 deletions scripts/post_pipeline_checks/README.md
@@ -0,0 +1,28 @@
# Post-pipeline checks

## For Short-read alignment and SNP-genotyping pipeline output

### check_missing_samples.sh
The script `check_missing_samples.sh` quickly checks whether all samples for a batch are present in an S3 bucket. It counts the samples in the bucket and in the corresponding manifest, and reports any missing samples as a `diff` of the two sorted sample lists.

#### Running the script
To run the script, log in to the farm as a user with permission to list files in S3 buckets via the `s3cmd` tool. Ensure that the manifest files are in the current working directory and named `<batch>.tsv`. Then run:

```bash
./check_missing_samples.sh <batch_1> <batch_2> ...
```

where `<batch_N>` is the name of a production batch, e.g. 1177-VO-ML-LEHMANN-VMF00015.
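
For reference, the script expects each manifest to be a tab-separated file with a header row followed by one row per sample, with the sample identifier in the first column. A minimal sketch (column names and sample identifiers here are hypothetical):

```
sample_id	extra_metadata
sample_0001	...
sample_0002	...
```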

### check_vcf_integrity.sh
The script `check_vcf_integrity.sh` checks the integrity of VCF files uploaded to S3. For a given batch, it checks every uploaded `.vcf.gz` file below a file-size threshold (2000000000 bytes, i.e. 2 GB) by comparing the number of rows for each chromosome against the expected number of rows. Any deviations are reported.
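
As a rough sketch of what the check does internally, the per-chromosome row counts for a single downloaded file can be reproduced with the pipeline below (the filename is a placeholder; `bgzip` is assumed to be available, e.g. via the samtools module the script loads):

```bash
# Column 1 of a VCF is CHROM; the file is sorted by chromosome, so counting
# consecutive repeats gives the number of rows per chromosome.
# -d drops the header lines, which each occur only once.
bgzip -dc <sample>.vcf.gz | cut -f 1 | uniq -cd
```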

#### Running the script
To run the script, log in to the farm as a user with permission to list and download files from S3 buckets via the `s3cmd` tool. You can run the script with `bsub` to ensure that it has sufficient resources and survives as a long-running job (change the `-q` argument of `bsub` as required).

```bash
bsub -J <job_name> -o %J.o -e %J.e -M2000 -R'select[mem>2000] rusage[mem=2000]' -q long "./check_vcf_integrity.sh <batch>"
```

where `<batch>` is the name of the production batch, e.g. 1177-VO-ML-LEHMANN-VMF00015. Check the job output file `<job_id>.o` to view a summary of the number of lines for each chromosome and any deviations from expected values. To see only warnings/deviations, you can run `grep -E '(No rows|unexpected number)'` on script output.
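
For example, to filter the job output file for problems only (`<job_id>` is the LSF job ID):

```bash
grep -E '(No rows|unexpected number)' <job_id>.o
```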

13 changes: 13 additions & 0 deletions scripts/post_pipeline_checks/check_missing_samples.sh
@@ -0,0 +1,13 @@
#!/usr/bin/env bash
# Compare the samples listed in a batch manifest (<batch>.tsv, in the current
# working directory) against the sample files present in the corresponding S3
# bucket, and report any differences.
#
# Usage: ./check_missing_samples.sh <batch_1> <batch_2> ...

detect_missing_samples() {
    local manifest=$1
    # Bucket side: strip the "s3://<bucket>/" prefix and the file extension to
    # recover sample names; manifest side: take column 1, skipping the header
    # row and blank lines.
    echo "Diffing ${manifest}. Bucket vs manifest sample count:" \
        "$(s3cmd ls "s3://${manifest}" | grep -Ev '^$|provenance' | sed 's:.*s3\://\(.*\)/\([^.]*\)\..*:\2:g' | sort -u | wc -l)" \
        "vs" \
        "$(tail -n +2 "${manifest}.tsv" | grep -v '^$' | awk '{print $1}' | sort -u | wc -l)"
    diff <(tail -n +2 "${manifest}.tsv" | grep -v '^$' | awk '{print $1}' | sort -u) \
         <(s3cmd ls "s3://${manifest}" | grep -Ev '^$|provenance' | sed 's:.*s3\://\(.*\)/\([^.]*\)\..*:\2:g' | sort -u)
}

for bucket in "$@"; do
    detect_missing_samples "${bucket}"
    echo
done
31 changes: 31 additions & 0 deletions scripts/post_pipeline_checks/check_vcf_integrity.sh
@@ -0,0 +1,31 @@
#!/usr/bin/env bash
# Check the integrity of the .vcf.gz files uploaded to S3 for a batch by
# comparing the number of rows per chromosome against expected values.
#
# Usage: ./check_vcf_integrity.sh <batch>

module load samtools/1.9

file_size_threshold_bytes=2000000000  # only files smaller than 2 GB are checked
batch=$1

# Expected number of VCF rows for each chromosome.
declare -A expected_rows_per_chromosome
expected_rows_per_chromosome=(['2R']=60132453 ['3R']=52226568 ['2L']=48525747 ['UNKN']=27274988 \
    ['3L']=40758473 ['X']=23385349 ['Y_unplaced']=135155 ['Mt']=15363)

# `s3cmd ls` prints: date time size path; column 3 is the size and column 4 the path.
for path in $(s3cmd ls "s3://${batch}" | grep '\.vcf\.gz$' | awk -v threshold="${file_size_threshold_bytes}" '$3 < threshold {print $4}'); do
    filename=${path##*/}
    echo "Downloading $filename ..."
    s3cmd get "${path}" > /dev/null 2>&1
    echo "Checking number of rows per chromosome in VCF ..."
    tmpfile=$(mktemp -p "$PWD" check.XXXXXX)
    # Column 1 of the VCF is CHROM; the file is sorted by chromosome, so counting
    # consecutive repeats gives the rows per chromosome (-d drops the unique header lines).
    bgzip -dc "$filename" | cut -f 1 | uniq -cd > "$tmpfile"
    for chr in "${!expected_rows_per_chromosome[@]}"; do
        # Exact match on the chromosome name in column 2 of the `uniq -c` output.
        rows=$(awk -v c="$chr" '$2 == c {print $1}' "$tmpfile")
        if [ -z "$rows" ]; then
            echo "No rows found for $chr in ${filename}!"
        elif [ "$rows" != "${expected_rows_per_chromosome[$chr]}" ]; then
            echo "An unexpected number of rows ($rows) found for $chr in ${filename}. Expected ${expected_rows_per_chromosome[$chr]}"
        fi
    done
    echo
    echo "Summary of rows per chromosome for $filename:"
    cat "$tmpfile"
    echo
    rm "$tmpfile"
done