-
Notifications
You must be signed in to change notification settings - Fork 3
/
reprocess_public_10x.sh
executable file
·84 lines (66 loc) · 3.83 KB
/
reprocess_public_10x.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#!/bin/bash
## this is a master script for one-pot download and processing of a 10x dataset
## usage: reprocess_public_10x.sh <series_id> [sample_list]  (see USAGE block below)
## in case of a particular sample not being recognized as 10x, fastq files would be gzipped (if necessary) and left alone;
## for now, tested use cases would be limited to a GSE (GEO series), E-MTAB (AE series), and a PRJ (BioProject).
## things that need to be installed - hopefully soon we'll pack them into one big Singularity image:
## samtools, seqtk, STAR 2.7.9a+, fastq-dump (= sra-tools), ffq, curl, gzip, wget, grep, Perl, 10x's bamtofastq, srapath
SERIES=$1 ## GSE, E-MTAB, or PRJ
SUBSET=$2 ## a list of sample IDs (have to be GSM, SRS, or ERS) to be processed (other samples will be ignored completely).
## no series ID given: print the help text to stderr and bail out
if [[ -z "$SERIES" ]]
then
  cat >&2 <<'EOF'
USAGE: reprocess_public_10x.sh <series_id> [sample_list]

This script should take a series ID (GSE from GEO, E-MTAB from ArrayExpress, or PRJ* from bioProjects/SRA),
pull/parse all relevant metadata, download/reformat all raw read files, and return fully reprocessed count matrices, generated by STARsolo.
(I won't promise you it will work every time, but I promise you it will try!)

In case you don't want all the samples from the series processed, provide a list of samples you do want as a second argument [optional].

Please visit https://github.com/cellgeni/reprocess_public_10x for installation and more info.
(c) Alexander Predeus, Sanger Institute, 2021-2023
EOF
  exit 1
fi
## Step 1: set everything up, collect metadata using ffq/curl from ENA, parse the outputs using custom scripts
>&2 echo "=============================== STEP 1: SETUP AND GATHER METADATA =================================="; >&2 echo
## refuse to clobber the results of a previous run: the series directory must not exist yet
if [[ -d "$SERIES" ]]
then
  >&2 echo "ERROR: Cannot create directory $SERIES because it already exists!"
  exit 1
fi
mkdir "$SERIES" || exit 1
if [[ -n "$SUBSET" ]]
then
  >&2 echo "WARNING: Using file $SUBSET to only process select samples!"
  ## resolve the subset file to an absolute path, since we cd into $SERIES below
  SUBSET=$(readlink -f "$SUBSET")
  ## the subset file must contain at least one GSM, SRS, or ERS ID to be usable
  if ! grep -q "^GSM" "$SUBSET" && ! grep -q "^SRS" "$SUBSET" && ! grep -q "^ERS" "$SUBSET"
  then
    >&2 echo "ERROR: The subset file $SUBSET can only contain GSM, SRS, or ERS IDs!"
    exit 1
  fi
fi
## locate the directory this script lives in, so the helper scripts can be copied from it
SDIR=$(dirname "$(readlink -f "$0")")
cd "$SERIES" || exit 1
cp "$SDIR"/scripts/* .
## after all scripts are copied and directories are checked, collect all metadata
## (pass $SUBSET only when it is non-empty, so the helper sees the same argument count as before)
./collect_metadata.sh "$SERIES" ${SUBSET:+"$SUBSET"}
>&2 echo; >&2 echo "=============================== STEP 2: DOWNLOAD ALL THE RELEVANT FILES =================================="; >&2 echo
## download all the necessary raw files using 'transfer' queue on Farm. Did we tell you this whole thing is Sanger-specific?
./continuous_download.sh "$SERIES"
>&2 echo; >&2 echo "=============================== STEP 3: CONVERT BAM/SRA -> FASTQ.GZ (if needed) =================================="; >&2 echo
## convert BAM and SRA files into properly formatted fastq.gz files. Rename frivolously named BAMs and SRAs along the way.
## only run the conversion when the parsed metadata table lists at least one BAM or SRA entry
if grep -qw "BAM$" "$SERIES.parsed.tsv" || grep -qw "SRA$" "$SERIES.parsed.tsv"
then
  ./convert_to_fastq.sh "$SERIES"
fi
>&2 echo; >&2 echo "=============================== STEP 4: GROUP FASTQ BY SAMPLE =================================="; >&2 echo
## at this point we check that every run (SRR/ERR) is matched with a pair of 10x approved fastq.gz files, and notify the user about ones that do not
## reorganize fastqs: 1) make fastqs dir; 2) move all proper files there; 3) group them by sample.
./reorganise_fastqs.sh "$SERIES"
>&2 echo; >&2 echo "=============================== STEP 5: RUN STARSOLO AND SOLO_QC =================================="; >&2 echo
## actually run STARsolo on all of them! (auto-cleanup?)
./run_starsolo.sh "$SERIES"
## run solo_QC.sh on all samples and save the summary table into the series directory
./solo_QC.sh > "$SERIES.solo_qc.tsv"
>&2 echo "ALL PROCESSING IS DONE FOR DATASET $SERIES!"