-
Notifications
You must be signed in to change notification settings - Fork 3
/
reprocess_public_10x.sh
executable file
·84 lines (66 loc) · 3.83 KB
/
reprocess_public_10x.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#!/bin/bash
## this is a master script for one-pot download and processing of a 10x dataset
## usage: reprocess_public_10x.sh <series_id> [sample_list]  (see USAGE block below)
## in case of a particular sample not being recognized as 10x, fastq files would be gzipped (if necessary) and left alone;
## for now, tested use cases would be limited to a GSE (GEO series), E-MTAB (AE series), and a PRJ (BioProject).
## things that need to be installed - hopefully soon we'll pack them into one big Singularity image:
## samtools, seqtk, STAR 2.7.9a+, fastq-dump (= sra-tools), ffq, curl, gzip, wget, grep, Perl, 10x's bamtofastq, srapath
SERIES=$1 ## GSE, E-MTAB, or PRJ
SUBSET=$2 ## a list of sample IDs (have to be GSM, SRS, or ERS) to be processed (other samples will be ignored completely).
## no series ID given: print the help text to stderr and bail out
if [[ -z "$SERIES" ]]
then
  cat >&2 <<'EOF'
USAGE: reprocess_public_10x.sh <series_id> [sample_list]

This script should take a series ID (GSE from GEO, E-MTAB from ArrayExpress, or PRJ* from bioProjects/SRA),
pull/parse all relevant metadata, download/reformat all raw read files, and return fully reprocessed count matrices, generated by STARsolo.
(I won't promise you it will work every time, but I promise you it will try!)

In case you don't want all the samples from the series processed, provide a list of samples you do want as a second argument [optional].

Please visit https://github.com/cellgeni/reprocess_public_10x for installation and more info.
(c) Alexander Predeus, Sanger Institute, 2021-2023
EOF
  exit 1
fi
## Step 1: set everything up, collect metadata using ffq/curl from ENA, parse the outputs using custom scripts
>&2 echo "=============================== STEP 1: SETUP AND GATHER METADATA =================================="; >&2 echo
## refuse to clobber the results of a previous run: the series directory must not exist yet
if [[ -d "$SERIES" ]]
then
  >&2 echo "ERROR: Cannot create directory $SERIES because it already exists!"
  exit 1
fi
mkdir "$SERIES" || exit 1
if [[ -n "$SUBSET" ]]
then
  >&2 echo "WARNING: Using file $SUBSET to only process select samples!"
  ## resolve the subset file to an absolute path, since we cd into $SERIES below
  SUBSET=$(readlink -f "$SUBSET")
  ## the subset file must contain at least one GSM, SRS, or ERS ID to be usable
  if ! grep -q "^GSM" "$SUBSET" && ! grep -q "^SRS" "$SUBSET" && ! grep -q "^ERS" "$SUBSET"
  then
    >&2 echo "ERROR: The subset file $SUBSET can only contain GSM, SRS, or ERS IDs!"
    exit 1
  fi
fi
## locate the directory this script lives in, so the helper scripts can be copied from it
SDIR=$(dirname "$(readlink -f "$0")")
cd "$SERIES" || exit 1
cp "$SDIR"/scripts/* .
## after all scripts are copied and directories are checked, collect all metadata
## (pass $SUBSET only when it is non-empty, so the helper sees the same argument count as before)
./collect_metadata.sh "$SERIES" ${SUBSET:+"$SUBSET"}
>&2 echo; >&2 echo "=============================== STEP 2: DOWNLOAD ALL THE RELEVANT FILES =================================="; >&2 echo
## download all the necessary raw files using 'transfer' queue on Farm. Did we tell you this whole thing is Sanger-specific?
./continuous_download.sh "$SERIES"
>&2 echo; >&2 echo "=============================== STEP 3: CONVERT BAM/SRA -> FASTQ.GZ (if needed) =================================="; >&2 echo
## convert BAM and SRA files into properly formatted fastq.gz files. Rename frivolously named BAMs and SRAs along the way.
## only run the conversion when the parsed metadata table lists at least one BAM or SRA entry
if grep -qw "BAM$" "$SERIES.parsed.tsv" || grep -qw "SRA$" "$SERIES.parsed.tsv"
then
  ./convert_to_fastq.sh "$SERIES"
fi
>&2 echo; >&2 echo "=============================== STEP 4: GROUP FASTQ BY SAMPLE =================================="; >&2 echo
## at this point we check that every run (SRR/ERR) is matched with a pair of 10x approved fastq.gz files, and notify the user about ones that do not
## reorganize fastqs: 1) make fastqs dir; 2) move all proper files there; 3) group them by sample.
./reorganise_fastqs.sh "$SERIES"
>&2 echo; >&2 echo "=============================== STEP 5: RUN STARSOLO AND SOLO_QC =================================="; >&2 echo
## actually run STARsolo on all of them! (auto-cleanup?)
./run_starsolo.sh "$SERIES"
## run solo_QC.sh on all samples and save the summary table into the series directory
./solo_QC.sh > "$SERIES.solo_qc.tsv"
>&2 echo "ALL PROCESSING IS DONE FOR DATASET $SERIES!"