-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmapping.sh
executable file
·138 lines (120 loc) · 5.13 KB
/
mapping.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
#!/bin/bash
# This script does some first basic processing steps after receiving the data from ara.
# Input:
# - foldername containing: fastqs, guppy_logs, sequencing_summary.txt
# Output:
# - new folder containing the guppy logs
# - one fastq file (containing all the concatenated fastqs)
# - a sorted bam file contianing the alignment
coverageBool=false
while getopts hcn:f: opt
do
case $opt in
h )
echo "Usage: mapping.sh [-c] -n -f <folder> "
echo " -h Display this help message and quit"
echo " -f specify the name of the folder in which the data is located."
echo " -n specify the name of the fastq, which will be used as sample name."
echo " -c optional parameter; If -c is set, the coverage will be calculated."
exit 0
;;
f) DIR=$OPTARG;;
n) SAMPLE=$OPTARG;;
c) coverageBool=true;;
\?)
echo "Invalid Option: -$OPTARG" 1>&2
exit 1
;;
esac
done
GENOME='/data/fass1/genomes/Eukaryots/homo_sapiens_done/ucsc/hg38.fa'
cpgIslands="/data/dessertlocal/daria/cpgIslands"
scripts="/home/nu36par/scripts"
# Create a folder guppy_logs and move all guppy logs in there
if [ -d "${DIR}/guppy_logs" ]; then
### Warning: Folder already exists ###
echo "Warning: ${DIR}/guppy_logs exists already. Cannot move Guppy log files."
echo "Skip moving guppy logs."
else
### Control will jump here if $DIR does NOT exists ###
echo "Copying guppy logs to new folder ${DIR}/guppy_logs..."
mkdir "${DIR}/guppy_logs"
mv $DIR/guppy_basecaller_log*.log $DIR/guppy_logs/
fi
# # check whether an outfile exists already which would be overwritten
# if [[ -f "${DIR}/output.fastq" ]]; then
# ### Error if file already exists ###
# echo "Warning: The file ${DIR}/output.fastq already exists already. This file will be used for the following steps!"
# echo "Skip concatenating fastq files!"
# else
# ### Concatenate all fastq files ###
# echo "## Concatenating fastq files"
# mv $DIR/${SAMPLE}.fastq > $DIR/output.fastq
# # rm $DIR/fastq_runid*.fastq
# fi
## mapping
if [[ -f "${DIR}/${SAMPLE}.sorted.bam" ]]; then
### Error if file already exists ###
echo "Warning: The file ${DIR}/${SAMPLE}.sorted.bam exists already. This file will be used for the following steps!"
echo "Skip mapping!"
else
if [[ -f "${DIR}/${SAMPLE}.fastq" ]]; then
echo "## Mapping with minimap2 ..."
nice minimap2 -a -x map-ont $GENOME "${DIR}/${SAMPLE}.fastq" | samtools sort -T tmp -o "${DIR}/${SAMPLE}.sorted.bam"
samtools index "${DIR}/${SAMPLE}.sorted.bam"
else
echo "Warning: The file ${DIR}/${SAMPLE}.fastq does not exist! Cannot do the mapping."
fi
fi
## quality control
if [[ -f "${DIR}/sequencing_summary.txt" ]]; then
if [[ -f "${DIR}/pycoQC.html" ]]; then
echo "pycoQC report already exist. Skippeng Quality Control"
else
source /home/nu36par/miniconda3/etc/profile.d/conda.sh
conda activate pycoQC
echo "## Quality Control with pycoQC ..."
pycoQC -f "${DIR}/sequencing_summary.txt" -a "${DIR}/${SAMPLE}.sorted.bam" -o "${DIR}/${SAMPLE}_pycoQC.html"
fi
else
echo "Warning: The file ${DIR}/sequencing_summary.txt does not exist. Therefore no pycoQC-Report can be generated"
echo "Skip quality contol!"
fi
# get amount of reads
echo "Amount of basecalled reads:"
awk '{s++}END{print s/4}' "${DIR}/${SAMPLE}.fastq" # Anzahl der Zeilen /4
# get amount of bases
echo "Amount of basecalled bases:"
awk 'BEGIN{sum=0;}{if(NR%4==2){sum+=length($0);}}END{print sum;}' "${DIR}/${SAMPLE}.fastq"
## calculate coverage if wanted
if "$coverageBool"; then
echo "## Calculating the coverage..."
source /home/nu36par/miniconda3/etc/profile.d/conda.sh
conda activate coverage
mosdepth --no-per-base --by $cpgIslands/humanCpG_islands.bed ${DIR}/${SAMPLE} ${DIR}/${SAMPLE}.sorted.bam
# get coverage of the human genome:
echo "Coverage of the human genome on average:"
tail -n2 ${DIR}/${SAMPLE}.mosdepth.summary.txt | head -n 1 | cut -f4
## Coverage per CpG island
echo "Coverage over all CpG islands:"
tail -n1 ${DIR}/${SAMPLE}.mosdepth.summary.txt | cut -f4
fi
# ## calculate coverage if wanted
# if "$coverageBool"; then
# echo "## Calculating the coverage..."
# # get coverage on CpG islands
# echo "Coverage of CpG islands on average:"
# samtools depth -a -b $cpgIslands/humanCpG_islands.bed $DIR/${SAMPLE}.sorted.bam | awk 'BEGIN{SUM=0}{sum+=$3}END{print sum/24200434}'
# # get coverage of the human genome:
# echo "Coverage of the human genome on average:"
# hg38_size=$(grep -v ">" $GENOME | wc | awk '{print $3-$1}') # 3257347282
# samtools depth "${DIR}/${SAMPLE}.sorted.bam" | awk -v hsa_size=$hg38_size '{sum+=$3} END { print "Sum = ",sum,"Average = ",sum/hsa_size}'
# ## Coverage per CpG island
# nice bash $scripts/getCoverage_samtoolsUCSC.sh -p $cpgIslands/humanCpG_islands.bed -b "${DIR}/${SAMPLE}.sorted.bam" -o "${DIR}/cgiCoverage.txt"
# # get amount of reads
# echo "Amount of basecalled reads:"
# awk '{s++}END{print s/4}' "${DIR}/${SAMPLE}.fastq" # Anzahl der Zeilen /4
# # get amount of bases
# echo "Amount of basecalled bases:"
# awk 'BEGIN{sum=0;}{if(NR%4==2){sum+=length($0);}}END{print sum;}' "${DIR}/${SAMPLE}.fastq"
# fi