-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathT. repens mapping
176 lines (126 loc) · 9.97 KB
/
T. repens mapping
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
# First, a conda environment was established with all HISAT workflow tools and dependancies (called 'rna-seq')
========================================================================================================================
# Extracting splice sites from the T. repens GTF (The python script used here are within the HISAT2 package)
extract_splice_sites.py /Volumes/userdata/student_users/olivernewman/2023/Reference_genomes/Trifolium_repens/Trifolium_repens/repens.fixed.gtf >/Volumes/userdata/student_users/olivernewman/2023/RNA_Seq_HISAT/index/T_repens.ss
# Extracting splice sites from the T. repens GTF (The python script used here are within the HISAT2 package)
extract_exons.py /Volumes/userdata/student_users/olivernewman/2023/Reference_genomes/Trifolium_repens/Trifolium_repens/repens.fixed.gtf >/Volumes/userdata/student_users/olivernewman/2023/RNA_Seq_HISAT/index/T_repens.exon
# These two commands gave empty output files because the repens.fixed.gtf file only specified CDSs (no exons). Therefore the following command was
unable to use --ss and --exon parameters
========================================================================================================================
# cd to the /index directory
# Index the reference genome and give it the name "TrR_v5_trans"
hisat2-build -p 16 /Volumes/userdata/student_users/olivernewman/2023/Reference_genomes/Trifolium_repens/Trifolium_repens/TrR.v5.fasta TrR_v5_trans
# Map RNA reads to this indexed reference genome (TrR_v5_trans)
# This has to be run for each of the 10 samples
hisat2 -p 20 --dta -x index/TrR_v5_trans -1 ../Paired_and_Unpaired_trimmed/Paired_R1s/Sample_1_R1_Paired_trimmed.fastq -2 ../Paired_and_Unpaired_trimmed/Paired_R2s/Sample_1_R2_Paired_trimmed.fastq -S $mapped_to_TrR_v5_trans/Sample_1.sam
# Sort SAM files into BAM files
# This has to be run for each of the 10 samples
samtools sort -@ 20 -o Sample_1.bam Sample_1.sam
# Assembling transcripts
# This has to be run for each of the 10 samples
stringtie Sample_1.bam -l Sample_1 -p 20 -G ../../Reference_genomes/Trifolium_repens/Trifolium_repens/repens.fixed.gtf -o sample_1.gtf
# Merging sample GTFs together
# 'mergelist.txt' looks like so:
# sample_1.gtf
# ...
# sample_10.gtf
stringtie --merge -p 20 -G ../../Reference_genomes/Trifolium_repens/Trifolium_repens/repens.fixed.gtf -o stringtie_repens_merged.gtf mergelist.txt
# Check how many transcripts are in the new merged GTF
cat stringtie_repens_merged.gtf | grep -v "^#" | awk '$3=="transcript" {print}' | wc -l
# Compare the StringTie transcripts to known transcripts using gffcompare
gffcompare -r ../../Reference_genomes/Trifolium_repens/Trifolium_repens/repens.fixed.gtf -G -o merged stringtie_repens_merged.gtf
# Note:
68557 reference transcripts loaded.
23 duplicate reference transcripts discarded.
124578 query transfrags loaded.
# Check the stats of the comparison
# This will be relavent to write about (check what is written here https://bioinfo-dirty-jobs.github.io/rana2//lectures/07.rnaseq_hisat2/ when deciphering stats)
cat GTF_stuff/merged.stats
# Then estimate the abundances using the estimate_abundance.sh script that was tailored to my samples
# Make sure that each samples specific gtf file is contained in a separate sample directory; this is where the abundance outputs will be placed
./Volumes/archive/userdata/student_users/olivernewman/2023/RNA_Seq_HISAT/estimate_abundance.sh
========================================================================================================================
## Now enter an R environment and load the required packages, then load the phenodata (which is a list of the samples (Sample_N1-5, Sample_P1-5, with "None" or "High" in the second column)
> library(genefilter)
> library(RSkittleBrewer)
> library(devtools)
> library(dplyr)
> pheno_data <- read.csv('/Volumes/archive/userdata/student_users/olivernewman/2023/RNA_Seq_HISAT/Original_GTFs_and_sampleInfo/Sample_pheno_info.csv')
> head(pheno_data)
> dim(pheno_data)
# Load the abundance data from each 'estimate_abundance.sh' line
> bg_F2_plants <- ballgown(dataDir='/Volumes/archive/userdata/student_users/olivernewman/2023/RNA_Seq_HISAT/mapped_to_TrR_v5_trans/GTF_stuff/abundance',samplePattern='Sample',pData=pheno_data)
> class(bg_F2_plants)
[1] "ballgown"
> methods(class='ballgown')
[1] dirs eexpr expr expr<-
[5] geneIDs geneNames gexpr iexpr
[9] indexes indexes<- mergedDate pData
[13] pData<- sampleNames seqnames show
[17] structure subset texpr transcriptIDs
[21] transcriptNames
see '?methods' for accessing help and source code
# Note here that I am more interested in gexpr (gene expression) and texpr (transcript expression) ^
# which would be gexpr(bg_F2_plants) and texpr(bg_F2_plants)
# Filter out transcripts with low variance
> bg_F2_plants_filt <- subset(bg_F2_plants,"rowVars(texpr(bg_F2_plants))>1",genomesubset=TRUE)
# Now call the DE analysis of transcripts and genes
> de_transcripts <- stattest(bg_F2_plants_filt,feature='transcript',covariate='pigmentation',getFC=TRUE,meas='FPKM')
> dim(de_transcripts)
[1] 37354 5
> head(de_transcripts)
> de_genes <- stattest(bg_F2_plants_filt,feature='gene',covariate='pigmentation',getFC=TRUE,meas='FPKM')
> dim(de_genes)
[1] 23843 5
# Get the geneNames from the origninal ballgown object (ballgown::)
> de_transcripts = data.frame(geneNames=ballgown::geneNames(bg_F2_plants_filt),geneIDs=ballgown::geneIDs(bg_F2_plants_filt),de_transcripts)
> head(de_transcripts)
> bg_filt_table=texpr(bg_F2_plants_filt,'all')
> gene_names=unique(bg_filt_table[,9:10])
> head(gene_names)
> features=de_genes$id
> mapped_gene_names=vector()
> for (i in features) {query=gene_names%>%filter(gene_id==i & gene_name !='.'); n_hit=dim(query)[1];if (n_hit ==1){mapped_gene_names=append(mapped_gene_names,query$gene_name[[1]]) } else {mapped_gene_names=append(mapped_gene_names,'.')}}
> de_genes$gene_name <- mapped_gene_names
> de_genes <- de_genes[,c('feature','gene_name','id','fc','pval','qval')]
> head(de_genes)
# Generate log2FC's from the 'fc' column
> de_transcripts[,'log2fc'] <- log2(de_transcripts[,'fc'])
> de_genes[,'log2fc'] <- log2(de_genes[,'fc'])
> de_transcripts <- de_transcripts[, c('geneNames','geneIDs','feature','id','fc',log2fc','pval','qval')]
> de_genes <- de_genes[, c('feature','gene_name','id','fc','log2fc','pval','qval')]
> de_transcripts = arrange(de_transcripts,pval)
> de_genes = arrange(de_genes,pval)
> write.csv(de_transcripts,'de_transcripts_unfiltered_repens_ballgown.csv',row.names=FALSE)
> write.csv(de_genes,'de_genes_unfiltered_repens_ballgown.csv',row.names=FALSE)
> table(de_transcripts$qval < 0.05)
FALSE TRUE
37247 107
> table(de_genes$qval < 0.05)
FALSE TRUE
23729 114
> de_transcripts %>% filter(qval < 0.05)
> subset_transcripts <- subset(de_transcripts,de_transcripts$qval < 0.05)
> subset_genes <- subset(de_genes,de_genes$qval < 0.05)
> write.csv(subset_transcripts,'de_transcripts_filtered_repens_ballgown.csv',row.names=FALSE)
> write.csv(subset_genes,'de_genes_filtered_repens_ballgown.csv',row.names=FALSE)
> fpkm_transcripts <- texpr(bg_F2_plants_filt,'all')
> write.csv(fpkm_transcripts,'fpkm_transcripts_repens.csv',row.names=FALSE)
> fpkm_genes <- gexpr(bg_F2_plants_filt)
> write.csv(fpkm_genes,'fpkm_genes_repens.csv',row.names=TRUE)
# Exit the R environment
# Use grep to extract the ref_id values (and stringTie ids if there is no ref_id available) from the stringTie merged GTF file
grep -v '^#' stringtie_repens_merged.gtf | perl -F'\t' -lane 'BEGIN{%rs=();print "id\tref_id";}{$comment=$F[8];($gid)=($comment=~/gene_id \"([^\"]+)\"/);($rgid)=($comment=~/ref_gene_id \"([^\"]+)\"/);$rgid=(defined $rgid)?$rgid:$gid;$str="$gid\t$rgid";$rs{$str}=1;}END{for $s (keys(%rs)){print $s;}}'> out.txt
# Use cat to cut the 3rd column (gene_ids) from the ballgown DEG table
cat de_genes_filtered_repens_ballgown.csv | csvtk cut -f 3 > de_genes_filtered_repens_ballgown_Ids.txt
# Use grep and the DEG ids (now in de_genes_filtered_repens_ballgown_Ids.txt) to pull a subset of transcripts out of the stringtie_repens_merged.gtf
# This subset should only contain sequences that correspond to ids stated in the de_genes_filtered_repens_ballgown_Ids.txt file (make sure to double check this)
grep -w -f de_genes_filtered_repens_ballgown_Ids.txt ../mapped_to_TrR_v5_trans/GTF_stuff_and_abundances/stringtie_repens_merged.gtf > sub_set.gtf
# Use bedtools to get the fasta sequences (from the TrR.v5.fasta) that correspond to the transcripts stated in the sub_set.gtf
# Double check that the sub_set.fa.out file only contains ids that correspond to the transcripts stated in the sub_set.gtf
bedtools getfasta -fi ../index/TrR.v5.fasta -bed sub_set.gtf -name -fo sub_set.fa.out
# Mapping 2022 data (Gr15 and P12 paired trimmed fastqs) to TrR.v5.fasta
(rna-seq) -bash-4.2$ hisat2 -p 20 --dta -x ../../index/TrR_v5_trans -1 ../../../../Paired_trimmed_for_Trinity/Gr_15_R1_Paired_trimmed.fq -2 ../../../../Paired_trimmed_for_Trinity/Gr_15_R2_Paired_trimmed.fq -S Gr15.sam
(rna-seq) -bash-4.2$ hisat2 -p 20 --dta -x ../../index/TrR_v5_trans -1 ../../../../Paired_trimmed_for_Trinity/P_12_R1_Paired_trimmed.fq -2 ../../../../Paired_trimmed_for_Trinity/P_12_R2_Paired_trimmed.fq -S P12.sam
# Sequences of interest that were not called by the ballgown DE analysis, e.g., MYB133 and TrWDR, were blatted against the TrR.v5.fasta reference genome and then coordinates viewed in IGV with samples loaded
blat ../Reference_genomes/Trifolium_repens/Trifolium_repens/TrR.v5.fasta TrWDR_protein_mRNA.fasta Blat_output_TrWDR_mRNA_TrR_v5.txt