T. repens mapping

# First, a conda environment was established with all HISAT workflow tools and dependancies (called 'rna-seq')

========================================================================================================================
# Extracting splice sites from the T. repens GTF (The python script used here are within the HISAT2 package)
extract_splice_sites.py /Volumes/userdata/student_users/olivernewman/2023/Reference_genomes/Trifolium_repens/Trifolium_repens/repens.fixed.gtf >/Volumes/userdata/student_users/olivernewman/2023/RNA_Seq_HISAT/index/T_repens.ss

# Extracting splice sites from the T. repens GTF (The python script used here are within the HISAT2 package)
extract_exons.py /Volumes/userdata/student_users/olivernewman/2023/Reference_genomes/Trifolium_repens/Trifolium_repens/repens.fixed.gtf >/Volumes/userdata/student_users/olivernewman/2023/RNA_Seq_HISAT/index/T_repens.exon

# These two commands gave empty output files because the repens.fixed.gtf file only specified CDSs (no exons). Therefore the following command was 
unable to use --ss and --exon parameters
========================================================================================================================

# cd to the /index directory
# Index the reference genome and give it the name "TrR_v5_trans"
hisat2-build -p 16 /Volumes/userdata/student_users/olivernewman/2023/Reference_genomes/Trifolium_repens/Trifolium_repens/TrR.v5.fasta TrR_v5_trans

# Map RNA reads to this indexed reference genome (TrR_v5_trans)
# This has to be run for each of the 10 samples
hisat2 -p 20 --dta -x index/TrR_v5_trans -1 ../Paired_and_Unpaired_trimmed/Paired_R1s/Sample_1_R1_Paired_trimmed.fastq -2 ../Paired_and_Unpaired_trimmed/Paired_R2s/Sample_1_R2_Paired_trimmed.fastq -S $mapped_to_TrR_v5_trans/Sample_1.sam

# Sort SAM files into BAM files
# This has to be run for each of the 10 samples
samtools sort -@ 20 -o Sample_1.bam Sample_1.sam

# Assembling transcripts
# This has to be run for each of the 10 samples
stringtie Sample_1.bam -l Sample_1 -p 20 -G ../../Reference_genomes/Trifolium_repens/Trifolium_repens/repens.fixed.gtf -o sample_1.gtf

# Merging sample GTFs together
# 'mergelist.txt' looks like so:
#    sample_1.gtf
#    ...
#    sample_10.gtf
stringtie --merge -p 20 -G ../../Reference_genomes/Trifolium_repens/Trifolium_repens/repens.fixed.gtf -o stringtie_repens_merged.gtf mergelist.txt

# Check how many transcripts are in the new merged GTF
cat stringtie_repens_merged.gtf | grep -v "^#" | awk '$3=="transcript" {print}' | wc -l

# Compare the StringTie transcripts to known transcripts using gffcompare
gffcompare -r ../../Reference_genomes/Trifolium_repens/Trifolium_repens/repens.fixed.gtf -G -o merged stringtie_repens_merged.gtf
# Note:
    68557 reference transcripts loaded.
    23 duplicate reference transcripts discarded.
    124578 query transfrags loaded.
    # Check the stats of the comparison
    # This will be relavent to write about (check what is written here https://bioinfo-dirty-jobs.github.io/rana2//lectures/07.rnaseq_hisat2/ when deciphering stats)
    cat GTF_stuff/merged.stats


# Then estimate the abundances using the estimate_abundance.sh script that was tailored to my samples
# Make sure that each samples specific gtf file is contained in a separate sample directory; this is where the abundance outputs will be placed
./Volumes/archive/userdata/student_users/olivernewman/2023/RNA_Seq_HISAT/estimate_abundance.sh

========================================================================================================================

## Now enter an R environment and load the required packages, then load the phenodata (which is a list of the samples (Sample_N1-5, Sample_P1-5, with "None" or "High" in the second column)
> library(genefilter)
> library(RSkittleBrewer)                                                                                                                                                                                   
> library(devtools)
> library(dplyr)

> pheno_data <- read.csv('/Volumes/archive/userdata/student_users/olivernewman/2023/RNA_Seq_HISAT/Original_GTFs_and_sampleInfo/Sample_pheno_info.csv')                                                      
> head(pheno_data)
> dim(pheno_data)

# Load the abundance data from each 'estimate_abundance.sh' line
> bg_F2_plants <- ballgown(dataDir='/Volumes/archive/userdata/student_users/olivernewman/2023/RNA_Seq_HISAT/mapped_to_TrR_v5_trans/GTF_stuff/abundance',samplePattern='Sample',pData=pheno_data)
> class(bg_F2_plants)
[1] "ballgown"

> methods(class='ballgown')
 [1] dirs            eexpr           expr            expr<-         
 [5] geneIDs         geneNames       gexpr           iexpr          
 [9] indexes         indexes<-       mergedDate      pData          
[13] pData<-         sampleNames     seqnames        show           
[17] structure       subset          texpr           transcriptIDs  
[21] transcriptNames
see '?methods' for accessing help and source code

# Note here that I am more interested in gexpr (gene expression) and texpr (transcript expression) ^
# which would be gexpr(bg_F2_plants) and texpr(bg_F2_plants)

# Filter out transcripts with low variance
> bg_F2_plants_filt <- subset(bg_F2_plants,"rowVars(texpr(bg_F2_plants))>1",genomesubset=TRUE)

# Now call the DE analysis of transcripts and genes
> de_transcripts <- stattest(bg_F2_plants_filt,feature='transcript',covariate='pigmentation',getFC=TRUE,meas='FPKM')
> dim(de_transcripts)
[1] 37354     5
> head(de_transcripts)

> de_genes <- stattest(bg_F2_plants_filt,feature='gene',covariate='pigmentation',getFC=TRUE,meas='FPKM')
> dim(de_genes)
[1] 23843     5

# Get the geneNames from the origninal ballgown object (ballgown::)
> de_transcripts = data.frame(geneNames=ballgown::geneNames(bg_F2_plants_filt),geneIDs=ballgown::geneIDs(bg_F2_plants_filt),de_transcripts)
> head(de_transcripts)

> bg_filt_table=texpr(bg_F2_plants_filt,'all')
> gene_names=unique(bg_filt_table[,9:10])
> head(gene_names)
> features=de_genes$id
> mapped_gene_names=vector()

> for (i in features) {query=gene_names%>%filter(gene_id==i & gene_name !='.'); n_hit=dim(query)[1];if (n_hit ==1){mapped_gene_names=append(mapped_gene_names,query$gene_name[[1]]) } else {mapped_gene_names=append(mapped_gene_names,'.')}}
> de_genes$gene_name <- mapped_gene_names
> de_genes <- de_genes[,c('feature','gene_name','id','fc','pval','qval')]
> head(de_genes)

# Generate log2FC's from the 'fc' column
> de_transcripts[,'log2fc'] <- log2(de_transcripts[,'fc'])
> de_genes[,'log2fc'] <- log2(de_genes[,'fc'])

> de_transcripts <- de_transcripts[, c('geneNames','geneIDs','feature','id','fc',log2fc','pval','qval')]                                                                   
> de_genes <- de_genes[, c('feature','gene_name','id','fc','log2fc','pval','qval')]

> de_transcripts = arrange(de_transcripts,pval)
> de_genes = arrange(de_genes,pval)

> write.csv(de_transcripts,'de_transcripts_unfiltered_repens_ballgown.csv',row.names=FALSE)
> write.csv(de_genes,'de_genes_unfiltered_repens_ballgown.csv',row.names=FALSE)

> table(de_transcripts$qval < 0.05)
FALSE  TRUE 
37247   107

> table(de_genes$qval < 0.05)
FALSE  TRUE 
23729   114 

> de_transcripts %>% filter(qval < 0.05)

> subset_transcripts <- subset(de_transcripts,de_transcripts$qval < 0.05)
> subset_genes <- subset(de_genes,de_genes$qval < 0.05)

> write.csv(subset_transcripts,'de_transcripts_filtered_repens_ballgown.csv',row.names=FALSE)
> write.csv(subset_genes,'de_genes_filtered_repens_ballgown.csv',row.names=FALSE)

> fpkm_transcripts <- texpr(bg_F2_plants_filt,'all') 
> write.csv(fpkm_transcripts,'fpkm_transcripts_repens.csv',row.names=FALSE)                                                  

> fpkm_genes <- gexpr(bg_F2_plants_filt)                                              
> write.csv(fpkm_genes,'fpkm_genes_repens.csv',row.names=TRUE)


# Exit the R environment
# Use grep to extract the ref_id values (and stringTie ids if there is no ref_id available) from the stringTie merged GTF file
grep -v '^#' stringtie_repens_merged.gtf | perl -F'\t' -lane 'BEGIN{%rs=();print "id\tref_id";}{$comment=$F[8];($gid)=($comment=~/gene_id \"([^\"]+)\"/);($rgid)=($comment=~/ref_gene_id \"([^\"]+)\"/);$rgid=(defined $rgid)?$rgid:$gid;$str="$gid\t$rgid";$rs{$str}=1;}END{for $s (keys(%rs)){print $s;}}'> out.txt


# Use cat to cut the 3rd column (gene_ids) from the ballgown DEG table
cat de_genes_filtered_repens_ballgown.csv | csvtk cut -f 3 > de_genes_filtered_repens_ballgown_Ids.txt

# Use grep and the DEG ids (now in de_genes_filtered_repens_ballgown_Ids.txt) to pull a subset of transcripts out of the stringtie_repens_merged.gtf
# This subset should only contain sequences that correspond to ids stated in the de_genes_filtered_repens_ballgown_Ids.txt file (make sure to double check this)
grep -w -f de_genes_filtered_repens_ballgown_Ids.txt ../mapped_to_TrR_v5_trans/GTF_stuff_and_abundances/stringtie_repens_merged.gtf > sub_set.gtf

# Use bedtools to get the fasta sequences (from the TrR.v5.fasta) that correspond to the transcripts stated in the sub_set.gtf
# Double check that the sub_set.fa.out file only contains ids that correspond to the transcripts stated in the sub_set.gtf
bedtools getfasta -fi ../index/TrR.v5.fasta -bed sub_set.gtf -name -fo sub_set.fa.out


# Mapping 2022 data (Gr15 and P12 paired trimmed fastqs) to TrR.v5.fasta
(rna-seq) -bash-4.2$ hisat2 -p 20 --dta -x ../../index/TrR_v5_trans -1 ../../../../Paired_trimmed_for_Trinity/Gr_15_R1_Paired_trimmed.fq -2 ../../../../Paired_trimmed_for_Trinity/Gr_15_R2_Paired_trimmed.fq -S Gr15.sam  
(rna-seq) -bash-4.2$ hisat2 -p 20 --dta -x ../../index/TrR_v5_trans -1 ../../../../Paired_trimmed_for_Trinity/P_12_R1_Paired_trimmed.fq -2 ../../../../Paired_trimmed_for_Trinity/P_12_R2_Paired_trimmed.fq -S P12.sam


# Sequences of interest that were not called by the ballgown DE analysis, e.g., MYB133 and TrWDR, were blatted against the TrR.v5.fasta reference genome and then coordinates viewed in IGV with samples loaded
blat ../Reference_genomes/Trifolium_repens/Trifolium_repens/TrR.v5.fasta TrWDR_protein_mRNA.fasta Blat_output_TrWDR_mRNA_TrR_v5.txt