RNA-Seq de novo workflow

# Constructing 6-sample transcriptome (samples 1, 3, 5, 6, 8, 10; 3 no-pigment samples and 3 max-pigment samples)
Trinity --seqType fq --max_memory 64G --left /Volumes/scratch/brownfieldlab/Ollie/2023/Paired_R1s/Sample_1_R1_Paired_trimmed.fastq,/Volumes/scratch/brownfieldlab/Ollie/2023/Paired_R1s/Sample_3_R1_Paired_trimmed.fastq,/Volumes/scratch/brownfieldlab/Ollie/2023/Paired_R1s/Sample_5_R1_Paired_trimmed.fastq,/Volumes/scratch/brownfieldlab/Ollie/2023/Paired_R1s/Sample_6_R1_Paired_trimmed.fastq,/Volumes/scratch/brownfieldlab/Ollie/2023/Paired_R1s/Sample_8_R1_Paired_trimmed.fastq,/Volumes/scratch/brownfieldlab/Ollie/2023/Paired_R1s/Sample_10_R1_Paired_trimmed.fastq --right /Volumes/scratch/brownfieldlab/Ollie/2023/Paired_R2s/Sample_1_R2_Paired_trimmed.fastq,/Volumes/scratch/brownfieldlab/Ollie/2023/Paired_R2s/Sample_3_R2_Paired_trimmed.fastq,/Volumes/scratch/brownfieldlab/Ollie/2023/Paired_R2s/Sample_5_R2_Paired_trimmed.fastq,/Volumes/scratch/brownfieldlab/Ollie/2023/Paired_R2s/Sample_6_R2_Paired_trimmed.fastq,/Volumes/scratch/brownfieldlab/Ollie/2023/Paired_R2s/Sample_8_R2_Paired_trimmed.fastq,/Volumes/scratch/brownfieldlab/Ollie/2023/Paired_R2s/Sample_10_R2_Paired_trimmed.fastq --CPU 32 --output trinity_de_novo_6_samples --verbose

# Mapping paired-trimmed fastq files to 6-sample transcriptome, and then estimating abundance of reads per isoform
# The following command was run for each of the 10 samples
# In each RSEM output directory (one per each of the 10 samples), there is a RSEM_*sample_ID*.isoforms.results file and a RSEM.genes.results file
# I used the RSEM_*sample_ID*.isoforms.results files for subsequent matrix construction
/Volumes/archive/userdata/student_users/olivernewman/Downloads/trinityrnaseq-v2.14.0/util/align_and_estimate_abundance.pl --transcripts trinity_de_novo_6_samples/Trinity.fasta --seqType fq --left Paired_R1s_scratch/Sample_1_R1_Paired_trimmed.fastq --right Paired_R2s_scratch/Sample_1_R2_Paired_trimmed.fastq --est_method RSEM --aln_method bowtie2 --trinity_mode --prep_reference --output_dir RSEM_sample_1_using_6SampleTranscriptome

# Generating a count matrix using RSEM method
# The resulting matrix (RSEM.isoform.counts.matrix) is what was used in my edgeR script
/Volumes/archive/userdata/student_users/olivernewman/Downloads/trinityrnaseq-v2.14.0/util/abundance_estimates_to_matrix.pl --est_method RSEM --gene_trans_map /Volumes/archive/scratch/brownfieldlab/Ollie/2023/trinity_de_novo_6_samples/Trinity.fasta.gene_trans_map --name_sample_by_basedir RSEM_sample_1_using_6SampleTranscriptome/RSEM_S1.isoforms.results RSEM_sample_2_using_6SampleTranscriptome/RSEM_S2.isoforms.results RSEM_sample_3_using_6SampleTranscriptome/RSEM_S3.isoforms.results RSEM_sample_4_using_6SampleTranscriptome/RSEM_S4.isoforms.results RSEM_sample_5_using_6SampleTranscriptome/RSEM_S5.isoforms.results RSEM_sample_6_using_6SampleTranscriptome/RSEM_S6.isoforms.results RSEM_sample_7_using_6SampleTranscriptome/RSEM_S7.isoforms.results RSEM_sample_8_using_6SampleTranscriptome/RSEM_S8.isoforms.results RSEM_sample_9_using_6SampleTranscriptome/RSEM_S9.isoforms.results RSEM_sample_10_using_6SampleTranscriptome/RSEM_S10.isoforms.results

# The R markdown edgeR script is located /Volumes/olivernewman/2023/DESeq2_limmaVoom_EdgeR_&_WGCNA_scripts/DE_EdgeR_script.Rmd
# This script outputs all DEGs (/Volumes/olivernewman/2023/DEG_data/EdgeR_outputs/De_novo_mapping_sigDEGs/EdgeR_de_novo_dge.csv) and all significantly DEGs (/Volumes/olivernewman/2023/DEG_data/EdgeR_outputs/De_novo_mapping_sigDEGs/EdgeR_de_novo_SIG_dge.csv)
# The EdgeR_de_novo_SIG_dge.csv was used as the basis for subsequent analyses/annotations


# Extracting the transcript sequences (fasta) from all the significantly DE transcripts determined by the DE_EdgeR_script.Rmd
seqkit grep -f /Volumes/archive/userdata/student_users/olivernewman/2023/DESeq2_scripts_and_outputs/De_novo_sig_results/TRINITY_sig_up_genes_IDs.txt trinity_de_novo_6_samples/Trinity.fasta -o Trinity_sig_up_gene_seqs.fasta
seqkit grep -f /Volumes/archive/userdata/student_users/olivernewman/2023/DESeq2_scripts_and_outputs/De_novo_sig_results/TRINITY_sig_down_genes_IDs.txt trinity_de_novo_6_samples/Trinity.fasta -o Trinity_sig_down_gene_seqs.fasta

seqkit grep -f TRINITY_sig_up_isoforms.txt trinity_de_novo_6_samples/Trinity.fasta -o Trinity_sig_up_isoform_seqs.fasta
seqkit grep -f TRINITY_sig_down_isoforms.txt trinity_de_novo_6_samples/Trinity.fasta -o Trinity_sig_down_isoform_seqs.fasta


# Using Transdecoder to begin annotation of Trinity.fasta
# This was done in a conda environment with TransDecoder v-5.7.0 installed
TransDecoder.LongOrfs -t Trinity.fasta

  # 'Internal' means an AA string was found without a stop nor start codon
  # 'Complete' means an AA string was found with a stop and start codon
  # 3prime_partial means that the 3prime region of the gene is incomplete
  # 5prime_partial means that the 5prime region of the gene is incomplete
  
  
# Using longOrfs.pep and swissprot to get annotated gene IDs for all transcripts in 6-sample Trinity.fasta
  # The hits with the higher bitscore is the hit that you should select
  # Output is a tab-delim table, where colums are: qseqid, sseqid, pident, length, mismatch, gapopen, qstart, qend, sstart, send, evalue, bitscore; which is equivalent to the keyword 'std'
blastp -query swissprot/longest_orfs.pep -db swissprot/swissprot -out tinity_BLAST.out -outfmt 6 -max_target_seqs 5 -num_threads 20 -evalue 1e-5


# Then I converted the longOrfs gff3 file to a gtf file using gffread

================= + ================= + ================= + ================= + ================= + =================

# The nt database from NCBI was downloaded using the following command whilst in a new directory called 'nt'
perl /PATH/TO/update_blastdb.pl --decompress nt

# I then had to add the following line to /olivernewman/.bashrc
export BLASTDB=/Volumes/archive/userdata/student_users/olivernewman/2023/nt

# Blastn search using significantly DEGs seen when using de novo mapping and edgeR
blastn -query /Volumes/archive/userdata/student_users/olivernewman/2023/Ordering_DEGS_trinity_fasta/sig_DE_all_edgeR_trinty_ordered.fasta -db nt -outfmt "6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore salltitles" -max_target_seqs 1 -num_threads 20 -evalue 1e-5 -out /Volumes/archive/userdata/student_users/olivernewman/2023/blast_searches/De_novo_edgeR_sig_DEGs.txt

# The output of the blast search had some qseqids that were written more than once due to different alignments
# I filtered out these duplicates by only taking the hit with the highest bitscore

#!/usr/bin/env bash

awk 'BEGIN { FS=OFS="\t"; prev_id=""; max_score=0; } { 
        if ($1 != prev_id) { 
            if (prev_id != "") { 
                print max_line; 
            } 
            prev_id = $1; 
            max_score = $12; 
            max_line = $0; 
        } else { 
            if ($12 > max_score) { 
                max_score = $12; 
                max_line = $0; 
            } 
        } 
    } END { 
        print max_line; 
    }' De_novo_edgeR_sig_DEGs.txt > output_file_filtered.tsv
    
# Then the the first column of output_file_filtered.tsv was taken and written to a new text file
cat output_file_filtered.tsv | awk '{print $1}' > IDs_filtered_no_NAs.txt

# The same was done for the original DEG data table (which was ordered in decending log2FCs)
cat EdgeR_de_novo_SIG_dge_ordered.xls | awk '{print $1}' > qseqid_list_full.txt

# Then these two files were compared, to find which qseqids (that are differentially expressed) didn't have a significant blastn hit
diff IDs_filtered_no_NAs.txt qseqid_list_full.txt | grep '>' | awk '{print $2}' > qseqids_without_hits.txt

# Then I manually searched the EdgeR_de_novo_SIG_dge_ordered.xls (using the qseqids_without_hits.txt list) to find the positions 
# of where the trinity IDs (that had no blastn hits) went, and then added each one into the filtered blastn output file (output_file_filtered.tsv) 
# in the correct positions, so that column 1 of both EdgeR_de_novo_SIG_dge_ordered.xls and output_file_filtered.tsv matched completely

# This command is used to remove line-breaks from fasta files once they have been through the 'Ordering.sh' script
sed ':a; $!N; /^>/!s/\n\([^>]\)/\1/; ta; P; D' Trinity_EdgeR_sig_DEG_noBlastHits_ordered.fasta > Trinity_EdgeR_sig_DEG_noBlastHits_ordered_NoLineBreaks.fasta
# This command could also be used:
seqtk seq -l 0 input_file > output_file


# After doing the BLASTn search, Trinity IDs that had hits that were labelled "uncharacterized" were listed in a text file, and then their seqs were extracted from the 6_sample Trinity.fasta
# I made a python script:
______________________________________________________________________________________________________________________________________________________
from Bio import SeqIO

pattern_file = "/Volumes/archive/userdata/student_users/olivernewman/2024/searching_for_de_novo_DEG_uncharacterized/all_2023_DEG_de_novo_edgeR_blastn_uncharacterised_hits.txt"
fasta_file = "/Volumes/scratch/brownfieldlab/Ollie/2023_scratch/trinity_de_novo_6_samples/Trinity.fasta"
output_file = "extracted_sequences.fasta"

# Read the pattern file and extract sequence IDs in the specified order
with open(pattern_file, 'r') as f:
    pattern = [line.strip() for line in f]

# Read the fasta file and store sequences in a dictionary
sequences_dict = {}
for record in SeqIO.parse(fasta_file, "fasta"):
    sequences_dict[record.id] = record

# Extract sequences from the dictionary based on the pattern
sequences = [sequences_dict[seq_id] for seq_id in pattern if seq_id in sequences_dict]

# Write the extracted sequences to the output file
SeqIO.write(sequences, output_file, "fasta")
______________________________________________________________________________________________________________________________________________________
# This made sure the order of the extracted sequences were the same as what I specified in the pattern .txt file
# Then I did a seperate BLASTn search of this multifasta