-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathNanopore
253 lines (187 loc) · 22.7 KB
/
Nanopore
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
# First I need to guppy basecall the following fast5 directories;
# oli_green_171023/oli_green_171023_test_1/20231017_1322_P2S-00650-B_PAS21359_60ba7eef/fast5_pass
# oli_purple_011123_test2_2/oli_purple_011123_test2_2_run2/20231101_1044_X2_FAX03109_746b7021/fast5_pass
# oli_purple_301023/oli_purple_301023_test2/20231030_1245_X2_FAX03109_49331880/fast5_pass
## Green
# use --config dna_r10.4.1_e8.2_400bps_sup.cfg
## Purple
# use --config dna_r9.4.1_450bps_sup_plant.cfg
# Code in basecalling bash script (below it is setup for the green fast5 data):
____________________________________________________________________________________________________________________
#!/usr/bin/env bash
# stop script if there is an error
set -ueo pipefail
# clear screen
clear
# Specify absolute path to folder containing fast5_files
InFolder=$(echo "/Volumes/archive/brownfieldlab/Nanopore/white_clover/green/oli_green_171023/oli_green_171023_test_1/20231017_1322_P2S-00650-B_PAS21359_60ba7eef/fast5_pass")
# specify output folder where you want the new fastq files to be stored
out=$(echo "/Volumes/archive/brownfieldlab/Nanopore/white_clover/green/oli_green_171023_SUP_fast5_pass")
# Set path to configuration file to use
# it is generally in the bin/ont-guppy/data folder
supconf=$(echo "/Volumes/archive/brownfieldlab/Nanopore/Guppy/ont-guppy/data/dna_r10.4.1_e8.2_400bps_sup.cfg")
# run guppy using super accuracy basecalling model
# Rebasecalling with guppy 6.5.7+ca6d6af
/Volumes/archive/brownfieldlab/Nanopore/Guppy/ont-guppy/bin/guppy_basecaller -i ${InFolder} -s ${out} -c ${supconf} -x 'cuda:0' --num_callers 8 --trim_adapters --compress_fastq-bash-4.2$
____________________________________________________________________________________________________________________
# Calling the basecalling script
bash ./Scripts/Ayos_code_for_guppyBasecalling.sh
# Then I need to concatenate the two purple fastq datasets
# Just move them all into the same directory, and then concatentate all the fastqs using;
rsync oli_purple_011123_test2_2_SUP_fast5_pass/pass/*.fastq.gz all_purple_SUP_fastqs/
rsync oli_purple_301023_SUP_fast5_pass/pass/*.fastq.gz all_purple_SUP_fastqs/
cat ./all_purple_SUP_fastqs/fastq_runid_*.fastq.gz > all_purple_SUP_called_passed_merged.fastq.gz
# I also needed to concatenate the green SUP fastqs
cat ../oli_green_171023_SUP_fast5_pass/pass/fastq_runid_1f8855ccb02e4c3c344446bb5b06cf95f91d308c_*.fastq.gz > all_green_SUP_called_passed_merged.fastq.gz
# Then I need to filter the new SUP_passed_fastqs
gunzip -c all_purple_SUP_called_passed_merged.fastq.gz | NanoFilt -q 10 -l 1000 --headcrop 50 | gzip > filtered_min1kb_min10qScore_all_SUP_passed_purple.fastq.gz
gunzip -c all_green_SUP_called_passed_merged.fastq.gz | NanoFilt -q 10 -l 1000 --headcrop 50 | gzip > filtered_min1kb_min10qScore_all_SUP_passed_green.fastq.gz
# Then I need to align these new fastqs to the fastq files to HiFi hap1 genome assembly using Vulcan
# This will give 2 bam files (one for green and one for purple)
vulcan -r /Volumes/archive/userdata/student_users/olivernewman/2023/Reference_genomes/Trifolium_repens/UTM_Trep_v1.0_hap1/ncbi_dataset/data/GCA_030914245.1/GCA_030914245.1_UTM_Trep_v1.0_hap1_genomic.fna -i filtered_min1kb_min10qScore_all_SUP_passed_purple.fastq.gz -w ./SUP_vulcan_purple/ -o vulcan_purple_SUP_against_UTM_Trepv1hap1 -t 64 -ont
samtools index vulcan_purple_SUP_against_UTM_Trepv1hap1_90.bam
vulcan -r /Volumes/archive/userdata/student_users/olivernewman/2023/Reference_genomes/Trifolium_repens/UTM_Trep_v1.0_hap1/ncbi_dataset/data/GCA_030914245.1/GCA_030914245.1_UTM_Trep_v1.0_hap1_genomic.fna -i filtered_min1kb_min10qScore_all_SUP_passed_green.fastq.gz -w ./SUP_vulcan_green/ -o vulcan_green_SUP_against_UTM_Trepv1hap1 -t 64 -ont
samtools index vulcan_green_SUP_against_UTM_Trepv1hap1_90.bam
# This is for the green vulcan run:
[M::mm_idx_stat::35.712*2.33] distinct minimizers: 41771855 (46.78% are singletons); average occurrences: 4.497; average spacing: 5.314; total length: 998247995
Done (346417 reads mapped (90.25%), 37424 reads not mapped, 1515154 lines written)(elapsed: 160m, 36 r/s)
# At this point you can load the .bam files (separetly) into IGV with the hap1 ref. seq and inspect the chr8 TrTo ROI (~CM061893.1:45,047,877-45,054,980)
# Loading both bam files will probably crash your computer; therefore if you assemble each into contigs first beforehand, this should be less computationally intensive for IGV
# Then I need to do structural variant calling on these 2 bam files (will give two vcf files (each with only one sample column), and then I can load both of these into IGV)
# Then view them in IGV with the HiFi ref hap1 genome
# First I indexed the UTM_Trep_v1.0_hap1 reference genome
hisat2-build -p 16 ./GCA_030914245.1_UTM_Trep_v1.0_hap1_genomic.fna UTM_Trep_v1_hap1_trans
# Then I had to map each fwd and rv read to the UTM_Trep_v1_hap1 assembly
# This was done once for each of the 10 Paired and trimmed RNA-Seq fastqs
# During these there was generally a ~92-96% alignment rate (which is okay)
hisat2 -p 8 --dta -x ./indexed_UTM_Trep_v1_hap1/UTM_Trep_v1_hap1_trans -1 ./Paired_and_Unpaired_trimmed/Paired_R1s/Sample_1_R1_Paired_trimmed.fastq -2 ./Paired_and_Unpaired_trimmed/Paired_R2s/Sample_1_R2_Paired_trimmed.fastq -S ./mapped/Sample_1.sam
# Then each .sam file was converted to a .bam file
samtools sort -@ 20 -o Sample_1.bam Sample_1.sam
# Then these .bam files were used by StringTie to assemble transcripts (output .gtf files)
stringtie ./bams/Sample_1.bam -l Sample_1 -p 20 -o ./assemblies/sample_1.gtf
# Then these 10 .gtf files were 'merged' into a single .gtf
stringtie --merge -p 20 -o stringtie_UTM_Trep_v1_hapd1_merged.gtf mergelist.txt
# The .bam files were indexed so they could be viewed in IGV:
# Bash script:
SAMPLES="Sample_1 Sample_2 Sample_3 Sample_4 Sample_5 Sample_6 Sample_7 Sample_8 Sample_9 Sample_10"
for SAMPLE in $SAMPLES; do
samtools index ${SAMPLE}.bam ${SAMPLE}.bam.bai
done
# Then I decided to map contig_29575 from the green preliminary assembly (using fast-called reads) to the UTM_Trep_v1.0_hap1 reference genome
seqkit grep -f ../mapping_contig_29575_to_UTM_Trep_v1_hap1/contig_to_extract.txt ./Flye_first_assembly/assembly1/assembly.fasta -o ../mapping_contig_29575_to_UTM_Trep_v1_hap1/contig_29575_from_fastcalled_green_assembly.fasta
vulcan -r /Volumes/archive/userdata/student_users/olivernewman/2023/Reference_genomes/Trifolium_repens/UTM_Trep_v1.0_hap1/ncbi_dataset/data/GCA_030914245.1/GCA_030914245.1_UTM_Trep_v1.0_hap1_genomic.fna -i contig_29575_from_fastcalled_green_assembly.fasta -w ./Fast_vulcan_contig_29575_green_to_UTM_hap1/ -o contig_29575_green_to_UTM_hap1 -t 64 -ont
samtools index contig_29575_green_to_UTM_hap1_90.bam
# This alignment led me to the homologous region to the TrRED LEAF (Rl) region on TrTp subgenome (CM061894.1:19817092)
# Thus I needed to find the correct contig that aligns better to the TrTo chr8 ROI
# To do this I needed to cut out a region from the TrTo chr8 ROI and map this to the assembly fasta
vulcan -r /Volumes/archive/userdata/student_users/olivernewman/2023/Reference_genomes/Trifolium_repens/UTM_Trep_v1.0_hap1/ncbi_dataset/data/GCA_030914245.1/GCA_030914245.1_UTM_Trep_v1.0_hap1_genomic.fna -i contig_29575_from_fastcalled_green_assembly.fasta -w ./Fast_vulcan_contig_29575_green_to_UTM_hap1/ -o contig_29575_green_to_UTM_hap1 -t 64 -ont
# The main hit from this alignment was contig_15937
# Thus I need to extract this contig from the prelim green assembly and then map it to UTM_Trep_v1.0_hap1
seqkit grep -f ./contig_15937_to_extract_from_assembly.txt ../green/Flye_first_assembly/assembly1/assembly.fasta -o ./mapping_contig_15937_from_prelim_assembly.fasta
vulcan -r ~/2023/aligning_RNA_seq_to_UTM_Trep_hap1/indexed_UTM_Trep_v1_hap1/GCA_030914245.1_UTM_Trep_v1.0_hap1_genomic.fna -i mapping_contig_15937_from_prelim_assembly.fasta -w ./mapping_contig_15937_from_prelim_assembly_to_UTM_Trep_v1_hap1/ -o output_mapping_contig_15937_from_prelim_assembly_to_UTM_Trep_v1_hap1 -t 64 -ont
# When I viewed this aligment in IGV, I found that contig_15937 had a gap from 45017416 to 45080182
# This equates to a 62766 bp deletion in TrTo ch8
# Note however that because there are two subgenomes, assembled contigs cannot be used for characterizing indels, because the assembler doesn't know which reads come from which subgenome (although --keep-haplotypes was used, which may account for this issue)
# Thus I need to carry out structural variant calling, and need to just use primary alignments of reads with high MAPQ scores when determining the size of the indel
sniffles -i /Volumes/archive/brownfieldlab/Nanopore/white_clover/green/all_green_SUP_called_merged/SUP_vulcan_green/vulcan_green_SUP_against_UTM_Trepv1hap1_90.bam -v ./green_against_UTM_hap1.vcf --reference /Volumes/archive/userdata/student_users/olivernewman/2023/Reference_genomes/Trifolium_repens/UTM_Trep_v1.0_hap1/ncbi_dataset/data/GCA_030914245.1/GCA_030914245.1_UTM_Trep_v1.0_hap1_genomic.fna -t 64
sniffles -i /Volumes/archive/brownfieldlab/Nanopore/white_clover/purple/all_purple_SUP_called_merged/SUP_vulcan_purple/vulcan_purple_SUP_against_UTM_Trepv1hap1_90.bam -v ./purple_against_UTM_hap1.vcf --reference /Volumes/archive/userdata/student_users/olivernewman/2023/Reference_genomes/Trifolium_repens/UTM_Trep_v1.0_hap1/ncbi_dataset/data/GCA_030914245.1/GCA_030914245.1_UTM_Trep_v1.0_hap1_genomic.fna -t 64
# The SVs didn't look very accurate: the deleted region was characterized as 'reference'
# I should look into using cuteSV, and re-assembling the green contigs
# I need to redo the green assembly using the SUP called reads and few new parameters
flye --nano-hq ../all_green_SUP_called_merged/filtered_min1kb_min10qScore_all_SUP_passed_green.fastq.gz -o assembly2 -g 1g -t 64 -i 5 --read-error 0.03 --meta --keep-haplotypes
Total length: 1538734569
Fragments: 46165
Fragments N50: 113564
Largest frg: 4383983
Scaffolds: 0
Mean coverage: 25
# Then search the new assembly for the contig holding the TrRED LEAF deletion allele
vulcan -r /Volumes/archive/brownfieldlab/Nanopore/white_clover/green/Flye_second_assembly_w_SUP_reads/assembly2/assembly.fasta -i /Volumes/archive/brownfieldlab/Nanopore/white_clover/mapping_chr8_ROI_downstream_of_TrRED_UTM_Trep_v1_hap1_to_green_fastcalled_prelim_assembly/mapping_contig_15937_from_prelim_assembly.fasta -w ./mapping_contig_15937_from_prelim_assembly_to_SUP_green_assembly/ -o output_mapping_contig_15937_from_prelim_assembly_to_SUP_green_assembly -t 64 -ont
# contig_7790 was a match
# I then extracted it from the new assembly
seqkit grep -f ./contig_7790 _name.txt ../Flye_second_assembly_w_SUP_reads/assembly2/assembly.fasta -o ./contig_7790_from_SUP_green_assembly.fasta
# Then mapped it to UTM_hap1 reference genome
vulcan -r ~/2023/aligning_RNA_seq_to_UTM_Trep_hap1/indexed_UTM_Trep_v1_hap1/GCA_030914245.1_UTM_Trep_v1.0_hap1_genomic.fna -i /Volumes/archive/brownfieldlab/Nanopore/white_clover/finding_TrRED_contig_in_SUP_green_assembly/contig_7790_from_SUP_green_assembly.fasta -w ./mapping_contig_7790_from_SUP_green_assembly_to_UTM_hap1/ -o output_mapping_contig_7790_from_SUP_green_assembly_to_UTM_hap1 -t 64 -ont
# Turned out that contig_7790 is actually far smaller than contig_15937, indicating that new assembly using the SUP reads, --nano-hq, --read-error 0.03 and --meta parameters is actually worse than the preliminary
# green assembly using the Fastcalled reads... I'm unsure why this is the case
# cuteSV trial
# Look into whether of not ploidy will be taken into account with cuteSV
cuteSV ../../green/all_green_SUP_called_merged/SUP_vulcan_green/vulcan_green_SUP_against_UTM_Trepv1hap1_90.bam ~/2023/Reference_genomes/Trifolium_repens/UTM_Trep_v1.0_hap1/ncbi_dataset/data/GCA_030914245.1/GCA_030914245.1_UTM_Trep_v1.0_hap1_genomic.fna output_cuteSV_green_against_UTM_hap1 ./ --threads 64 --max_cluster_bias_INS 100 --diff_ratio_merging_INS 0.3 --max_cluster_bias_DEL 100 --diff_ratio_merging_DEL 0.3samtools sort vulcan_green_SUP_against_UTM_Trepv1hap1_90.bam > vulcan_green_SUP_against_UTM_Trepv1hap1_90_sorted.bam
cuteSV ../../purple/all_purple_SUP_called_merged/SUP_vulcan_purple/vulcan_purple_SUP_against_UTM_Trepv1hap1_90.bam ~/2023/Reference_genomes/Trifolium_repens/UTM_Trep_v1.0_hap1/ncbi_dataset/data/GCA_030914245.1/GCA_030914245.1_UTM_Trep_v1.0_hap1_genomic.fna output_cuteSV_purple_against_UTM_hap1 ./ --threads 64 --max_cluster_bias_INS 100 --diff_ratio_merging_INS 0.3 --max_cluster_bias_DEL 100 --diff_ratio_merging_DEL 0.3
# Note that the .bam file doesn't need to be sorted
# After running this and loading the vcf files in with the other alignment files, it was found that cuteSV was able to characterise a
# breakend point at CM061893.1:45,017,417. Interestingly, there was not MATE ID for this breakend, indicating an arbitrary rearrangement event with undifined adjacencies
# I then opened the vcf file with 'less' and searched for the breakend ID's (cuteSV.BND.16041 and cuteSV.BND.16042) and found the following information:
CHROM POS ID REF(s) ALT QUAL FILTER INFO
CM061893.1 45017417 cuteSV.BND.16041 c N]CM061894.1:19851877] . PASS PRECISE;SVTYPE=BND;RE=7;RNAMES=NULL GT:DR:DV:PL:GQ ./.:.:7:.,.,.:.
CM061893.1 45017417 cuteSV.BND.16042 c N]CM061894.1:19854780] . PASS PRECISE;SVTYPE=BND;RE=6;RNAMES=NULL GT:DR:DV:PL:GQ ./.:.:6:.,.,.:.
# This means that the softclipped regions beginning at CM061893.1:45017417 (and spanning to the right) map better to the Tpal chr8 region than to the Tocc chr8 region
# This means that TrTo chr8 likely contains a segment of TrTp chr8 in green F2 plants
# This region must've been inverted prior to or during the crossover event
# Note that the TrTp region now in TrTo chr8 is still present in TrTp chr8; suggesting a potential duplication event
# When comparing the promoter region of TrRED LEAF across UTM_hap1, purple F2s and the 'Crau' ref (TrRv5), its clear that there is more homology between 'Crau' and purple F2s
# Therefore I decided to map my longreads to TrRv5 ref genome
vulcan -r /Volumes/archive/userdata/student_users/olivernewman/2023/Reference_genomes/Trifolium_repens/Trifolium_repens/TrR.v5.fasta -i filtered_min1kb_min10qScore_all_SUP_passed_green.fastq.gz -w ./SUP_vulcan_green_TrRv5/ -o vulcan_green_SUP_against_TrRv5 -t 64 -ont
vulcan -r /Volumes/archive/userdata/student_users/olivernewman/2023/Reference_genomes/Trifolium_repens/Trifolium_repens/TrR.v5.fasta -i filtered_min1kb_min10qScore_all_SUP_passed_purple.fastq.gz -w ./SUP_vulcan_purple_TrRv5/ -o vulcan_purple_SUP_against_TrRv5 -t 64 -ont
# I then mapped my RNA-Seq data to UTM_hap2 (TrRED LEAF absent) and compared this to the mapping to UTM_hap1 (TrRED LEAF present) to look to see if there were co-segregating SNPs in both circumstances
# I then mapped green prelim assembly contig_15937 to the Wang et al (2023) reference genome using vulcan
# This was done to which genotype this assembly corresponds to the most out of the other reference genomes than my two F2 long-read datasets
# It is most similar to TrRv5 in terms of orientation of the ROI
# Then I transformed filtered SUP called purple reads into a blastn database and searched it using TrRED LEAF
# Because there are only three reads that map to it (when viewing .bam alignments in IGV), I should see only three reads in the blastn output if there is only a single R locus MYB gene in the purple genotype
seqtk seq -a filtered_min1kb_min10qScore_all_SUP_passed_purple.fastq.gz > filtered_min1kb_min10qScore_all_SUP_passed_purple.fasta
makeblastdb -in filtered_min1kb_min10qScore_all_SUP_passed_purple.fasta -dbtype nucl -out blastdb_filtered_SUP_called_purple
mkdir db_blastdb_filtered_SUP_called_purple
mv blastdb_filtered_SUP_called_purple.* ./db_blastdb_filtered_SUP_called_purple/
blastn -query Tr_RED_LEAF_complete_cds.fasta -db /Volumes/archive/brownfieldlab/Nanopore/white_clover/purple/all_purple_SUP_called_merged/db_blastdb_filtered_SUP_called_purple/blastdb_filtered_SUP_called_purple -outfmt "6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore salltitles" -num_threads 64 -out blastn_TrREDLEAF_cds.txt
# There were a total of 28 high-confidence hits. Because there were only ~4 reads mapping to the TrRED LEAF gene via UTM_hap1 mapping, this suggested that TrRED LEAF has 1 or more copies in a different region of the genome
# This result prompted me to generate a purple flye assembly using the filtered SUP basecalled purple longreads, and then search this assembly for the TrRED LEAF cds
flye --nano-raw ../filtered_min1kb_min10qScore_all_SUP_passed_purple.fastq.gz -o assembly1_purple -g 1g -t 64 -i 4 --keep-haplotypes
blat ./flye_assembly_purple/assembly1_purple/assembly.fasta Tr_RED_LEAF_complete_cds.fasta Blat_output_TrREDLEAF_against_purple_SUP_flye_assembly.txt
# This blat search however stongly suggested there is just a single TrRED LEAF MYB gene in the purple genotype
# Then I found that within the purple /assembly1_purple/assembly.fasta, TrRED LEAF was held within edge_4556
# I then extracted purple edge_4556 and green first assembly contig_15937 and put them into seperate fasta files
# I then annotated these fasta files to see differences in structure. In theory, because the nanopore-sequenced purple F2 is heterozygous for the
# presence allele of TrRED LEAF, purple edge_4557 should be 'identical' to contig_15937
# To check this I blast-aligned contig_15937 (311520 bp) to edge_4557 (119411 bp)
# This gave 73% Query-cover, 98.23% Percentage identity and an Evalue of 0, indicating high similarity, yet definitly still lots of variation present
# In the T. occindentale reference genome, the region between ND4 and ADP-ribo is only ~11 kb, whereas in my purple plants (edge_4556) it is ~44 kb.
# Interestingly, the distance between ND4 and ADP-ribo in green contig_15937 is almost identical to the respective distance observed in the T. occindentale reference genome (~11 kb)
# This suggests that perhaps TrRED LEAF (Rl) didn't originate from this region in T. occindentale, or that the this region is highly variable even in the T. occindentale progenitor genomes
# When annotating purple edge_4556, there was a large region (~32 kb) between NUCLEAR DEFECTIVE 4-like and the beginning of TrRED LEAF
# Therefore I decided to inspect this region for transposable elements using Inpactor2 software
Inpactor2/Inpactor2.py -f ../32kb_region_inbetween_ND4_and_TrREDLEAF.fasta -o inpactor2_workingDir/ -t 32 -c no
# This showed that there was a large 11.5 kb SIRE LTR in between ND4 and the beginning of TrRED LEAF in my purple F2 plants
# This LTR is not present in UTM_hap1 ROI, meaning it could be unique my my purple F2 plants
# It was also not present in green contig_15937
# It was however present in the same genomic location in the GFL-007 ecotype Florida chromosome 8 reference sequence
# Note that there was no LTRs detected inbetween TrRED LEAF and ADP-ribo in edge_4556
# Then I decided to redo the green F2 Flye assembly using the SUP reads and the same parameters as the purple Flye assembly:
mkdir Flye_third_assembly_w_SUP_reads
flye --nano-raw ./all_green_SUP_called_merged/filtered_min1kb_min10qScore_all_SUP_passed_green.fastq.gz -o ./Flye_third_assembly_w_SUP_reads/assembly3_green -g 1g -t 64 -i 4 --keep-haplotypes
# Then, to try and figure out whether the nanopore sequennced purple plant is definitly het for TrRED LEAF, I needed to try find at least 3 different contigs containing the stable flanking genes; ND4 and ADP-ribosylation factor
blat ../flye_assembly_purple/assembly1_purple/assembly.fasta NUCLEAR_DEFECTIVE_4_like_MSTRG47455.fasta Blat_output_ND4_against_purple_SUP_flye_assembly.txt
blat ../flye_assembly_purple/assembly1_purple/assembly.fasta ADP_ribosyl.fasta Blat_output_ADP_ribo_against_purple_SUP_flye_assembly.txt
# I just focussed on the ADP_ribosylation factor as a stable flanking gene as it only had one copy in each subgenome
# There were two homeologues present, each with two alleles, giving a total of four alleles (and thus four separate flye-assembled edges)
# These the genic region containing the ADP gene in each of these edges was cut out and a pairwize alignment was made from all four of these
# The alignment shows that the two putatively T. occ subgenome edges are similar to each other and distinct from the two from the putatively T. pal subgenome
# and similarly the two putatively T. pal subgenome edges are similar to each other and distinct from the two from the putatively T. occ subgenome
# This confirms that TrRED LEAF is heterozygous in the plant used to make the purple Flye assembly
# Then I wanted to see if there were transposons were TrRED LEAF etc should be in the green Flye assembly
# I used ADP ribosylation factor again to find the edges (within the third green Flye assembly that used SUP reads) that contain this genic region of interest
# Then I imported these edges into genious prime and used Blast to search for the flanking genes (like ADP and ND4)
blat ../Flye_third_assembly_w_SUP_reads/assembly3_green/assembly.fasta ADP_ribosyl.fasta Blat_output_ADP_ribo_against_green_SUP_flye_assembly.txt
# This gave four full high confidence hits likely containing a full ADP ribosylation factor gene; which makes sense in that each homeologue has two versions (giving four in total)
# A pairwise alignment from the ADP ribosylation factor sequence extracted from all four of these edges (not the contigs, but edges) shows that edge_14743 and edge_14745 are from the same subgenome
# and edge_25974 and edge_25971 are from the other subgenome
# Then the next step was to use Inpactor2 to search these edges for transposons
conda activate Inpactor2
/Volumes/userdata/student_users/olivernewman/2023/Inpactor2/Inpactor2/Inpactor2.py -f ../Flye_third_assembly_w_SUP_reads/assembly3_green/misc_edges/all_4_green_SUP_edges_containing_ADP_rib.fasta -o ../inpactor2_searches/ -t 32 -c no
# Then I searched the first green Flye assembly (that used fast-called reads) for ADP-ribosylation factor to see if there were cleaner congregations of edges in this assembly
blat ../Flye_first_assembly/assembly1/assembly.fasta ADP_ribosyl.fasta Blat_output_ADP_ribo_against_green_first_flye_fast_assembly.txt
# This again gave two alleles of two homeologues (one putatively from each subgenome)
# One of the putative subgenomes had a string of edges (containing edge_29574 and edge_29575) joined together nicely with no more than 2 alleles per region
# I then annotated and searched these two edges for transposons
/Volumes/userdata/student_users/olivernewman/2023/Inpactor2/Inpactor2/Inpactor2.py -f ./Flye_first_assembly/assembly1/misc_edges/all_edges_surrounding_ADP_rib_in_one_subgenome.fasta -o ./inpactor2_searches/searching_edges_from_first_green_assembly/ -t 32 -c no
# This showed that there was no transposons between ADP ribosylation factor and ND4, and the minimal distance between these two genes suggested this region (containing these edges) was from the T. pal subgenome