# g2f_analysis_revised_publish.R

# ---- One-time dependency installation ----------------------------------
# Package names have to be passed as quoted strings: a bare symbol such as
# install.packages(gridExtra) is evaluated as a variable and fails with
# "object 'gridExtra' not found". Only packages that are not installed yet
# are fetched, so re-running the script does not reinstall everything.
installed_already <- rownames(installed.packages())

# Packages distributed on CRAN. devtools and BiocManager are listed here
# because the two sections below need them.
cran_packages <- c(
  "gridExtra", "ggplot2", "data.table", "emmeans", "multcomp", "stringr",
  "dplyr", "tibble", "tidyr", "devtools", "BiocManager", "car", "lme4",
  "gvlma", "outliers", "lmerTest", "EnvStats", "vegan", "geodist"
)
missing_cran <- setdiff(cran_packages, installed_already)
if (length(missing_cran) > 0) {
  install.packages(missing_cran)
}

# Packages distributed through Bioconductor.
# NOTE(review): Maaslin2 was already installed through BiocManager in the
# original script; phyloseq, genefilter and microbiome are assumed to be
# Bioconductor packages as well - confirm.
bioc_packages <- c("Maaslin2", "phyloseq", "genefilter", "microbiome")
missing_bioc <- setdiff(bioc_packages, installed_already)
if (length(missing_bioc) > 0) {
  BiocManager::install(missing_bioc)
}

# Packages installed from GitHub (package name = repository).
# NOTE(review): the qiime2R and ggpubr repositories come from the original
# script; the lme4qtl, NetCoMi and metagMisc repositories are assumed -
# confirm before publishing.
github_repos <- c(
  qiime2R = "jbisanz/qiime2R",
  ggpubr = "kassambara/ggpubr",
  lme4qtl = "variani/lme4qtl",
  NetCoMi = "stefpeschel/NetCoMi",
  metagMisc = "vmikk/metagMisc"
)
missing_github <- github_repos[!(names(github_repos) %in% installed_already)]
if (length(missing_github) > 0) {
  devtools::install_github(unname(missing_github))
}






library(gridExtra)
library(ggplot2)
library(data.table)
library(emmeans)
library(multcomp)
library(qiime2R)
library(stringr)
library(dplyr)
library(tibble)
library(tidyr)
library("devtools")
#library(microbiome)
#library(car)
library(lme4)
library(lme4qtl)
library(gvlma)
library(outliers)
#install.packages("EnvStats")
library(EnvStats)
library(qiime2R)
library(vegan)
library("genefilter")
library("NetCoMi")
library(ggpubr)
library(metagMisc)
library(phyloseq)
#install.packages("geodist")
library(geodist)


# Set the working directory to the folder that holds the data downloaded
# from GitHub; every relative path below is resolved against it.
setwd("D:/G2F_Rerun/G2F_data/G2F_data-main/")

# Read the ASV table exported after mitochondria and chloroplast filtering.
# skip = 1 drops the first line of the file, so the second line is the header.
otu_table <- read.table(
  "./dada2_table-no-mitochondria-no-chloroplast.txt",
  row.names = 1, check.names = FALSE, sep = "\t", skip = 1, header = TRUE
)

# Total reads per ASV across the first ten sample columns.
# NOTE(review): these ten columns are assumed to be the blank samples -
# confirm against the column order of the exported table.
blank_read_totals <- rowSums(otu_table[, 1:10])

# ASVs seen in the blanks. The threshold is "> 0" so that this set is the
# exact complement of the table kept below; the previous "> 1" left ASVs
# with exactly one blank read in neither set.
blank_taxa <- otu_table[blank_read_totals > 0, ]

# Keep only the ASVs that never occur in the blanks.
otu_table_blank_taxa_filtered <- otu_table[blank_read_totals < 1, ]

# phyloseq expects the count table as a matrix.
otu_table <- as.matrix(otu_table_blank_taxa_filtered)

# Taxonomy table derived from the QIIME 2 artifact. The code below reads the
# Phylum, Class, Order and Family columns, so the file must already be split
# into one column per rank.
taxonomy <- read.csv("./taxonomy.tsv", sep = "\t", row.names = 1)

# Give every " f__uncultured" family a more informative label by appending
# the nearest higher rank that is not itself "uncultured".
revised_taxonomy <- taxonomy

# Work on character vectors so the labels can be compared and pasted.
family_label <- as.character(revised_taxonomy$Family)
order_label <- as.character(revised_taxonomy$Order)
class_label <- as.character(revised_taxonomy$Class)
phylum_label <- as.character(revised_taxonomy$Phylum)

# %in% never returns NA, so a missing rank cannot end up as an NA index in
# the assignment below (an `==` mask containing NA makes that assignment
# fail).
family_uncultured <- family_label %in% " f__uncultured"
order_uncultured <- order_label %in% " o__uncultured"
class_uncultured <- class_label %in% " c__uncultured"

# Rank appended to an uncultured family:
#   order is informative                -> order
#   order uncultured, class informative -> class
#   order and class both uncultured     -> phylum
fallback_label <- ifelse(
  !order_uncultured,
  order_label,
  ifelse(!class_uncultured, class_label, phylum_label)
)
family_label[family_uncultured] <- paste(
  family_label[family_uncultured],
  fallback_label[family_uncultured]
)

revised_taxonomy$Family <- family_label
revised_taxonomy$Order <- order_label
revised_taxonomy$Class <- class_label

# tax_table() needs a character matrix; as.matrix() already turns every
# column into character, so no intermediate factor conversion is required.
revised_taxonomy <- as.matrix(revised_taxonomy)
taxonomy <- revised_taxonomy

# Phylogenetic tree read from the Newick file.
phy_tree <- read_tree("./tree.nwk")

# Sample metadata; the first column of the file becomes the row names.
metadata <- read.table(
  "./G2f_2019_sub_corrected_metadata.tsv",
  sep = "\t", header = TRUE, row.names = 1
)

# Wrap the three tables in their phyloseq classes.
OTU <- otu_table(otu_table, taxa_are_rows = TRUE)
TAX <- tax_table(taxonomy)
META <- sample_data(metadata)

# Assemble the phyloseq object from counts, taxonomy, tree and metadata.
ps <- phyloseq(OTU, TAX, phy_tree, META)

# Show the sample names of the metadata and of the count table.
sample_names(META)
sample_names(OTU)

# Drop sample NCH1-1-159.
ps <- subset_samples(ps, rownames(sample_data(ps)) != "NCH1-1-159")

# Rarefy every sample to 1500 reads without replacement; the fixed seed
# makes the subsampling reproducible.
ps.rarefied <- rarefy_even_depth(
  ps,
  sample.size = 1500, replace = FALSE, rngseed = 1
)

summary(sample_data(ps.rarefied)$Corrected_pedigree)

# Negated %in% operator.
`%!in%` <- Negate(`%in%`)

# Pedigrees retained for the analysis (seed substitution left some samples
# with a wrong pedigree, so only this list is kept).
retained_pedigrees <- c(
  "PHW52/PHM49", "B73/PHM49", "F42/H95", "F42/MO17", "OH43/B37",
  "PHW52/PHN82", "B14A/OH43", "B14A/H95", "B14A/MO17", "LH74/PHN82",
  "PHG39/PHN82", "B73/MO17", "B73/PHN82", "B37/H95", "F42/OH43",
  "B37/MO17", "B37/OH43", "CG119/CG108", "CG44/CGR01", "2369/LH123HT"
)
ps.rarefied_ys_filtered <- subset_samples(
  ps.rarefied,
  Corrected_pedigree %in% retained_pedigrees
)

# Tile plot showing which pedigrees were sampled at which location.
correted_pedigree_heatmap <- ggplot(
  sample_data(ps.rarefied_ys_filtered),
  aes(x = location, y = Corrected_pedigree)
) +
  geom_tile()

correted_pedigree_heatmap

################################################################################################################################

# Alpha diversity of the rarefied, pedigree-filtered samples, one index per
# call so that each result is a single-column table.
alphaObserved <- estimate_richness(
  ps.rarefied_ys_filtered,
  measures = "Observed"
)
alphaSimpson <- estimate_richness(
  ps.rarefied_ys_filtered,
  measures = "Simpson"
)
alphaShannon <- estimate_richness(
  ps.rarefied_ys_filtered,
  measures = "Shannon"
)

# Combine indices and metadata column-wise. The column order is
# Observed, <metadata columns>, Simpson, Shannon.
G2F_metadata_2019 <- cbind(
  alphaObserved,
  sample_data(ps.rarefied_ys_filtered),
  alphaSimpson,
  alphaShannon
)

# Kruskal-Wallis tests: does each index differ between locations?
kruskal.test(Observed ~ location, data = G2F_metadata_2019)
kruskal.test(Simpson ~ location, data = G2F_metadata_2019)
kruskal.test(Shannon ~ location, data = G2F_metadata_2019)

# Kruskal-Wallis tests: does each index differ between pedigrees?
kruskal.test(Observed ~ Corrected_pedigree, data = G2F_metadata_2019)
kruskal.test(Simpson ~ Corrected_pedigree, data = G2F_metadata_2019)
kruskal.test(Shannon ~ Corrected_pedigree, data = G2F_metadata_2019)

###########################################################################

################ plot the alpha diversity

# Theme shared by every alpha-diversity box plot (previously copy-pasted
# into each of the six plots).
alpha_plot_theme <- theme(
  legend.text = element_text(color = "black", size = 20),
  legend.title = element_text(color = "black", size = 20),
  axis.title.x = element_text(size = 20, face = "bold"),
  axis.title.y = element_text(size = 20, face = "bold"),
  plot.title = element_text(size = 20, face = "bold"),
  axis.text.y = element_text(size = 20, face = "bold"),
  axis.text.x = element_text(size = 20, face = "bold"),
  strip.text.x = element_text(size = 20, face = "bold"),
  panel.background = element_blank()
)

# Red box plot of one alpha-diversity index against one metadata column.
#   physeq  phyloseq object to plot
#   x       name of the sample variable on the x axis
#   measure name of the index passed to plot_richness()
plot_alpha_boxplot <- function(physeq, x, measure) {
  plot_richness(physeq, x = x, measures = measure) +
    geom_boxplot(fill = "red") +
    alpha_plot_theme
}

# ---- by location ----
# Each ggsave() names its plot explicitly instead of relying on whichever
# plot ggplot2 happened to build last.
Shannon_location <- plot_alpha_boxplot(
  ps.rarefied_ys_filtered, "location", "Shannon"
)
Shannon_location
ggsave(
  "./shannon_plot_location_rerun.png",
  plot = Shannon_location, height = 10, width = 10, device = "png"
)

Observed_location <- plot_alpha_boxplot(
  ps.rarefied_ys_filtered, "location", "Observed"
)
Observed_location
ggsave(
  "./Observed_plot_location_rerun.png",
  plot = Observed_location, height = 10, width = 10, device = "png"
)

# Displayed only; no file is written for this plot.
Simpson_location <- plot_alpha_boxplot(
  ps.rarefied_ys_filtered, "location", "Simpson"
)
Simpson_location

# ---- by pedigree ----
Shannon_pedigree <- plot_alpha_boxplot(
  ps.rarefied_ys_filtered, "Corrected_pedigree", "Shannon"
)
Shannon_pedigree
ggsave(
  "./shannon_plot_pedigree_rerun.png",
  plot = Shannon_pedigree, height = 10, width = 10, device = "png"
)

Observed_pedigree <- plot_alpha_boxplot(
  ps.rarefied_ys_filtered, "Corrected_pedigree", "Observed"
)
Observed_pedigree
ggsave(
  "./Observed_plot_pedigree_rerun.png",
  plot = Observed_pedigree, height = 10, width = 10, device = "png"
)

# Built but neither displayed nor saved here.
Simpson_pedigree <- plot_alpha_boxplot(
  ps.rarefied_ys_filtered, "Corrected_pedigree", "Simpson"
)

# ---- side-by-side panels (Shannon next to Observed) ----
alpha_diversity_plot_location_pedigree <- ggarrange(
  Shannon_pedigree, Observed_pedigree,
  nrow = 1, ncol = 2
)
alpha_diversity_plot_location_pedigree
ggsave(
  "./alpha_diversity_plot_location_pedigree_rerun.png",
  plot = alpha_diversity_plot_location_pedigree,
  height = 9, width = 12, device = "png"
)

alpha_diversity_plot_location <- ggarrange(
  Shannon_location, Observed_location,
  nrow = 1, ncol = 2
)
alpha_diversity_plot_location
ggsave(
  "./alpha_diversity_plot_location_rerun.png",
  plot = alpha_diversity_plot_location,
  height = 9, width = 12, device = "png"
)

####################################################################################################


##########################################################################

# Flag every element of `value` that occurs more than once.
# Returns a logical vector of the same length as `value`; unlike a plain
# duplicated(), the first occurrence of a repeated element is flagged too.
allDup <- function(value) {
  repeated_after_first <- duplicated(value)
  repeated_before_last <- duplicated(value, fromLast = TRUE)
  repeated_after_first | repeated_before_last
}


# Within each location keep only the samples whose pedigree occurs at least
# twice at that location.
# Locations are processed in order of first appearance and rows keep their
# original order inside a location, because later code removes outliers by
# row position. The per-location pieces are collected in a list and bound
# once, instead of growing a data frame with a hard-coded column count.
locations_in_order <- as.character(unique(G2F_metadata_2019$location))
replicated_by_location <- lapply(locations_in_order, function(loc) {
  at_location <- G2F_metadata_2019[
    which(G2F_metadata_2019$location == loc), ,
    drop = FALSE
  ]
  at_location[allDup(at_location$Corrected_pedigree), , drop = FALSE]
})
G2F_metadata_2019_duplicate_pedigree_ys_filtered <- do.call(
  rbind,
  replicated_by_location
)
if (is.null(G2F_metadata_2019_duplicate_pedigree_ys_filtered)) {
  # No location at all: fall back to an empty table with the same columns.
  G2F_metadata_2019_duplicate_pedigree_ys_filtered <-
    G2F_metadata_2019[0, , drop = FALSE]
}

# Sample counts per location and per pedigree after the replicate filter.
table(G2F_metadata_2019_duplicate_pedigree_ys_filtered$location)
table(G2F_metadata_2019_duplicate_pedigree_ys_filtered$Corrected_pedigree)

# Move the sample names from the row names into a SampleID column.
G2F_metadata_2019_duplicate_pedigree_ys_filtered$SampleID <-
  rownames(G2F_metadata_2019_duplicate_pedigree_ys_filtered)
rownames(G2F_metadata_2019_duplicate_pedigree_ys_filtered) <- NULL

# 1) keep locations with at least 10 samples;
# 2) keep pedigrees that are present in at least 3 different locations.
# The grouping is removed afterwards and the result is a plain data frame.
G2F_metadata_2019_duplicate_pedigree_ys_filtered_selected_location <-
  G2F_metadata_2019_duplicate_pedigree_ys_filtered %>%
  group_by(location) %>%
  filter(n() >= 10) %>%
  group_by(Corrected_pedigree) %>%
  filter(n_distinct(location) >= 3) %>%
  ungroup() %>%
  as.data.frame()

# Sample IDs in this table use "." where the original names use "-";
# restore the "-" so the IDs can be matched later.
G2F_metadata_2019_duplicate_pedigree_ys_filtered_selected_location$SampleID <-
  gsub(
    "\\.", "-",
    G2F_metadata_2019_duplicate_pedigree_ys_filtered_selected_location$SampleID
  )
G2F_metadata_2019_duplicate_pedigree_ys_filtered_selected_location$SampleID

# Both grouping columns are left as character vectors.
G2F_metadata_2019_duplicate_pedigree_ys_filtered_selected_location$location <-
  as.character(
    G2F_metadata_2019_duplicate_pedigree_ys_filtered_selected_location$location
  )

# Merge the reciprocal cross OH43/B37 into B37/OH43. sub() with a leading
# anchor rewrites only a pedigree that starts with "OH43/B37".
G2F_metadata_2019_duplicate_pedigree_ys_filtered_selected_location$Corrected_pedigree <-
  sub(
    "^OH43/B37", "B37/OH43",
    as.character(
      G2F_metadata_2019_duplicate_pedigree_ys_filtered_selected_location$Corrected_pedigree
    )
  )

# Sample counts per location and per pedigree in the final table.
table(G2F_metadata_2019_duplicate_pedigree_ys_filtered_selected_location$location)
table(G2F_metadata_2019_duplicate_pedigree_ys_filtered_selected_location$Corrected_pedigree)

# Tile plot showing which pedigrees remain at which location.
correted_pedigree_heatmap <- ggplot(
  G2F_metadata_2019_duplicate_pedigree_ys_filtered_selected_location,
  aes(x = location, y = Corrected_pedigree)
) +
  geom_tile()

correted_pedigree_heatmap


##################################################################################################################################


# Histograms of the three alpha-diversity indices in the filtered table.
ggplot(
  G2F_metadata_2019_duplicate_pedigree_ys_filtered_selected_location,
  aes(x = Shannon)
) +
  geom_histogram()
ggplot(
  G2F_metadata_2019_duplicate_pedigree_ys_filtered_selected_location,
  aes(x = Observed)
) +
  geom_histogram()
ggplot(
  G2F_metadata_2019_duplicate_pedigree_ys_filtered_selected_location,
  aes(x = Simpson)
) +
  geom_histogram()

# The linear models below treat location and pedigree as factors.
G2F_metadata_2019_duplicate_pedigree_ys_filtered_selected_location[["location"]] <-
  as.factor(
    G2F_metadata_2019_duplicate_pedigree_ys_filtered_selected_location[["location"]]
  )
G2F_metadata_2019_duplicate_pedigree_ys_filtered_selected_location[["Corrected_pedigree"]] <-
  as.factor(
    G2F_metadata_2019_duplicate_pedigree_ys_filtered_selected_location[["Corrected_pedigree"]]
  )

####### analyze the G,E,and GXE on alpha diversity 

# Share of the total sum of squares attributed to each model term.
#
# anova_result: ANOVA table with a `Sum Sq` column whose first four rows are,
#               in this order, environment (location), genotype (pedigree),
#               genotype-by-environment interaction, and residuals. The
#               terms are read by position, so the model formula must list
#               them in that order.
# alpha_name:   label of the alpha-diversity index, copied into the result.
#
# Prints the three fractions and returns a 3-row data frame with columns
# alpha_diversity, test_var ("Environment", "Maize Genotype", "GXE") and
# Value (plain numeric vector of fractions).
heribitility_calculation <- function(anova_result, alpha_name) {
  sum_sq <- anova_result$`Sum Sq`
  # Fail early when the table does not have the four expected rows.
  stopifnot(is.numeric(sum_sq), length(sum_sq) >= 4)

  ssq_location <- sum_sq[1]
  ssq_pedigree <- sum_sq[2]
  ssq_pedigree_location <- sum_sq[3]
  ssq_residue <- sum_sq[4]
  ssq_total <- ssq_residue + ssq_location + ssq_pedigree +
    ssq_pedigree_location

  heritbality_location <- ssq_location / ssq_total
  heritbality_pedigree <- ssq_pedigree / ssq_total
  heritbality_pedigree_location <- ssq_pedigree_location / ssq_total

  print("variation explained by Environment")
  print(heritbality_location)
  print("variation explained by Maize genotype")
  print(heritbality_pedigree)
  print("variation explained by GxE")
  print(heritbality_pedigree_location)

  # Value is a plain numeric column; it used to be a one-column matrix
  # carrying auto-generated dimnames.
  data.frame(
    alpha_diversity = rep(as.character(alpha_name), 3),
    test_var = c("Environment", "Maize Genotype", "GXE"),
    Value = c(
      heritbality_location,
      heritbality_pedigree,
      heritbality_pedigree_location
    ),
    stringsAsFactors = FALSE
  )
}

##################################################################################################################

# Sums of squares for environment, genotype and GxE on Shannon diversity,
# averaged over the two possible orders of the main effects.
#
# anova() on an lm reports sequential sums of squares, so the value for a
# main effect depends on whether it enters the model first or second. The
# model is therefore fitted with both orders and the two values of each
# main effect are averaged.
#
# df: data frame with columns Shannon, location and Corrected_pedigree.
#     Defaults to the filtered metadata table so that a call without
#     arguments still works.
#
# Returns a one-row data frame with columns environment, genotype and GXE.
anova_permutation <- function(
    df = G2F_metadata_2019_duplicate_pedigree_ys_filtered_selected_location) {
  # Genotype first, environment second.
  E2_G1_GXE3_anova_results <- anova(lm(
    formula = Shannon ~ Corrected_pedigree + location +
      location:Corrected_pedigree,
    data = df
  ))
  G_sum_1 <- E2_G1_GXE3_anova_results$`Sum Sq`[1]
  E_sum_2 <- E2_G1_GXE3_anova_results$`Sum Sq`[2]

  # Environment first, genotype second.
  E1_G2_GXE3_anova_results <- anova(lm(
    formula = Shannon ~ location + Corrected_pedigree +
      location:Corrected_pedigree,
    data = df
  ))
  E_sum_1 <- E1_G2_GXE3_anova_results$`Sum Sq`[1]
  G_sum_2 <- E1_G2_GXE3_anova_results$`Sum Sq`[2]
  # The interaction value is taken from the environment-first fit.
  GXE_sum <- E1_G2_GXE3_anova_results$`Sum Sq`[3]

  # mean() needs a single vector: mean(a, b) treats b as the `trim`
  # argument and does not average the two values.
  data.frame(
    environment = mean(c(E_sum_1, E_sum_2)),
    genotype = mean(c(G_sum_1, G_sum_2)),
    GXE = GXE_sum
  )
}

########################################################################################

library(lmerTest)
library(EnvStats)

# Rosner's test (EnvStats::rosnerTest) for up to 10 outliers in Shannon.
test <- rosnerTest(
  G2F_metadata_2019_duplicate_pedigree_ys_filtered_selected_location$Shannon,
  k = 10
)
test

# Table used for the Shannon model. This is a plain copy of the filtered
# metadata: no rows are removed here.
G2F_metadata_2019_shannon <-
  G2F_metadata_2019_duplicate_pedigree_ys_filtered_selected_location

# Distribution of the Shannon index.
ggplot(G2F_metadata_2019_shannon, aes(x = Shannon)) +
  geom_histogram()

summary(G2F_metadata_2019_duplicate_pedigree_ys_filtered_selected_location$location)

# Linear model with location (E), pedigree (G) and their interaction (GxE).
lm_shannon_location_pedigree <- lm(
  Shannon ~ location + Corrected_pedigree + location:Corrected_pedigree,
  data = G2F_metadata_2019_shannon
)
summary(lm_shannon_location_pedigree)

# ANOVA table of the fitted model.
lm_shannon_location_pedigree_anova_results <-
  anova(lm_shannon_location_pedigree)
lm_shannon_location_pedigree_anova_results

# Fraction of the sum of squares per term, kept for the summary bar plot.
lm_shannon_result_list <- heribitility_calculation(
  lm_shannon_location_pedigree_anova_results,
  "shannon"
)
lm_shannon_result_list


##################################################################################


# Rosner's test (EnvStats::rosnerTest) for up to 10 outliers in the number
# of observed taxa.
test <- rosnerTest(
  G2F_metadata_2019_duplicate_pedigree_ys_filtered_selected_location$Observed,
  k = 10
)
test

# Distribution of the observed-taxa counts before outlier removal.
ggplot(
  G2F_metadata_2019_duplicate_pedigree_ys_filtered_selected_location,
  aes(x = Observed)
) +
  geom_histogram()

# Rows dropped as outliers, given as row positions in the filtered table.
# NOTE(review): the positions are hard-coded, so they are only valid while
# the row order of the filtered table stays the same - confirm them against
# the Rosner output above.
observed_outlier_rows <- c(42, 298, 87)
G2F_metadata_2019_observed_features <-
  G2F_metadata_2019_duplicate_pedigree_ys_filtered_selected_location[
    -observed_outlier_rows,
  ]
rownames(G2F_metadata_2019_observed_features) <- NULL

# Distribution of the observed-taxa counts after outlier removal.
ggplot(G2F_metadata_2019_observed_features, aes(x = Observed)) +
  geom_histogram()

# Linear model on log(Observed) with location (E), pedigree (G) and their
# interaction (GxE).
lm_observed_features_location_pedigree <- lm(
  log(Observed) ~ location + Corrected_pedigree + location:Corrected_pedigree,
  data = G2F_metadata_2019_observed_features
)
summary(lm_observed_features_location_pedigree)

# ANOVA table of the fitted model.
lm_observed_features_location_pedigree_anova_results <-
  anova(lm_observed_features_location_pedigree)

# Fraction of the sum of squares per term, kept for the summary bar plot.
lm_observed_features_result_list <- heribitility_calculation(
  lm_observed_features_location_pedigree_anova_results,
  "# of taxa"
)
lm_observed_features_result_list

#lm_observed_features_location_pedigree_anova_results <- anova(lm(formula = log(Observed) ~ Corrected_pedigree + location:Corrected_pedigree + location, data = G2F_metadata_2019_observed_features))


##########################################################################################################

# Distribution of the Simpson index before outlier removal.
ggplot(
  G2F_metadata_2019_duplicate_pedigree_ys_filtered_selected_location,
  aes(x = Simpson)
) +
  geom_histogram()

# Rosner's test (EnvStats::rosnerTest) for up to 10 outliers in Simpson.
test <- rosnerTest(
  G2F_metadata_2019_duplicate_pedigree_ys_filtered_selected_location$Simpson,
  k = 10
)
test

# Rows dropped as outliers, given as row positions in the filtered table.
# NOTE(review): the positions are hard-coded, so they are only valid while
# the row order of the filtered table stays the same - confirm them against
# the Rosner output above.
simpson_outlier_rows <- c(131, 19, 219, 217, 234, 342)
G2F_metadata_2019_simpson <-
  G2F_metadata_2019_duplicate_pedigree_ys_filtered_selected_location[
    -simpson_outlier_rows,
  ]

# Distribution of log(Simpson) after outlier removal. Only this histogram
# is on the log scale; the model below uses the untransformed index.
ggplot(G2F_metadata_2019_simpson, aes(x = log(Simpson))) +
  geom_histogram()

# Print the Simpson values of the unfiltered table.
G2F_metadata_2019_duplicate_pedigree_ys_filtered_selected_location$Simpson

# Linear model with location (E), pedigree (G) and their interaction (GxE).
lm_simpson_location_pedigree <- lm(
  Simpson ~ location + Corrected_pedigree + location:Corrected_pedigree,
  data = G2F_metadata_2019_simpson
)
summary(lm_simpson_location_pedigree)

# ANOVA table of the fitted model.
lm_simpson_location_pedigree_anova_results <-
  anova(lm_simpson_location_pedigree)

# Fraction of the sum of squares per term, kept for the summary bar plot.
lm_simpson_list <- heribitility_calculation(
  lm_simpson_location_pedigree_anova_results,
  "simpson"
)
lm_simpson_list


##########################################################################################################

# Stack the per-index results (observed taxa, Shannon, Simpson) into one
# table.
alpha_result <- dplyr::bind_rows(
  lm_observed_features_result_list,
  lm_shannon_result_list,
  lm_simpson_list
)
alpha_result

alpha_result$category <- " Alpha Diversity"

# Stacked bar chart: one bar per index, one segment per model term.
# All theme settings live in a single theme() call: the legend title is
# blank (the earlier styled legend title was overridden anyway, so it and
# the fill label are dropped), and no trailing empty arguments remain.
alpha_variance_plot <- ggplot(
  data = alpha_result,
  aes(x = alpha_diversity, y = Value, fill = test_var)
) +
  geom_bar(position = "stack", stat = "identity") +
  ylab("variance explained") +
  scale_y_continuous(
    breaks = c(0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8),
    limits = c(0, 0.8)
  ) +
  theme(
    axis.title = element_text(size = 20, face = "bold", colour = "grey20"),
    axis.title.x = element_blank(),
    axis.title.y = element_text(size = 30, face = "bold"),
    axis.text.x = element_text(size = 30, face = "bold"),
    axis.text.y = element_text(size = 30, face = "bold"),
    plot.title = element_text(size = 30, face = "bold"),
    strip.text.x = element_text(size = 30, face = "bold"),
    legend.text = element_text(color = "black", size = 30),
    legend.title = element_blank(),
    legend.position = "bottom",
    panel.background = element_blank()
  )
alpha_variance_plot

# Write the chart to disk, naming the plot explicitly instead of relying on
# whichever plot ggplot2 built last.
ggsave(
  "./alpha_diversity_GXE_2.png",
  plot = alpha_variance_plot, height = 10, width = 12, device = "png"
)


###############################################################################################


# dbRDA of the first two PCoA axes against location, pedigree and their
# interaction, followed by the share of variation explained by each term.
#
# input_qza:        PCoA result as returned by read_qza(); its
#                   $data$Vectors table must have SampleID, PC1 and PC2.
# input_metadata:   data frame with a SampleID column plus the location and
#                   Corrected_pedigree columns.
# enviroment_start,
# enviroment_end:   first and last column of input_metadata handed to
#                   dbrda() as explanatory data.
# permutations:     number of permutations for the by-term test.
#                   NOTE(review): the previous call passed `permu=100`,
#                   which does not appear to match vegan's `permutations`
#                   argument (it sits after `...`), so the package default
#                   was presumably used - confirm against the installed
#                   vegan version.
#
# Prints the dbRDA fit and the by-term permutation table, then returns the
# result of heribitility_calculation_beta().
beta_heritibility_cal <- function(input_qza, input_metadata, enviroment_start,
                                  enviroment_end, permutations = 100) {
  # Sample coordinates on the first two principal coordinates.
  ordination <- input_qza$data$Vectors[, c("SampleID", "PC1", "PC2")]

  # Align coordinates to the metadata row by row. dbrda() pairs response
  # and explanatory rows by position, so filtering alone is not enough.
  ordination_row <- match(input_metadata$SampleID, ordination$SampleID)
  has_coordinates <- !is.na(ordination_row)
  if (!all(has_coordinates)) {
    warning(
      sum(!has_coordinates),
      " metadata sample(s) have no PCoA coordinates and are dropped",
      call. = FALSE
    )
  }

  distance_matrix <- as.data.frame(
    ordination[ordination_row[has_coordinates], c("PC1", "PC2")]
  )
  rownames(distance_matrix) <- input_metadata$SampleID[has_coordinates]

  # Explanatory variables for the same samples, in the same order.
  explanatory <- input_metadata[
    has_coordinates, enviroment_start:enviroment_end,
    drop = FALSE
  ]

  # dbRDA with location, pedigree and GxE.
  dbRDA_result <- dbrda(
    distance_matrix ~ location + Corrected_pedigree +
      location:Corrected_pedigree,
    data = explanatory
  )
  print(dbRDA_result)

  # Sequential permutation test, one row per model term.
  anova_result <- anova(
    dbRDA_result,
    by = "terms", permutations = permutations
  )
  print(anova_result)
  heribitility_calculation_beta(anova_result)
}


# Print the share of the total sum of squares carried by each term of a
# by-term dbRDA permutation table.
#
# anova_result: object with a SumOfSqs element whose first four entries are,
#               in this order, location, pedigree, location-by-pedigree
#               interaction and residual.
#
# Prints the table and the three fractions; the value of the final print()
# (the interaction fraction) is returned invisibly.
heribitility_calculation_beta <- function(anova_result) {
  sums_of_squares <- anova_result$SumOfSqs
  location_ss <- sums_of_squares[1]
  pedigree_ss <- sums_of_squares[2]
  interaction_ss <- sums_of_squares[3]
  residual_ss <- sums_of_squares[4]

  # Same summation order as before: residual, location, pedigree, GxE.
  total_ss <- residual_ss + location_ss + pedigree_ss + interaction_ss

  print(anova_result)
  print("variation explained by location")
  print(location_ss / total_ss)
  print("variation explained by pedigree")
  print(pedigree_ss / total_ss)
  print("variation explained by G*E")
  print(interaction_ss / total_ss)
}

###############################################################################

# Beta diversity: weighted and unweighted UniFrac.

# The PCoA results are loaded from QIIME 2 artifacts instead of being
# recomputed with phyloseq (the original author reports bugs in phyloseq's
# UniFrac implementation).
weighted_unifrac <- read_qza("./weighted_unifrac_pcoa_results.qza")
unweighted_unifrac <- read_qza("./unweighted_unifrac_pcoa_results.qza")


# Sample coordinates of the weighted UniFrac ordination (printed for inspection).
weighted_unifrac_cordination <- weighted_unifrac$data$Vectors
weighted_unifrac_cordination

# Only the first two principal coordinates are plotted.
weighted_unifrac_cordination <- weighted_unifrac_cordination[, c("SampleID", "PC1", "PC2")]

# Sample metadata.
G2F_Metadata_2019 <- read.table(
  "./G2f_2019_sub_corrected_metadata.tsv",
  sep = "\t",
  header = TRUE
)

# Make sure location is character rather than factor before matching on it.
G2F_Metadata_2019$location <- as.character(G2F_Metadata_2019$location)
G2F_metadata_2019_yellow_stripe_sub <- G2F_Metadata_2019[
  G2F_Metadata_2019$location %in% c(
    "INH1", "MIH1", "NCH1", "NYH3", "SCH1", "IAH2", "IAH4",
    "OHH1", "MNH1", "MOH1", "GAH2", "NEH1", "NEH2", "DEH1"
  ),
]

# Collapse locations into broad regions so the PCoA can be coloured by
# region; locations not listed below get NA.
G2F_Metadata_2019 <- dplyr::mutate(
  G2F_Metadata_2019,
  State_group = dplyr::case_when(
    location %in% c("MOH1", "IAH2", "IAH4", "MNH1", "NEH1", "NEH2") ~ "Mid West",
    location %in% c("WIH1", "INH1", "MIH1", "OHH1") ~ "East Mississippi River",
    location %in% c("GAH1", "GAH2", "SCH1", "NCH1") ~ "South",
    location %in% c("NYH2", "NYH3", "DEH1") ~ "North East"
  )
)


# Attach the metadata to the coordinates; samples missing from either table
# are dropped by the inner join.
input_pcoa_cordination_metadata <- dplyr::inner_join(
  weighted_unifrac_cordination,
  G2F_Metadata_2019,
  by = "SampleID"
)
input_pcoa_cordination_metadata
#?inner_join()

# Percentage of variation explained by the first two axes.
pc1 <- as.character(100 * round(weighted_unifrac$data$ProportionExplained[1, 1], 3))
pc2 <- as.character(100 * round(weighted_unifrac$data$ProportionExplained[1, 2], 3))

# Axis labels of the form "PC1(12.3%)".
pc1_label <- paste0("PC1(", pc1, "%)")
pc2_label <- paste0("PC2(", pc2, "%)")

### Weighted UniFrac PCoA, coloured by region (State_group).

input_pcoa_plot <- ggplot(
  input_pcoa_cordination_metadata,
  aes(x = PC1, y = PC2, color = State_group)
) +
  geom_point(size = 4, alpha = 0.8) +
  xlab(pc1_label) +
  ylab(pc2_label) +
  scale_colour_manual(values = c("purple", "red", "steelblue", "darkgreen")) +
  theme(
    axis.title = element_text(size = 10, face = "bold", colour = "grey30"),
    panel.background = element_blank(),
    panel.border = element_rect(fill = NA, colour = "grey30"),
    axis.ticks = element_blank(),
    legend.key = element_blank(),
    legend.title = element_blank(),
    legend.text = element_text(size = 20, colour = "grey30", face = "bold"),
    axis.title.x = element_text(size = 30, face = "bold"),
    axis.title.y = element_text(size = 30, face = "bold"),
    plot.title = element_text(size = 30, face = "bold"),
    legend.position = "bottom",
    axis.text = element_blank()
  )

input_pcoa_plot

# Save the weighted UniFrac figure. The plot object is passed explicitly so
# the file does not depend on whichever plot happened to be created last
# (ggsave() otherwise falls back to last_plot()).
ggsave(
  "./beta_diversity_weighted_unifrac.png",
  plot = input_pcoa_plot,
  height = 10,
  width = 10,
  device = "png"
)


# Run the dbRDA analysis. The function takes the UniFrac artifact, the
# metadata table holding the explanatory variables, and the first/last column
# index of those explanatory variables.
beta_heritibility_cal(weighted_unifrac, G2F_metadata_2019_duplicate_pedigree_ys_filtered_selected_location, 1, 49)
beta_heritibility_cal(unweighted_unifrac, G2F_metadata_2019_duplicate_pedigree_ys_filtered_selected_location, 1, 49)
colnames(G2F_metadata_2019_NO_sub_yellow_stripe_sub)

# Proportion of weighted UniFrac variation attributed to each term.
# NOTE(review): these values are typed in by hand, presumably from the output
# of the calls above -- re-check them whenever the inputs change.
weighted_unifrac_contribution_table <- data.frame(
  Value = c(0.5071286, 0.04521155, 0.1668562),
  type = "Weighted",
  test_var = c("Environment", "Maize Genotype", "GXE"),
  row.names = c("Environment", "Maize Genotype", "GXE"),
  stringsAsFactors = FALSE
)
weighted_unifrac_contribution_table

# Same table for unweighted UniFrac (values also entered by hand).
unweighted_unifrac_contribution_table <- data.frame(
  Value = c(0.439707, 0.04488889, 0.1985142),
  type = "Unweighted",
  test_var = c("Environment", "Maize Genotype", "GXE"),
  row.names = c("Environment", "Maize Genotype", "GXE"),
  stringsAsFactors = FALSE
)


# Stack the weighted and unweighted tables.
unifrac_contribution_table <- dplyr::bind_rows(
  weighted_unifrac_contribution_table,
  unweighted_unifrac_contribution_table
)
unifrac_contribution_table

# Rename the first column of alpha_result to "type" so both tables share the
# column name before they are bound together.
colnames(alpha_result)[1] <- "type"

# Tag the beta diversity rows, then append the alpha diversity results.
unifrac_contribution_table$category <- "Beta Diversity"
alpha_beta_diversity_table <- dplyr::bind_rows(unifrac_contribution_table, alpha_result)

# Stacked bar chart of the variance explained by each term (fill = test_var),
# one bar per diversity metric (x = type).
#
# contribution_table: data frame with columns `type`, `Value` and `test_var`.
# Returns the ggplot object. Both figures below share this layout, so it is
# defined once instead of being copy-pasted.
plot_variance_contribution <- function(contribution_table) {
  ggplot(data = contribution_table, aes(x = type, y = Value, fill = test_var)) +
    geom_bar(position = "stack", stat = "identity") +
    ylab("Variance explained") +
    # The original break vector listed 0.8 twice; each break is given once here.
    scale_y_continuous(
      breaks = c(0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8),
      limits = c(0, 0.8)
    ) +
    theme(
      axis.title = element_text(size = 20, face = "bold", colour = "grey30"),
      panel.background = element_blank(),
      legend.text = element_text(color = "black", size = 30),
      axis.title.y = element_text(size = 30, face = "bold"),
      plot.title = element_text(size = 30, face = "bold"),
      axis.text.y = element_text(size = 30, face = "bold"),
      axis.text.x = element_text(size = 28, face = "bold"),
      strip.text.x = element_text(size = 30, face = "bold"),
      axis.title.x = element_blank(),
      legend.title = element_blank(),
      legend.position = "bottom"
    )
}

# Contribution of G, E and GxE to beta diversity only (displayed, not saved).
plot_variance_contribution(unifrac_contribution_table)

#ggsave("./beta_diversity_weighted_unifrac.png", height=10, width=10, device="png")

# Contribution of G, E and GxE to alpha and beta diversity together.
alpha_beta_diversity_plot <- plot_variance_contribution(alpha_beta_diversity_table)
alpha_beta_diversity_plot

# Pass the plot explicitly rather than relying on ggsave()'s last_plot() default.
ggsave(
  "./alpha_beta_diversity_weighted_unifrac.png",
  plot = alpha_beta_diversity_plot,
  height = 10,
  width = 12,
  device = "png"
)


#######################################################################

# Collect the core taxa of every location into one long table.
#
# For each location the function reads the prevalence-filtered, rarefied
# QIIME 2 table
#   <core-metrics dir for the location>/rarefied_table_<taxonomic_level>_level_taxonomy_<prevalence_level>_filtered.qza
# and records its row names as that location's core taxa.
#
# Arguments:
#   taxonomic_level  taxonomy level used in the file name (e.g. 5, 6).
#   prevalence_level prevalence threshold used in the file name (e.g. 0.6).
#   locations        locations to visit; defaults to the unique locations of
#                    G2F_metadata_2019_duplicate_pedigree_ys_filtered_selected_location,
#                    which is what the function always used before.
#
# Returns a data frame with character columns `Taxaname` and `location`, one
# row per (core taxon, location) pair.
location_core_microbiome <- function(
    taxonomic_level,
    prevalence_level,
    locations = unique(G2F_metadata_2019_duplicate_pedigree_ys_filtered_selected_location$location)) {
  locations <- as.character(locations)

  # Zero-row template so the result keeps both columns even when there is
  # nothing to bind.
  empty_core_taxa_list <- data.frame(
    Taxaname = character(0),
    location = character(0),
    stringsAsFactors = FALSE
  )

  # One small table per location, bound once at the end instead of growing a
  # data frame inside the loop.
  per_location_core_taxa <- lapply(locations, function(experiment_field) {
    print(experiment_field)
    directory_name <- paste0(
      "./core-metrics-results-dada2_table-no-mitochondria-no-chloroplast-blank-filtered-yellow-stripe-duplicate-pedigree-",
      experiment_field,
      "-1500"
    )
    print(directory_name)
    path_name <- paste0(
      directory_name,
      "/rarefied_table_", taxonomic_level,
      "_level_taxonomy_", prevalence_level,
      "_filtered.qza"
    )
    print(path_name)

    # Row names of the filtered table are the taxa that passed the filter.
    core_taxa <- as.data.frame(read_qza(path_name)$data)
    taxa_names <- rownames(core_taxa)

    # rep() keeps this valid when a location has no core taxa at all; the
    # previous `$<-` assignment failed on a zero-row table.
    data.frame(
      Taxaname = taxa_names,
      location = rep(experiment_field, length(taxa_names)),
      row.names = taxa_names,
      stringsAsFactors = FALSE
    )
  })

  dplyr::bind_rows(c(list(empty_core_taxa_list), per_location_core_taxa))
}

# Turn a long (Taxaname, location) core-taxa table into a presence/absence
# data frame: one row per taxon, one column per location, 1 when the taxon is
# listed for that location and 0 otherwise.
core_microbiome_list_to_matrix <- function(core_taxa_list) {
  taxa_levels <- unique(core_taxa_list$Taxaname)
  site_levels <- unique(core_taxa_list$location)

  # Start from "absent everywhere" and switch on the observed pairs.
  presence <- matrix(
    0,
    nrow = length(taxa_levels),
    ncol = length(site_levels),
    dimnames = list(taxa_levels, site_levels)
  )

  for (taxon in taxa_levels) {
    print(taxon)
    sites_with_taxon <- core_taxa_list$location[which(core_taxa_list$Taxaname == taxon)]
    for (site in sites_with_taxon) {
      print(site)
      presence[taxon, site] <- 1
    }
  }

  as.data.frame(presence)
}
#core microbiome 

##############################################################################################

# Core taxa per location at each taxonomy level (first argument), using the
# tables filtered at a prevalence of 0.6 (second argument).
location_species_core_microbiome_list <- location_core_microbiome(7, 0.6)
location_genus_core_microbiome_list <- location_core_microbiome(6, 0.6)
location_family_core_microbiome_list <- location_core_microbiome(5, 0.6)
location_order_core_microbiome_list <- location_core_microbiome(4, 0.6)
location_class_core_microbiome_list <- location_core_microbiome(3, 0.6)
location_phylumn_core_microbiome_list <- location_core_microbiome(2, 0.6)

# Taxa that are core in exactly one location. The results stay grouped by
# Taxaname, as before.
unique_location_genus_core_microbiome_list <- location_genus_core_microbiome_list %>% group_by(Taxaname) %>% filter(n() <= 1)
unique_location_family_core_microbiome_list <- location_family_core_microbiome_list %>% group_by(Taxaname) %>% filter(n() <= 1)
unique_location_order_core_microbiome_list <- location_order_core_microbiome_list %>% group_by(Taxaname) %>% filter(n() <= 1)
unique_location_class_core_microbiome_list <- location_class_core_microbiome_list %>% group_by(Taxaname) %>% filter(n() <= 1)
unique_location_phylumn_core_microbiome_list <- location_phylumn_core_microbiome_list %>% group_by(Taxaname) %>% filter(n() <= 1)

# Taxa that are core in at least 10 locations.
common_location_genus_core_microbiome_list <- location_genus_core_microbiome_list %>% group_by(Taxaname) %>% filter(n() >= 10)
common_location_family_core_microbiome_list <- location_family_core_microbiome_list %>% group_by(Taxaname) %>% filter(n() >= 10)
common_location_order_core_microbiome_list <- location_order_core_microbiome_list %>% group_by(Taxaname) %>% filter(n() >= 10)
common_location_class_core_microbiome_list <- location_class_core_microbiome_list %>% group_by(Taxaname) %>% filter(n() >= 10)
common_location_phylumn_core_microbiome_list <- location_phylumn_core_microbiome_list %>% group_by(Taxaname) %>% filter(n() >= 10)

# Presence/absence matrices (taxon x location), printed for inspection.
location_genus_core_microbiome_matrix <- core_microbiome_list_to_matrix(location_genus_core_microbiome_list)
location_genus_core_microbiome_matrix

location_family_core_microbiome_matrix <- core_microbiome_list_to_matrix(location_family_core_microbiome_list)
location_family_core_microbiome_matrix


# Split the ';'-separated taxonomy string into one column per rank so the
# heatmap can use the Family name on its y axis.
location_family_core_microbiome_list[c('Domain','Phylumn','Class','Order','Family')] <- str_split_fixed(location_family_core_microbiome_list$Taxaname, ';', 5)


# One tile per (location, family) pair that is part of the core microbiome.
location_family_core_microbiome_heatmap <- ggplot(location_family_core_microbiome_list, aes(x = location, y = Family)) +
  geom_tile() +
  theme(
    axis.text.y = element_text(size = 15, face = "bold"),
    axis.text.x = element_text(size = 15, angle = 90, face = "bold"),
    axis.title.x = element_text(size = 15, face = "bold"),
    axis.title.y = element_text(size = 15, face = "bold")
  )

location_family_core_microbiome_heatmap



# The original call also passed an undefined object `core_taxa_plot` as a
# stray positional argument, which ggsave() would have taken as `path`; it is
# removed so the heatmap is written to the working directory.
ggsave(
  "./location_family_core_microbiome_heatmap.png",
  plot = location_family_core_microbiome_heatmap,
  height = 8,
  width = 12,
  device = "png"
)

##############################################################################################
# Core taxa per maize pedigree.
# NOTE(review): the object names say "phylumn", but the file read below is the
# level-5 table and the taxonomy is later split down to Family -- the names
# are kept unchanged in case later code refers to them.

# Empty (Taxaname, Corrected_pedigree) table with character columns to bind onto.
Corrected_pedigree_phylumn_60_core_taxa_list <- data.frame(
  Taxaname = character(0),
  Corrected_pedigree = character(0),
  stringsAsFactors = FALSE
)

# Visit every pedigree and collect the level-5 taxa kept by the 0.6
# prevalence filter.

for (pedigree in unique(G2F_metadata_2019_duplicate_pedigree_ys_filtered_selected_location$Corrected_pedigree)) {

  #print(pedigree)
  # "/" cannot appear in the directory name; it is replaced by "X" to build
  # the path (and the pedigree is stored in that form too).
  pedigree <- gsub("/", "X", pedigree, fixed = TRUE)
  print(pedigree)
  # Directory and file of this pedigree's filtered table.
  directory_name <- paste0("./core-metrics-results-dada2_table-no-mitochondria-no-chloroplast-blank-filtered-yellow-stripe-duplicate-pedigree-", pedigree, "-1500")
  print(directory_name)
  path_name <- paste0(directory_name, "/rarefied_table_5_level_taxonomy_0.6_filtered.qza")
  print(path_name)
  # Load the QIIME 2 artifact; the row names of its table are the core taxa.
  phylumn_60_filtered_core_taxa <- read_qza(path_name)
  phylumn_60_filtered_core_taxa <- as.data.frame(phylumn_60_filtered_core_taxa$data)
  phylumn_60_filtered_core_taxa$Taxaname <- rownames(phylumn_60_filtered_core_taxa)
  phylumn_60_filtered_core_taxa$Corrected_pedigree <- pedigree
  # Keep only the taxon name and pedigree, then append to the running list.
  Corrected_pedigree_core_taxa <- phylumn_60_filtered_core_taxa[, c("Taxaname", "Corrected_pedigree")]
  Corrected_pedigree_phylumn_60_core_taxa_list <- dplyr::bind_rows(
    Corrected_pedigree_phylumn_60_core_taxa_list,
    Corrected_pedigree_core_taxa
  )
}

# Number of distinct core taxa and of pedigrees (printed).
length(unique(Corrected_pedigree_phylumn_60_core_taxa_list$Taxaname))
length(unique(Corrected_pedigree_phylumn_60_core_taxa_list$Corrected_pedigree))

# Empty taxon x pedigree frame, filled with NA for now.
Corrected_pedigree_phylumn_60_core_taxa_matrix <- as.data.frame(matrix(
  nrow = length(unique(Corrected_pedigree_phylumn_60_core_taxa_list$Taxaname)),
  ncol = length(unique(Corrected_pedigree_phylumn_60_core_taxa_list$Corrected_pedigree))
))
colnames(Corrected_pedigree_phylumn_60_core_taxa_matrix) <- unique(Corrected_pedigree_phylumn_60_core_taxa_list$Corrected_pedigree)
rownames(Corrected_pedigree_phylumn_60_core_taxa_matrix) <- unique(Corrected_pedigree_phylumn_60_core_taxa_list$Taxaname)

#Corrected_pedigree_phylumn_60_core_taxa_matrix["d__Bacteria;p__Proteobacteria","SCH1"]

# Mark every (taxon, pedigree) pair present in the list with 1.
for (taxa_name in unique(Corrected_pedigree_phylumn_60_core_taxa_list$Taxaname)) {
  print(taxa_name)
  temp_Corrected_pedigree_taxa_combination <- Corrected_pedigree_phylumn_60_core_taxa_list[
    which(Corrected_pedigree_phylumn_60_core_taxa_list$Taxaname == taxa_name),
  ]
  for (Corrected_pedigree in temp_Corrected_pedigree_taxa_combination$Corrected_pedigree) {
    print(Corrected_pedigree)
    Corrected_pedigree_phylumn_60_core_taxa_matrix[taxa_name, Corrected_pedigree] <- 1
  }
}

# Pairs never marked above are absent: NA becomes 0.
Corrected_pedigree_phylumn_60_core_taxa_matrix[is.na(Corrected_pedigree_phylumn_60_core_taxa_matrix)] <- 0
Corrected_pedigree_phylumn_60_core_taxa_matrix

Corrected_pedigree_phylumn_60_core_taxa_list

# Split the ';'-separated taxonomy string into one column per rank.
Corrected_pedigree_phylumn_60_core_taxa_list[c('Domain','Phylumn','Class','Order','Family')] <- str_split_fixed(Corrected_pedigree_phylumn_60_core_taxa_list$Taxaname, ';', 5)


# One tile per (pedigree, family) pair that is part of the core microbiome.
Corrected_pedigree_family_core_microbiome_heatmap <- ggplot(
  Corrected_pedigree_phylumn_60_core_taxa_list,
  aes(x = Corrected_pedigree, y = Family)
) +
  geom_tile() +
  theme(
    axis.text.y = element_text(size = 15, face = "bold"),
    axis.text.x = element_text(size = 15, angle = 90, face = "bold"),
    axis.title.x = element_text(size = 15, face = "bold"),
    axis.title.y = element_text(size = 15, face = "bold")
  )

Corrected_pedigree_family_core_microbiome_heatmap

###########################################################################################################




################################################################

# Overall F-test p-value of a fitted linear model.
#
# modelsummary: the object returned by summary() on an lm fit; its
#   `fstatistic` element must hold the entries `value`, `numdf` and `dendf`.
# Returns the upper-tail probability of the F statistic as an unnamed
#   numeric, or NA_real_ when the summary carries no F statistic (for
#   example an intercept-only model).
Pvaluecal <- function(modelsummary) {
  # Read the F statistic directly instead of unlisting the whole summary
  # (which also flattened residuals and coefficients just to get 3 numbers).
  f_stat <- modelsummary[["fstatistic"]]
  if (is.null(f_stat)) {
    return(NA_real_)
  }
  pf(
    f_stat[["value"]],
    f_stat[["numdf"]],
    f_stat[["dendf"]],
    lower.tail = FALSE
  )
}

###################################################
# Relate the median alpha diversity of each location to environmental factors.
# If the plyr package is loaded, dplyr's group_by() will not work as expected.
G2F_2019_median_alpha_diversity_by_location_data <- G2F_metadata_2019 %>%
  group_by(location) %>%
  summarize(
    median_shannon = median(Shannon),
    median_observed_features = median(Observed),
    median_simpson = median(Simpson)
  )




# Soil measurements (comma separated) and weather averages/sums (tab separated).
G2f_2019_soil_data <- read.csv("./g2f_2019_soil_data.csv", sep = ",")
G2f_2019_weather_data <- read.csv("./G2F_2019_weather_aervage_and_sum.tsv", sep = "\t")

# Attach soil data, then weather data, to the per-location medians.
G2F_2019_median_alpha_diversity_by_location_with_soil_data <- dplyr::left_join(
  G2F_2019_median_alpha_diversity_by_location_data,
  G2f_2019_soil_data,
  by = c("location" = "Location")
)
G2F_2019_median_alpha_diversity_by_location_with_soil_weather_data <- dplyr::left_join(
  G2F_2019_median_alpha_diversity_by_location_with_soil_data,
  G2f_2019_weather_data,
  by = c("location" = "location")
)
# Remove the NCH1 location because it does not contain any soil data. The row
# is selected by name rather than by the hard-coded position 9 used before,
# so the right location is dropped even if the set or order of locations
# changes.
G2F_2019_median_alpha_diversity_by_location_with_soil_weather_data <- G2F_2019_median_alpha_diversity_by_location_with_soil_weather_data[
  !(G2F_2019_median_alpha_diversity_by_location_with_soil_weather_data$location %in% "NCH1"),
]
G2F_2019_median_alpha_diversity_by_location_with_soil_weather_data <- as.data.frame(G2F_2019_median_alpha_diversity_by_location_with_soil_weather_data)

# Keep the three diversity medians plus the soil and weather covariates that
# are regressed against them.
median_alpha_environment_group <- G2F_2019_median_alpha_diversity_by_location_with_soil_weather_data[, c(
  "median_shannon", "median_observed_features", "median_simpson",
  "X1.1.Soil.pH", "WDRF.Buffer.pH", "X1.1.S.Salts.mmho.cm",
  "Organic.Matter.LOI..", "Nitrate.N.ppm.N", "lbs.N.A",
  "Potassium.ppm.K", "Sulfate.S.ppm.S", "Calcium.ppm.Ca",
  "Magnesium.ppm.Mg", "Sodium.ppm.Na", "CEC.Sum.of.Cations.me.100g",
  "Temperature..C.", "Relative.Humidity....", "Rainfall..mm."
)]
median_alpha_environment_group

###############################################################################################

# Simple linear regressions of each median diversity metric on every column
# of median_alpha_environment_group, one model per column. The response
# columns themselves are part of that table, so they are fitted as
# predictors too.

# --- Shannon ---------------------------------------------------------------
shannon_result <- lapply(
  median_alpha_environment_group,
  function(x) lm(median_shannon ~  x, data = median_alpha_environment_group )
)
shannon_result_summary <- lapply(shannon_result, summary)
shannon_result_summary

# Quick look at median Shannon against soil pH (redrawn with full styling later).
median_shannon_ph <- ggplot(
  median_alpha_environment_group,
  aes(x = X1.1.Soil.pH, y = median_shannon)
) +
  geom_point(size = 4) +
  geom_smooth(method = lm, se = FALSE, fullrange = TRUE)

median_shannon_ph




######################################################################################################

# --- Observed features -----------------------------------------------------
observed_features_result <- lapply(
  median_alpha_environment_group,
  function(x) lm(median_observed_features ~  x, data = median_alpha_environment_group)
)
observed_features_result_summary <- lapply(observed_features_result, summary)
observed_features_result_summary

median_observed_features_ph <- ggplot(
  median_alpha_environment_group,
  aes(x = X1.1.Soil.pH, y = median_observed_features)
) +
  geom_point(size = 4) +
  geom_smooth(method = lm, se = FALSE, fullrange = TRUE)


median_observed_features_ph

########################################################################################################

# --- Simpson ---------------------------------------------------------------
simpson_result <- lapply(
  median_alpha_environment_group,
  function(x) lm(median_simpson ~  x, data = median_alpha_environment_group)
)
simpson_result_summary <- lapply(simpson_result, summary)
simpson_result_summary

median_simpson_ph <- ggplot(
  median_alpha_environment_group,
  aes(x = X1.1.Soil.pH, y = median_simpson)
) +
  geom_point(size = 4) +
  geom_smooth(method = lm, se = FALSE, fullrange = TRUE)

median_simpson_ph

###########################################################################################################3


# Adjusted R-squared and overall F-test p-value (rounded to 3 digits) of the
# soil pH models, one pair per diversity metric.
soil_pH_shannon_r_square <- shannon_result_summary[["X1.1.Soil.pH"]][["adj.r.squared"]]
soil_pH_shannon_P_value <- round(Pvaluecal(shannon_result_summary[["X1.1.Soil.pH"]]), digits = 3)

soil_pH_observed_features_r_square <- observed_features_result_summary[["X1.1.Soil.pH"]][["adj.r.squared"]]
soil_pH_observed_features_P_value <- round(Pvaluecal(observed_features_result_summary[["X1.1.Soil.pH"]]), digits = 3)

soil_pH_simpson_r_square <- simpson_result_summary[["X1.1.Soil.pH"]][["adj.r.squared"]]
soil_pH_simpson_P_value <- round(Pvaluecal(simpson_result_summary[["X1.1.Soil.pH"]]), digits = 3)


# Inspect the Simpson ~ CEC model as well.
simpson_result_summary[["CEC.Sum.of.Cations.me.100g"]]

# Print the soil pH statistics.
soil_pH_shannon_r_square
soil_pH_observed_features_r_square
soil_pH_simpson_r_square


soil_pH_shannon_P_value
soil_pH_observed_features_P_value
soil_pH_simpson_P_value

###############################################################################################################


# Publication versions of the soil pH panels: median diversity per location
# against soil pH with a linear fit, annotated with the p-value and R^2.
#
# NOTE(review): the R^2 values in the annotations are typed in by hand and are
# not tied to the soil_pH_*_r_square objects -- confirm they still match the
# fitted models before publishing.

# Theme shared by the three panels (previously repeated in each plot).
soil_ph_panel_theme <- theme(
  legend.text = element_text(color = "black", size = 30),
  legend.title = element_text(color = "black", size = 30),
  axis.title.x = element_text(size = 30, face = "bold"),
  axis.title.y = element_text(size = 30, face = "bold"),
  plot.title = element_text(size = 30, face = "bold"),
  axis.text.y = element_text(size = 30, face = "bold"),
  axis.text.x = element_text(size = 30, face = "bold"),
  panel.background = element_blank(),
  axis.line = element_line(colour = "black")
)

median_shannon_ph <- ggplot(median_alpha_environment_group, aes(x = X1.1.Soil.pH, y = median_shannon)) +
  geom_point(size = 4) +
  geom_smooth(method = lm, se = FALSE, fullrange = TRUE) +
  scale_y_continuous(breaks = c(2.7, 2.9, 3.1, 3.3, 3.5, 3.7, 3.9), limits = c(2.7, 3.9)) +
  xlab("Soil pH") +
  ylab("median Shannon Index") +
  soil_ph_panel_theme +
  annotate(
    "text",
    x = c(7, 7),
    y = c(3.5, 3.6),
    label = c(paste("p-value = ", soil_pH_shannon_P_value), expression("R"^2~"= 0.414")),
    size = 8
  )

median_shannon_ph


#ggsave("/home/hl46161/new_G2F_dada2/exported_table_phyloseq/median_shannon_ph.png", height=10, width=10, device="png")

median_ob_ph <- ggplot(median_alpha_environment_group, aes(x = X1.1.Soil.pH, y = median_observed_features)) +
  geom_point(size = 4) +
  geom_smooth(method = lm, se = FALSE, fullrange = TRUE) +
  xlab("Soil pH") +
  ylab("median Observed Features") +
  soil_ph_panel_theme +
  annotate(
    "text",
    x = c(7, 7),
    y = c(40, 43),
    label = c(paste("p-value = ", soil_pH_observed_features_P_value), expression("R"^2~"= 0.304")),
    size = 8
  )

median_ob_ph

#ggsave("/home/hl46161/new_G2F_dada2/exported_table_phyloseq/median_observed_features_ph.png", height=10, width=10, device="png")


median_simpson_ph <- ggplot(median_alpha_environment_group, aes(x = X1.1.Soil.pH, y = median_simpson)) +
  geom_point(size = 4) +
  geom_smooth(method = lm, se = FALSE, fullrange = TRUE) +
  xlab("Soil pH") +
  ylab("median Simpson Index") +
  soil_ph_panel_theme +
  annotate(
    "text",
    x = c(7.1, 7.1),
    y = c(0.96, 0.963),
    label = c(paste("p-value = ", soil_pH_simpson_P_value), expression("R"^2~"= 0.493")),
    size = 8
  )

median_simpson_ph


#ggsave("/home/hl46161/new_G2F_dada2/exported_table_phyloseq/average_simpson_ph.png", height=10, width=10, device="png")


# Combine the three panels side by side.
median_alpha_diversity_plot <- ggarrange(median_shannon_ph, median_simpson_ph, median_ob_ph, nrow = 1, ncol = 3)
median_alpha_diversity_plot

# Save the combined figure. The arranged plot is passed explicitly: without
# `plot =`, ggsave() writes whatever last_plot() returns, which is not
# guaranteed to be the arranged figure.
ggsave(
  "./median_alpha_diversity_plot.png",
  plot = median_alpha_diversity_plot,
  height = 8,
  width = 24,
  device = "png"
)

##############################################################################################

# Flatten the strict lower triangle of a square distance matrix into a vector.
#
# DM: square data frame of pairwise distances. Sample labels are taken from a
#   column named `X` when one exists (as produced by reading a matrix from a
#   delimited file); otherwise the existing row names are used.
# Returns a numeric vector of the below-diagonal entries after sorting rows
#   and columns by label, walking the rows from last to first and, within a
#   row, the columns from right to left. numeric(0) for fewer than 2 rows.
#
# The previous version always ran `rownames(DM) <- DM$X`. Without an `X`
# column that assigned NULL, which resets the row names to 1..n, so rows were
# then ordered by "1", "10", "11", ... while columns were ordered by label and
# the extracted cells no longer formed the lower triangle.
calc_distance_vector <- function(DM) {
  if ("X" %in% colnames(DM)) {
    rownames(DM) <- DM$X
    # Drop the label column so only distances remain.
    DM$X <- NULL
  }

  # Put rows and columns in the same (alphabetical) label order.
  DM <- DM[order(row.names(DM)), order(colnames(DM)), drop = FALSE]

  row_loop <- nrow(DM)
  print(row_loop)
  column_loop <- ncol(DM) - 1
  print(column_loop)

  if (nrow(DM) != ncol(DM)) {
    stop("calc_distance_vector() needs a square distance matrix", call. = FALSE)
  }
  if (!identical(as.character(row.names(DM)), as.character(colnames(DM)))) {
    warning(
      "row and column labels differ; lower-triangle pairs may be misaligned",
      call. = FALSE
    )
  }

  # Nothing lies below the diagonal of a 0x0 or 1x1 matrix.
  if (row_loop < 2) {
    return(numeric(0))
  }

  distance_matrix <- as.matrix(DM)
  # Row r contributes columns r-1, ..., 1; rows are visited bottom-up.
  lower_triangle <- lapply(
    seq(row_loop, 2),
    function(r) distance_matrix[r, seq(r - 1, 1)]
  )
  as.numeric(unlist(lower_triangle))
}

############## Regressions and Mantel tests between distance matrices
# The environmental-factor distances between locations were generated by
# QIIME 2 from the ASV table that excludes NCH1.

# Directory holding the per-location distance matrices.
location_metrics_dir <- "./core-metrics-results-dada2_table-no-mitochondria-no-chloroplast-blank-filtered-yellow-stripe-duplicate-pedigree-no-NCH1-group-by-location-from-rarefied_table-18000"

Relative_Humidity_distance <- read_qza(file.path(location_metrics_dir, "Relative.Humidity...._distance_matrix.qza"))
Potassium_distance <- read_qza(file.path(location_metrics_dir, "Potassium.ppm.K_distance_matrix.qza"))
Solar_radiation_distance <- read_qza(file.path(location_metrics_dir, "Solar.Radiation..W.m2._distance_matrix.qza"))
Temperature_distance <- read_qza(file.path(location_metrics_dir, "Temperature..C._distance_matrix.qza"))


location_weighted_unifrac <- read_qza(file.path(location_metrics_dir, "weighted_unifrac_distance_matrix.qza"))
location_unweighted_unifrac <- read_qza(file.path(location_metrics_dir, "unweighted_unifrac_distance_matrix.qza"))

# Convert every artifact's distance object into a square data frame.
location_weighted_unifrac <- as.data.frame(as.matrix(location_weighted_unifrac$data))
location_unweighted_unifrac <- as.data.frame(as.matrix(location_unweighted_unifrac$data))


Relative_Humidity_distance <- as.data.frame(as.matrix(Relative_Humidity_distance$data))
Potassium_distance <- as.data.frame(as.matrix(Potassium_distance$data))
Solar_radiation_distance <- as.data.frame(as.matrix(Solar_radiation_distance$data))
Temperature_distance <- as.data.frame(as.matrix(Temperature_distance$data))


# Locations present in both the UniFrac and the environmental matrices.
location_names <- rownames(location_weighted_unifrac)
factor_location_names <- rownames(Potassium_distance)
intersect_names <- intersect(location_names, factor_location_names)

# Restrict every matrix to the shared locations, in one common order.
Relative_Humidity_distance <- Relative_Humidity_distance[intersect_names, intersect_names]
Potassium_distance <- Potassium_distance[intersect_names, intersect_names]
Temperature_distance <- Temperature_distance[intersect_names, intersect_names]
Solar_radiation_distance <- Solar_radiation_distance[intersect_names, intersect_names]
# The UniFrac matrices were previously left untouched. Aligning them too
# guarantees that the Mantel tests and the regressions compare the same pairs
# of locations (a no-op when both sets of locations already agree).
location_weighted_unifrac <- location_weighted_unifrac[intersect_names, intersect_names]
location_unweighted_unifrac <- location_unweighted_unifrac[intersect_names, intersect_names]


# Lower-triangle distances of each environmental factor as plain vectors.
Relative_Humidity_distance_vector <- calc_distance_vector(Relative_Humidity_distance)
Potassium_distance_vector <- calc_distance_vector(Potassium_distance)
Solar_radiation_distance_vector <- calc_distance_vector(Solar_radiation_distance)
Temperature_distance_vector <- calc_distance_vector(Temperature_distance)

# Extract the weighted UniFrac distances into a vector.
weighted_unifrac_distance_vector <- calc_distance_vector(location_weighted_unifrac)
weighted_unifrac_distance_vector

# Extract the unweighted UniFrac distances into a vector.
unweighted_unifrac_distance_vector <- calc_distance_vector(location_unweighted_unifrac)
unweighted_unifrac_distance_vector

## One row per pair of locations.
G2F_distance_data <- data.frame(Temperature_distance_vector,Solar_radiation_distance_vector,Relative_Humidity_distance_vector,Potassium_distance_vector,weighted_unifrac_distance_vector,unweighted_unifrac_distance_vector)
G2F_distance_data


# Linear fits of community distance on environmental distance. Pairwise
# distances are not independent observations, which is why the Mantel
# permutation tests below are run as well.
summary(lm(weighted_unifrac_distance_vector ~ Relative_Humidity_distance_vector,data=G2F_distance_data))
summary(lm(weighted_unifrac_distance_vector ~ Potassium_distance_vector,data=G2F_distance_data))
summary(lm(weighted_unifrac_distance_vector ~ Solar_radiation_distance_vector,data=G2F_distance_data))
summary(lm(weighted_unifrac_distance_vector ~ Temperature_distance_vector,data=G2F_distance_data))

summary(lm(unweighted_unifrac_distance_vector ~ Relative_Humidity_distance_vector,data=G2F_distance_data))
summary(lm(unweighted_unifrac_distance_vector ~ Potassium_distance_vector,data=G2F_distance_data))
summary(lm(unweighted_unifrac_distance_vector ~ Solar_radiation_distance_vector,data=G2F_distance_data))
summary(lm(unweighted_unifrac_distance_vector ~ Temperature_distance_vector,data=G2F_distance_data))

# Mantel tests (Pearson, 999 permutations). The argument is spelled
# `permutations`; the previous `permutation=` only worked through partial
# argument matching.
weighted_unifrac_Potassium_distance_mantel <- mantel(Potassium_distance, location_weighted_unifrac, method = "pearson", permutations = 999)
unweighted_unifrac_Potassium_distance_mantel <- mantel(Potassium_distance, location_unweighted_unifrac, method = "pearson", permutations = 999)

weighted_unifrac_Temperature_distance_mantel <- mantel(Temperature_distance, location_weighted_unifrac, method = "pearson", permutations = 999)
unweighted_unifrac_Temperature_distance_mantel <- mantel(Temperature_distance, location_unweighted_unifrac, method = "pearson", permutations = 999)

weighted_unifrac_Relative_Humidity_distance_mantel <- mantel(Relative_Humidity_distance, location_weighted_unifrac, method = "pearson", permutations = 999)
unweighted_unifrac_Relative_Humidity_distance_mantel <- mantel(Relative_Humidity_distance, location_unweighted_unifrac, method = "pearson", permutations = 999)

weighted_unifrac_Solar_radiation_distance_mantel <- mantel(Solar_radiation_distance, location_weighted_unifrac, method = "pearson", permutations = 999)
unweighted_unifrac_Solar_radiation_distance_mantel <- mantel(Solar_radiation_distance, location_unweighted_unifrac, method = "pearson", permutations = 999)



# Print the Mantel results.
weighted_unifrac_Potassium_distance_mantel
unweighted_unifrac_Potassium_distance_mantel

weighted_unifrac_Temperature_distance_mantel
unweighted_unifrac_Temperature_distance_mantel


weighted_unifrac_Relative_Humidity_distance_mantel
unweighted_unifrac_Relative_Humidity_distance_mantel

weighted_unifrac_Solar_radiation_distance_mantel
unweighted_unifrac_Solar_radiation_distance_mantel


###############################################################################################################


# Weighted UniFrac distance between locations against their distance in
# relative humidity, with a linear fit. The annotation shows the Mantel test
# significance and an R^2 value.
# NOTE(review): the R^2 in the label is typed in by hand -- confirm it matches
# the fitted model.
Relative_Humidity_weighted <- ggplot(
  data = G2F_distance_data,
  aes(x = Relative_Humidity_distance_vector, y = weighted_unifrac_distance_vector)
) +
  geom_point(size = 3, color = "black") +
  scale_color_manual(values = c("cyan")) +
  labs(x = "distance in relative humidity", y = "weighted unifrac distance") +
  geom_smooth(method = "lm", color = "blue") +
  theme(
    legend.text = element_text(color = "black", size = 30),
    legend.title = element_text(color = "black", size = 30),
    axis.title.x = element_text(size = 30, face = "bold"),
    axis.title.y = element_text(size = 30, face = "bold"),
    plot.title = element_text(size = 30, face = "bold"),
    axis.text.y = element_text(size = 30, face = "bold"),
    axis.text.x = element_text(size = 30, face = "bold"),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    axis.line = element_line(colour = "black")
  ) +
  annotate(
    "text",
    x = c(10, 10),
    y = c(0.28, 0.30),
    label = c(
      paste("p-value = ", weighted_unifrac_Relative_Humidity_distance_mantel$signif),
      expression("R"^2~"= 0.335")
    ),
    size = 8
  )

Relative_Humidity_weighted

#ggsave("./Relative_Humidity_weighted.png", height=10, width=10, device="png")


# Scatter plot of the potassium distance vector (x) against the weighted
# UniFrac distance vector (y) with a linear-model trend line.
# NOTE(review): the R^2 value in the annotation is hard-coded text -- confirm
# it still matches the data after a rerun.
Potassium_vector_weighted <- ggplot(
  data = G2F_distance_data,
  mapping = aes(
    x = Potassium_distance_vector,
    y = weighted_unifrac_distance_vector
  )
) +
  geom_point(size = 3, color = "black") +
  scale_color_manual(values = c("cyan")) +
  labs(
    x = "distance in potassium",
    y = "weighted unifrac distance"
  ) +
  geom_smooth(method = "lm", color = "blue") +
  # Large bold text everywhere, blank panel, black axis lines.
  theme(
    legend.text = element_text(color = "black", size = 30),
    legend.title = element_text(color = "black", size = 30),
    axis.title.x = element_text(size = 30, face = "bold"),
    axis.title.y = element_text(size = 30, face = "bold"),
    plot.title = element_text(size = 30, face = "bold"),
    axis.text.y = element_text(size = 30, face = "bold"),
    axis.text.x = element_text(size = 30, face = "bold"),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    axis.line = element_line(colour = "black")
  ) +
  annotate(
    "text",
    x = c(470, 490),
    y = c(0.28, 0.30),
    label = c(
      paste("p-value = ", weighted_unifrac_Potassium_distance_mantel$signif),
      expression("R"^2~"= 0.394")
    ),
    size = 8
  )

Potassium_vector_weighted


# Scatter plot of the temperature distance vector (x) against the weighted
# UniFrac distance vector (y) with a linear-model trend line.
# The x-axis label previously read "distance in potassium" (copied from the
# potassium plot); it now names the variable that is actually plotted.
# NOTE(review): the R^2 value in the annotation is hard-coded text -- confirm
# it still matches the data after a rerun.
Temperature_vector_weighted <- ggplot(
  data = G2F_distance_data,
  aes(x = Temperature_distance_vector, y = weighted_unifrac_distance_vector)
) +
  geom_point(size = 3, color = "black") +
  scale_color_manual(values = c("cyan")) +
  xlab("distance in temperature") +
  ylab("weighted unifrac distance") +
  theme(
    legend.text = element_text(color = "black", size = 30),
    legend.title = element_text(color = "black", size = 30),
    axis.title.x = element_text(size = 30, face = "bold"),
    axis.title.y = element_text(size = 30, face = "bold"),
    plot.title = element_text(size = 30, face = "bold"),
    axis.text.y = element_text(size = 30, face = "bold"),
    axis.text.x = element_text(size = 30, face = "bold")
  ) +
  geom_smooth(method = "lm", color = "blue") +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    axis.line = element_line(colour = "black")
  ) +
  annotate(
    "text",
    x = c(7.5, 7.5),
    y = c(0.28, 0.30),
    label = c(
      paste("p-value = ", weighted_unifrac_Temperature_distance_mantel$signif),
      expression("R"^2~"= 0.373")
    ),
    size = 8
  )

Temperature_vector_weighted



# Place the potassium and temperature plots side by side in one figure.
G2F_environment_weighted_plot <- ggarrange(
  Potassium_vector_weighted, Temperature_vector_weighted,
  nrow = 1, ncol = 2
)
G2F_environment_weighted_plot

# Pass the figure explicitly: without `plot =`, ggsave() writes whatever plot
# was displayed last, which is not the combined figure when the script is run
# non-interactively (e.g. via source()) or another plot was drawn in between.
ggsave(
  "./G2F_environment_weighted_plot.png",
  plot = G2F_environment_weighted_plot,
  height = 10, width = 20, device = "png"
)


###########################################################################################################


# Environmental variables to test. Each name must match the prefix of a
# "<name>_distance_matrix.qza" file read in the per-genotype loop below.
# (The earlier assignment from colnames() of the metadata table was dead code:
# it was overwritten immediately by this explicit list.)
environment_factors <- c(
  "X1.1.Soil.pH", "WDRF.Buffer.pH", "X1.1.S.Salts.mmho.cm",
  "Organic.Matter.LOI..", "Nitrate.N.ppm.N", "lbs.N.A",
  "Potassium.ppm.K", "Sulfate.S.ppm.S", "Calcium.ppm.Ca",
  "Magnesium.ppm.Mg", "CEC.Sum.of.Cations.me.100g", "Temperature..C.",
  "Relative.Humidity....", "Rainfall..mm.", "Solar.Radiation..W.m2."
)

# Accumulators for the significant genotype/environment combinations.
# Start from genuinely empty character vectors: initialising with c(0) left a
# stray "0" as the first element once names were appended.
significant_association_weighted_unifrac_genotype_environment <- character(0)
significant_association_unweighted_unifrac_genotype_environment <- character(0)



# Plot and save one significant genotype x environment Mantel association.
#
# unifrac_matrix      square UniFrac distance matrix (locations x locations)
# environment_matrix  square environmental distance matrix, same locations
# mantel_result       object returned by mantel() for the two matrices
# x_label, y_label    axis labels
# save_name           path of the PNG file to write
#
# Both matrices are flattened with calc_distance_vector() (defined elsewhere
# in this file), drawn as a scatter plot with a linear trend line, and
# annotated with the Mantel p-value and statistic. Returns the plot invisibly.
plot_significant_mantel_association <- function(unifrac_matrix,
                                                environment_matrix,
                                                mantel_result,
                                                x_label,
                                                y_label,
                                                save_name) {
  unifrac_vector <- calc_distance_vector(as.data.frame(unifrac_matrix))
  environment_factor_vector <- calc_distance_vector(as.data.frame(environment_matrix))
  plot_data <- data.frame(unifrac_vector, environment_factor_vector)

  gg <- ggplot(data = plot_data, aes(x = environment_factor_vector, y = unifrac_vector)) +
    geom_point(size = 3, color = "black") +
    scale_color_manual(values = c("cyan")) +
    xlab(x_label) +
    ylab(y_label) +
    theme(
      legend.text = element_text(color = "black", size = 30),
      legend.title = element_text(color = "black", size = 30),
      axis.title.x = element_text(size = 30, face = "bold"),
      axis.title.y = element_text(size = 30, face = "bold"),
      plot.title = element_text(size = 30, face = "bold"),
      axis.text.y = element_text(size = 30, face = "bold"),
      axis.text.x = element_text(size = 30, face = "bold")
    ) +
    geom_smooth(method = "lm", color = "blue") +
    theme(
      panel.grid.major = element_blank(),
      panel.grid.minor = element_blank(),
      panel.background = element_blank(),
      axis.line = element_line(colour = "black")
    ) +
    annotate(
      "text",
      x = rep(max(environment_factor_vector) * 0.7, 2),
      y = max(unifrac_vector) * c(0.9, 0.95),
      # mantel()$statistic is the Mantel correlation r, not a coefficient of
      # determination, so it is labelled as r rather than "R ^ 2".
      label = c(
        paste("p-value = ", round(mantel_result$signif, 3)),
        paste("Mantel r =", round(mantel_result$statistic, 3))
      ),
      size = 10
    )

  print(gg)
  # Save this exact plot rather than relying on last_plot().
  ggsave(save_name, plot = gg, height = 10, width = 10, device = "png")
  invisible(gg)
}


# For every genotype, test each environmental distance matrix against the
# genotype's weighted and unweighted UniFrac distance matrices (Mantel test,
# Pearson, 999 permutations). Significant combinations (p <= 0.05) are
# recorded and plotted.
for (genotype in unique(G2F_metadata_2019_duplicate_pedigree_ys_filtered_selected_location$Corrected_pedigree)) {

  print(genotype)
  # "/" in the pedigree name is replaced by "X" to match the directory names
  changed_genotype <- gsub("\\/", "X", genotype)
  directory_name <- paste0(
    "./core-metrics-results-dada2_table-no-mitochondria-no-chloroplast-blank-filtered-yellow-stripe-duplicate-pedigree-",
    changed_genotype, "-by-location-3000"
  )
  print(directory_name)

  # paths of the weighted and unweighted UniFrac distance matrices
  weighted_path_name <- paste0(directory_name, "/weighted_unifrac_distance_matrix.qza")
  unweighted_path_name <- paste0(directory_name, "/unweighted_unifrac_distance_matrix.qza")
  # (previously printed `path_name`, which is not defined in this loop)
  print(weighted_path_name)
  print(unweighted_path_name)

  # read in the full weighted and unweighted UniFrac matrices for this genotype
  genotype_location_weighted_unifrac <- as.matrix(read_qza(weighted_path_name)$data)
  genotype_location_unweighted_unifrac <- as.matrix(read_qza(unweighted_path_name)$data)
  location_names <- rownames(genotype_location_unweighted_unifrac)

  for (environment_factor in environment_factors) {

    # read in the environmental factor distance matrix
    print(environment_factor)
    environment_factor_name <- paste0(
      "./core-metrics-results-dada2_table-no-mitochondria-no-chloroplast-blank-filtered-yellow-stripe-duplicate-pedigree-no-NCH1-group-by-location-from-rarefied_table-18000/",
      environment_factor, "_distance_matrix.qza"
    )
    print(environment_factor_name)
    environment_factor_distance <- read_qza(environment_factor_name)
    environment_factor_matrix <- as.matrix(environment_factor_distance$data)

    # locations present in both the genotype and the environmental matrices
    factor_location_names <- rownames(environment_factor_matrix)
    intersect_names <- intersect(location_names, factor_location_names)

    # A Mantel test needs at least three shared locations to be meaningful.
    if (length(intersect_names) < 3) {
      message(
        "Skipping ", genotype, " / ", environment_factor,
        ": fewer than 3 shared locations"
      )
      next
    }

    # Subset into temporaries. Overwriting the genotype matrices here made
    # them shrink cumulatively, so a later factor with more locations than an
    # earlier one would index rows that had already been dropped.
    temp_weighted_unifrac <- genotype_location_weighted_unifrac[intersect_names, intersect_names, drop = FALSE]
    temp_unweighted_unifrac <- genotype_location_unweighted_unifrac[intersect_names, intersect_names, drop = FALSE]
    temp_G2F_location_environment_factor_matrix <- environment_factor_matrix[intersect_names, intersect_names, drop = FALSE]

    # run the Mantel tests
    weighted_unifrac_location_distance_mantel <- mantel(
      temp_G2F_location_environment_factor_matrix, temp_weighted_unifrac,
      method = "pearson", permutations = 999
    )
    unweighted_unifrac_location_distance_mantel <- mantel(
      temp_G2F_location_environment_factor_matrix, temp_unweighted_unifrac,
      method = "pearson", permutations = 999
    )
    print(weighted_unifrac_location_distance_mantel)
    print(unweighted_unifrac_location_distance_mantel)

    combination_name <- paste(genotype, environment_factor, sep = "")

    # Weighted UniFrac: record and plot when significant. No `next` here: the
    # unweighted result must be evaluated regardless of the weighted outcome.
    weighted_p_value <- weighted_unifrac_location_distance_mantel$signif
    if (!is.na(weighted_p_value) && weighted_p_value <= 0.05) {
      significant_association_weighted_unifrac_genotype_environment <- append(
        significant_association_weighted_unifrac_genotype_environment,
        combination_name
      )
      plot_significant_mantel_association(
        temp_weighted_unifrac,
        temp_G2F_location_environment_factor_matrix,
        weighted_unifrac_location_distance_mantel,
        x_label = environment_factor,
        y_label = "weighted_unifrac_distance",
        save_name = paste0("./", environment_factor, "_", changed_genotype, "_weighted_unifrac.png")
      )
    }

    # Unweighted UniFrac: same treatment, independent of the weighted test.
    unweighted_p_value <- unweighted_unifrac_location_distance_mantel$signif
    if (!is.na(unweighted_p_value) && unweighted_p_value <= 0.05) {
      significant_association_unweighted_unifrac_genotype_environment <- append(
        significant_association_unweighted_unifrac_genotype_environment,
        combination_name
      )
      plot_significant_mantel_association(
        temp_unweighted_unifrac,
        temp_G2F_location_environment_factor_matrix,
        unweighted_unifrac_location_distance_mantel,
        x_label = environment_factor,
        y_label = "unweighted_unifrac_distance",
        save_name = paste0("./", environment_factor, "_", changed_genotype, "_unweighted_unifrac.png")
      )
    }

  }
}


# significant genotype/environment combinations found above
significant_association_weighted_unifrac_genotype_environment
significant_association_unweighted_unifrac_genotype_environment

summary(G2F_metadata_2019_duplicate_pedigree_ys_filtered_selected_location$Corrected_pedigree)



##############################################################################################################

# Instead of KEGG pathways, use the pathway abundances calculated from MetaCyc.
# The second column of the file supplies the row names.
pathway_description <- read.table(
  "./path_abun_unstrat_descrip.tsv",
  sep = "\t",
  header = TRUE,
  check.names = TRUE,
  row.names = 2
)


# Sample names come in as e.g. DEH.1.132; turn every "." into "-".
colnames(pathway_description) <- gsub(".", "-", colnames(pathway_description), fixed = TRUE)
# Drop the `pathway` column (the original note warns that printing it to the
# console can crash RStudio).
pathway_description[["pathway"]] <- NULL
# Keep pathways that are zero in at most 70 samples. The original note
# describes this as 80% prevalence.
# NOTE(review): presumably 70 is 20% of the sample count -- confirm.
pathway_description <- pathway_description[
  rowSums(pathway_description == 0) <= 70,
]


# Transpose so that samples are rows and pathways are columns.
input_pathway_table <- as.data.frame(t(pathway_description))
# Expose the sample names as a join key.
input_pathway_table[["SampleID"]] <- rownames(input_pathway_table)
rownames(G2F_metadata_2019) <- gsub(".", "-", rownames(G2F_metadata_2019), fixed = TRUE)
G2F_metadata_2019[["SampleID"]] <- rownames(G2F_metadata_2019)
# Merge pathway abundances with the sample metadata (shared samples only).
input_pathway_table_metadata <- dplyr::inner_join(
  input_pathway_table,
  G2F_metadata_2019,
  by = "SampleID"
)
rownames(input_pathway_table_metadata) <- input_pathway_table_metadata[["SampleID"]]

input_pathway_table_metadata$SampleID

colnames(input_pathway_table_metadata)


# Mean abundance of each pathway per location. Columns 1-280 are taken as the
# pathway columns; naming the grouping list element labels the first output
# column "location" directly.
# NOTE(review): the 280 is hard-coded -- confirm it equals the number of
# pathways that survive the prevalence filter.
average_pathway_location <- aggregate(
  x = input_pathway_table_metadata[, seq_len(280)],
  by = list(location = input_pathway_table_metadata$location),
  FUN = mean
)


# Load the per-location soil (comma-separated) and weather (tab-separated)
# tables.
G2f_2019_soil_data <- read.csv("./g2f_2019_soil_data.csv")
G2f_2019_weather_data <- read.delim("./G2F_2019_weather_aervage_and_sum.tsv")

# Attach soil, then weather, to the per-location median alpha diversity table.
G2F_average_alpha_soil <- dplyr::left_join(
  G2F_2019_median_alpha_diversity_by_location_data,
  G2f_2019_soil_data,
  by = c("location" = "Location")
)
G2F_average_alpha_soil_weather <- dplyr::left_join(
  G2F_average_alpha_soil,
  G2f_2019_weather_data,
  by = "location"
)


# Combine the averaged pathway abundances with the environmental data,
# keeping only locations present in both tables.
average_pathway_location_metadata <- dplyr::inner_join(
  average_pathway_location,
  G2F_average_alpha_soil_weather,
  by = "location"
)
average_pathway_location_metadata
# Remove the two soil texture columns.
average_pathway_location_metadata[["Texture"]] <- NULL
average_pathway_location_metadata[["Texture.No"]] <- NULL

colnames(average_pathway_location_metadata)
# Pathway abundance columns (column 1 is the location).
average_pathway_group <- average_pathway_location_metadata[, 2:281]
colnames(average_pathway_group)
# Environmental variable columns, selected by position.
# NOTE(review): these indices are hard-coded -- check them against the
# colnames() output whenever an input table changes.
colnames(average_pathway_location_metadata)
pathway_location_metadata <- average_pathway_location_metadata[
  , c(290, 292:301, 307, 311:313, 317:319)
]
colnames(pathway_location_metadata)

# Both tables are indexed by location.
rownames(average_pathway_group) <- average_pathway_location_metadata$location
rownames(pathway_location_metadata) <- average_pathway_location_metadata$location


library(Maaslin2)


# Run Maaslin2 once per environmental variable (a single fixed effect per
# run) and collect every run's significant_results.tsv into one data frame.
#
# Results are gathered in a preallocated list and bound once at the end.
# This replaces the previous approach of growing a data frame from a
# placeholder all-NA row, which left that NA row in `significant_result` and
# reassigned its column names on every iteration.
# NOTE(review): the output directory name says "no_normalize", but no
# `normalization` argument is passed, so Maaslin2's default applies -- confirm
# this is intended.
environmental_names <- colnames(pathway_location_metadata)
maaslin2_significant_results <- vector("list", length(environmental_names))
names(maaslin2_significant_results) <- environmental_names

for (environmental_name in environmental_names) {
  # output directory for this run
  save_path <- paste0("./lm_ec_pathway_Massalin2_no_normalize_no_transform_", environmental_name, "_test")
  print(save_path)
  # file that Maaslin2 writes its significant associations to
  significant_result_path <- paste0(save_path, "/significant_results.tsv")
  print(significant_result_path)

  fit_data <- Maaslin2(
    input_data = average_pathway_group,
    input_metadata = pathway_location_metadata,
    output = save_path,
    transform = "None",
    fixed_effects = c(environmental_name),
    min_prevalence = 0.9,
    max_significance = 0.1,
    standardize = TRUE
  )

  # read in the significant results of this run
  temp_significant_result <- read.table(significant_result_path, sep = "\t", header = TRUE)
  # Force the identifier columns to character so runs with no rows (read in
  # as logical columns) can still be bound to runs that have rows.
  temp_significant_result$metadata <- as.character(temp_significant_result$metadata)
  temp_significant_result$feature <- as.character(temp_significant_result$feature)
  temp_significant_result$value <- as.character(temp_significant_result$value)

  maaslin2_significant_results[[environmental_name]] <- temp_significant_result
}

significant_result <- dplyr::bind_rows(maaslin2_significant_results)

#significant_result$feature <-gsub("\\."," ",significant_result$feature)

# Associations that pass the q-value cut-offs (rows with NA q-values drop out).
significant_result_0.05 <- significant_result[which(significant_result$qval <= 0.05), ]
significant_result_0.01 <- significant_result[which(significant_result$qval <= 0.01), ]

#write.table(significant_result_0.05,"/home/hl46161/new_G2F_dada2/massalin2/combined_significant_result_rerun.tsv",row.names = F,sep="\t")

# Re-read the raw table to get the pathway ID / description lookup.
pathway_description <- read.table(
  "./path_abun_unstrat_descrip.tsv",
  sep = "\t",
  header = TRUE,
  check.names = TRUE
)
pathway_description <- pathway_description[, c("pathway", "description")]


# Annotate the q <= 0.05 results with pathway IDs, matching the Maaslin2
# feature name against the pathway description.
# NOTE(review): if Maaslin2 rewrites special characters in feature names, some
# descriptions will not match and `pathway` will be NA -- verify the output.
significant_result_0.05_annotation <- dplyr::left_join(
  significant_result_0.05,
  pathway_description,
  by = c("feature" = "description")
)
# Keep only the relative humidity and potassium associations.
significant_result_0.05_annotation <- significant_result_0.05_annotation[
  significant_result_0.05_annotation$value %in% c("Relative.Humidity....", "Potassium.ppm.K"),
]

colnames(significant_result_0.05_annotation)

# Add the direction of each association (a coefficient of exactly zero counts
# as negative) and a readable name for the environmental factor.
significant_result_0.05_annotation <- significant_result_0.05_annotation %>%
  mutate(
    association = if_else(coef <= 0, "negative", "positive"),
    Factor = case_when(
      metadata == "Potassium.ppm.K" ~ "Potassium",
      metadata == "Relative.Humidity...." ~ "Relative Humidity"
    )
  )


# Keep the reporting columns only, print, and write the table out.
significant_result_0.05_annotation <- significant_result_0.05_annotation[
  , c("Factor", "feature", "association", "pathway")
]
significant_result_0.05_annotation

write.table(
  significant_result_0.05_annotation,
  "./combined_significant_result_rerun_final.tsv",
  row.names = FALSE,
  sep = "\t"
)


# Rename the columns after writing; later joins use `Pathway_ID` as the key.
colnames(significant_result_0.05_annotation) <- c("Factor", "Pathway", "association", "Pathway_ID")

#######################################################

# Add the general (parent) category information for each pathway.
# NOTE(review): this is an absolute path into one user's Downloads directory,
# unlike the other inputs, which are relative to the working directory --
# confirm where this file lives before rerunning.
parent_pathway <- read.delim("/home/hl46161/Downloads/G2F_2019_pathway_rerun_sig_pathway(1).txt")

#parent_pathway_name <-  separate(x, c("A", "B"))(parent_pathway$Ontology...all.ancestors.of.entity," // ")

# Attach the parent pathway columns by pathway ID and write the final table.
significant_result_0.05_annotation_parent_pathway <- significant_result_0.05_annotation %>%
  dplyr::left_join(parent_pathway, by = c("Pathway_ID" = "pathway_id"))

write.table(
  significant_result_0.05_annotation_parent_pathway,
  "./combined_significant_result_rerun_final_parent_pathway.tsv",
  row.names = FALSE,
  sep = "\t"
)