-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcenagro-code-final.R
135 lines (95 loc) · 3.62 KB
/
cenagro-code-final.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
##################################
# IV CENSO NACIONAL AGROPECUARIO 2012 - LECTURA DE DATOS
# INPUT : CENAGRO database
# OUTPUT : Clustering and plots
# @MarvinQuispeSedano
##################################
# Librerias
pkgs = c("data.table", "cluster", "dplyr", "ggplot2", "Rtsne")
# install.packages(pkgs)
lapply(pkgs, library, character.only = TRUE)
##################################
# Configurar el directorio de trabajo
setwd("C:/Users/Asus/Documents/R/lateblight_tests/cenagro")
wd_data <- list.files("C:/Users/Asus/Documents/R/lateblight_tests/cenagro/combined/",
pattern=".rds", full=TRUE)
# Importar la primera base de datos
df1 <- readRDS(wd_data[1])
setDT(df1)
# Importar la segunda base de datos
df2 <- readRDS(wd_data[3])
setDT(df2)
# Importar un .csv con el codigo de los cultivos
cc <- read.csv("code.csv",sep = ";", header = T)
# Hacer un merge entre las bases de datos 1 y 2 para las variables de interes
df_merge = merge(df1[,c(2:7,25:49,53:54)],
df2[,c(2:7,10:17)],
by=c("P001","P002","P003","P007X","P008", "NPRIN"))
# Eliminar archivos para liberar la memoria
rm(df1);rm(df2)
# Obtener los datos segun nuestros requerimientos
df_crop <- df_merge[df_merge$P024_03 == "2612"]
df_crop <- merge(df_crop, cc, by.x="P024_03", by.y="code")
rm(df_merge)
###############################
# Seleccionar las columnas que dispongan datos
colSums(is.na(df_crop))
df_crop2 <- df_crop[, !c("P027","P029_01", "P029_02", "P029_03")]
df_crop3 <- na.omit(df_crop2)
# Obtener una muestra de 10k datos y verificar significancia estadistica
sample_size = 10000
set.seed(1)
idxs = sample(1:nrow(df_crop3),sample_size,replace=F)
df_crop3 <- as.data.frame(df_crop3)
df_crop4 <- as.data.frame(df_crop3[idxs,])
pvalues = list()
for (col in names(df_crop3)) {
if (class(df_crop3[,col]) %in% c("numeric","integer")) {
# Numeric variable. Using Kolmogorov-Smirnov test
pvalues[[col]] = ks.test(df_crop4[[col]],df_crop3[[col]])$p.value
} else {
# Categorical variable. Using Pearson's Chi-square test
probs = table(df_crop3[[col]])/nrow(df_crop3)
pvalues[[col]] = chisq.test(table(df_crop4[[col]]),p=probs)$p.value
}
}
pvalues
# Obtenemos la distancia de gower
gower_dist <- daisy(df_crop4[,c(1:32,35:38)], metric = "gower")
# Buscamos de 2 a 8 clusters
sil_width <- c(NA)
for(i in 2:8){
pam_fit <- pam(gower_dist, diss = TRUE, k = i)
sil_width[i] <- pam_fit$silinfo$avg.width
}
plot(1:8, sil_width,
xlab = "Number of clusters",
ylab = "Silhouette Width")
lines(1:8, sil_width)
# Obtenemos resultados para 5 clusters
k <- 5
pam_fit <- pam(gower_dist, diss = TRUE, k)
pam_results <- df_crop4 %>%
mutate(cluster = pam_fit$clustering) %>%
group_by(cluster) %>%
do(the_summary = summary(.))
pam_results$the_summary
# Obtener SNE - BARNES
tsne_obj <- Rtsne(gower_dist, is_distance = TRUE)
tsne_data <- tsne_obj$Y %>%
data.frame() %>%
setNames(c("X", "Y")) %>%
mutate(cluster = factor(pam_fit$clustering))
ggplot(aes(x = X, y = Y), data = tsne_data) +
geom_point(aes(color = cluster))
#########################################################
# Hierarchical Clustering
hc <- hclust(gower_dist, "ward.D2")
plot(hc)
rect.hclust(hc , k = 5, border = 2:6)
abline(h = 5, col = 'red')
cut_avg <- cutree(hc, k = 5)
df_crop_cluster <- mutate(df_crop4, cluster = cut_avg)
ggplot(df_crop_cluster, aes(x=LONG_DECI, y = LAT_DECI, color = factor(cluster))) +
geom_point()
mod_plot <- ctree(cluster ~ .,data = df_crop_cluster)