-
Notifications
You must be signed in to change notification settings - Fork 27
/
05.04 - Business Problem.R
executable file
·124 lines (100 loc) · 4.15 KB
/
05.04 - Business Problem.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
library(zipcode)
library(ggplot2)
library(dplyr)
library(RColorBrewer)
library(geosphere)
# Session setup ----
# 0 is R's default scientific-notation penalty and the numeric equivalent of
# the `F` the original passed here.
# NOTE(review): if the intent was to suppress scientific notation entirely,
# a large positive value (e.g. 999) is needed instead - confirm.
options(scipen = 0)
# Fix the RNG state so the randomly initialised k-means runs are repeatable.
set.seed(1033)
# setwd("") stops the script with "cannot change working directory", and this
# script only reads from a URL and a package dataset, so the call is kept as
# an optional template instead of being executed.
# setwd("path/to/your/project") # set to a path on your computer

# Download the source records from the New York State open-data portal.
ds <- read.csv(
  file = "https://data.ny.gov/api/views/9a8c-vfzj/rows.csv?accessType=DOWNLOAD"
)
# bring in zip code data to get lat long info
data(zipcode)
zipcode$city2 <- toupper(zipcode$city)
# Normalise the raw zip codes before joining on the lookup table's `zip` key.
ds$zip2 <- clean.zipcodes(ds$Zip.Code)
# Left join: every source record is kept, with NA coordinates when its zip
# code has no match in the lookup table.
ds <- merge(ds, zipcode, by.x = "zip2", by.y = "zip", all.x = TRUE)
# cluster analysis
# Keep only the two coordinate columns, relabelled x (longitude) and
# y (latitude), and drop every record that is missing either one.
data.cl <- setNames(ds[, c("longitude", "latitude")], c("x", "y"))
data.cl <- data.cl[complete.cases(data.cl), ]
# baseline mileage
# Single distribution center used as the k = 1 reference case.
centers <- data.frame(y = 43.0481, x = -76.1474)
# Pair every location (x.x, y.x) with the center's coordinates (x.y, y.y),
# the same column layout the clustered cases get from merge().
miles <- data.cl
miles$x.y <- centers$x
miles$y.y <- centers$y
names(miles)[1:2] <- c("x.x", "y.x")
# distHaversine() is vectorised over the rows of its two (lon, lat) point
# matrices and returns metres, so all distances are computed in one call
# instead of growing a vector inside a row-by-row loop.
# 0.000621371 converts metres to miles; results are rounded to whole miles.
mls <- round(
  distHaversine(
    cbind(miles$x.x, miles$y.x),
    cbind(miles$x.y, miles$y.y)
  ) * 0.000621371,
  0
)
miles$miles <- mls
# Longest single trip and total mileage for the one-center network.
mx.dist1 <- max(miles$miles)
tot.mls1 <- sum(miles$miles, na.rm = TRUE)
# baseline plot
# Every location as a point, with the entries of `centers` overlaid as
# dark-red diamonds (shape 23).
ggplot(data.cl, aes(x = x, y = y)) +
  geom_point(show.legend = FALSE) +
  geom_point(
    data = centers,
    shape = 23,
    fill = "darkred",
    colour = "darkred",
    size = 5
  ) +
  theme_bw() +
  labs(
    x = "longitude",
    y = "latitude",
    title = "Distribution Network - BASELINE"
  )
# kmeans - TWO
# Cluster on the coordinates only, so a `clust` column already present on
# data.cl (e.g. when this section is re-run) is never used as a feature.
store.cl <- kmeans(data.cl[, c("x", "y")], 2)
data.cl$clust <- store.cl$cluster
centers <- data.frame(store.cl$centers)
centers$clust <- seq_len(nrow(centers))
# Attach each location's assigned center: after the merge, x.x / y.x are the
# location and x.y / y.y are its cluster center.
miles <- merge(data.cl, centers, by.x = "clust", by.y = "clust")
# distHaversine() is vectorised over the rows of its two (lon, lat) point
# matrices and returns metres; 0.000621371 converts metres to miles.
mls <- round(
  distHaversine(
    cbind(miles$x.x, miles$y.x),
    cbind(miles$x.y, miles$y.y)
  ) * 0.000621371,
  0
)
miles$miles <- mls
# Longest single trip and total mileage for the two-center network.
mx.dist2 <- max(miles$miles)
tot.mls2 <- sum(miles$miles, na.rm = TRUE)
# kmeans - THREE
# data.cl still carries the `clust` labels written by the previous k-means
# run; cluster on the coordinate columns only so those stale labels do not
# act as a third clustering dimension.
store.cl <- kmeans(data.cl[, c("x", "y")], 3)
data.cl$clust <- store.cl$cluster
centers <- data.frame(store.cl$centers)
centers$clust <- seq_len(nrow(centers))
# Attach each location's assigned center: after the merge, x.x / y.x are the
# location and x.y / y.y are its cluster center.
miles <- merge(data.cl, centers, by.x = "clust", by.y = "clust")
# distHaversine() is vectorised over the rows of its two (lon, lat) point
# matrices and returns metres; 0.000621371 converts metres to miles.
mls <- round(
  distHaversine(
    cbind(miles$x.x, miles$y.x),
    cbind(miles$x.y, miles$y.y)
  ) * 0.000621371,
  0
)
miles$miles <- mls
# Longest single trip and total mileage for the three-center network.
mx.dist3 <- max(miles$miles)
tot.mls3 <- sum(miles$miles, na.rm = TRUE)
# kmeans - FOUR
# data.cl still carries the `clust` labels written by the previous k-means
# run; cluster on the coordinate columns only so those stale labels do not
# act as a third clustering dimension.
store.cl <- kmeans(data.cl[, c("x", "y")], 4)
data.cl$clust <- store.cl$cluster
centers <- data.frame(store.cl$centers)
# Number the centers from the result itself rather than a hard-coded 1:4.
centers$clust <- seq_len(nrow(centers))
# Attach each location's assigned center: after the merge, x.x / y.x are the
# location and x.y / y.y are its cluster center.
miles <- merge(data.cl, centers, by.x = "clust", by.y = "clust")
# distHaversine() is vectorised over the rows of its two (lon, lat) point
# matrices and returns metres; 0.000621371 converts metres to miles.
mls <- round(
  distHaversine(
    cbind(miles$x.x, miles$y.x),
    cbind(miles$x.y, miles$y.y)
  ) * 0.000621371,
  0
)
miles$miles <- mls
# Longest single trip and total mileage for the four-center network.
mx.dist4 <- max(miles$miles)
tot.mls4 <- sum(miles$miles, na.rm = TRUE)
# kmeans - FIVE
# data.cl still carries the `clust` labels written by the previous k-means
# run; cluster on the coordinate columns only so those stale labels do not
# act as a third clustering dimension.
store.cl <- kmeans(data.cl[, c("x", "y")], 5)
data.cl$clust <- store.cl$cluster
centers <- data.frame(store.cl$centers)
# Number the centers from the result itself rather than a hard-coded 1:5.
centers$clust <- seq_len(nrow(centers))
# Attach each location's assigned center: after the merge, x.x / y.x are the
# location and x.y / y.y are its cluster center.
miles <- merge(data.cl, centers, by.x = "clust", by.y = "clust")
# distHaversine() is vectorised over the rows of its two (lon, lat) point
# matrices and returns metres; 0.000621371 converts metres to miles.
mls <- round(
  distHaversine(
    cbind(miles$x.x, miles$y.x),
    cbind(miles$x.y, miles$y.y)
  ) * 0.000621371,
  0
)
miles$miles <- mls
# Longest single trip and total mileage for the five-center network.
mx.dist5 <- max(miles$miles)
tot.mls5 <- sum(miles$miles, na.rm = TRUE)
# plot it
# Locations coloured by the cluster label currently stored on data.cl, with
# the entries of `centers` overlaid as dark-red diamonds (shape 23).
ggplot(data.cl, aes(x = x, y = y, colour = as.character(clust))) +
  geom_point(show.legend = FALSE) +
  geom_point(
    data = centers,
    shape = 23,
    fill = "darkred",
    colour = "darkred",
    size = 5
  ) +
  theme_bw() +
  labs(
    x = "longitude",
    y = "latitude",
    title = "Distribution Network - 5 Points"
  ) +
  scale_color_brewer(palette = "Set1")
# Disabled alternative: a manual palette (it lists only three colours).
##scale_color_manual(values=c("#0c2c84", "#41b6c4", "#7fcdbb"))
# plot the k total mileage
# One row per candidate network size (k = 1 is the single-center baseline);
# totals are rescaled to millions of miles to match the axis label.
choose.k <- data.frame(
  k = 1:5,
  tot = c(tot.mls1, tot.mls2, tot.mls3, tot.mls4, tot.mls5) / 1000000
)
ggplot(choose.k, aes(x = k, y = tot)) +
  geom_bar(stat = "identity", fill = "#045a8d") +
  labs(x = "Number of distribution centers", y = "Total Miles (Millions)")