Skip to content

Commit

Permalink
на 1256 городов -- 15 мин.
Browse files Browse the repository at this point in the history
  • Loading branch information
iMissile committed Aug 23, 2017
1 parent c6a9830 commit b5e0895
Showing 1 changed file with 2 additions and 201 deletions.
203 changes: 2 additions & 201 deletions 78 r_gis_2017/ya_encoding.R
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ flog.info("App started")
# Read the table of cities
cities_df <- read_excel("./data/ãîðîäà.xlsx") %>%
select(city_name=`Ãîðîä (ðóñ)`) %>%
head(5) %>%
# head(5) %>%
# add_row(city_name="Àðçàìàñ", .before=1) %>%
# add_row(city_name="Íåçíàåìî ÷òî", .before=1) %>%
distinct()
Expand Down Expand Up @@ -141,6 +141,7 @@ geo_df <- c(geoGoogle, geoYandex, geoOSM) %>%
toc()

write_csv(geo_df, "cities_geodata.csv")
saveRDS(geo_df, "cities_geodata.rds")

# df <- geo_df %>% reshape2::recast(location_src ~ api + variable)

Expand Down Expand Up @@ -169,203 +170,3 @@ cities_df$city_name %>% processGeoEncoding("geoOSM")
m <- addr %>%
stri_replace_all_regex(pattern=c("(\\s+)", ","), replacement=c("+", ""), vectorize_all=FALSE)
# browser()

# Geocode one free-text location through the Yandex Geocoder 1.x HTTP endpoint.
#
# Args:
#   location: character scalar holding the address / place name to look up.
#   IsAddressFilter: when TRUE (the default) the lookup is only performed if
#     `location` matches at least one of the address-marker patterns listed
#     below; otherwise a placeholder row is returned and no request is sent.
#
# Returns:
#   A one-row tibble with the columns request, AdminAreaName, LocalityName,
#   precision, text, name, pos, lon and lat.
#   * Filter rejected the input: the text columns hold "IsAddress==F" and
#     pos / lon / lat are NA.
#   * A field is absent from the geocoder reply: that column is NA. Previously
#     a NULL field was silently dropped by tibble() and lon / lat became
#     numeric(0), so the result no longer had the shape of the other paths.
#
# Side effects: writes one log line via flog.info() and, unless the filter
# rejects the input, performs an HTTP GET through readLines().
oldgeoYandex <- function(location, IsAddressFilter = TRUE) {
  flog.info(paste0("==> function geoYandex, location = ", location))

  # The function builds exactly one URL, so insist on a single string up front
  # instead of failing later inside `if (grepl(...))`.
  stopifnot(is.character(location), length(location) == 1L)
  loc <- location

  if (IsAddressFilter == TRUE) {
    # Address markers: abbreviations / keywords (including the transliterated
    # "obl", "g", "ul", "pr") and, last, any standalone 1-4 digit number.
    # The non-ASCII patterns are kept byte-for-byte as they appear in the file.
    address_patterns <- c(
      "\\bîêð[óã]*\\b",
      "\\bîáë[àñòü]*\\b",
      "\\bobl\\b",
      "\\bg\\b",
      "\\bã[îðîä]*\\b",
      "\\bêð[àé]*\\b",
      "\\bðåñ[ïóáëèêà]*\\b",
      "\\bóë[èöà]*\\b",
      "\\bul\\b",
      "\\bêîð[ïóñ]*\\b",
      "\\bêðï\\b",
      "\\bä[îì]*\\b",
      "\\bäåð[åâíÿ]*\\b",
      "\\bïð[ðîåçä]*\\b",
      "\\bïåð[åóëîê]*\\b",
      "\\bïð[îñïåêò]*\\b",
      "\\bpr\\b",
      "\\bð-í\\b",
      "\\bðàéîí\\b",
      "\\bñ[åëî]*\\b",
      "\\bï[îñåëîê]*\\b",
      "\\bìêð\\b",
      "\\bø[îññå]*\\b",
      "\\báóë[üâàð]*\\b",
      "\\bá-ð\\b",
      "\\b\\d{1,4}\\b"
    )
    IsAddress <- any(
      vapply(address_patterns, grepl, logical(1), x = loc, USE.NAMES = FALSE)
    )

    if (!IsAddress) {
      # Not recognised as an address: skip the network call entirely.
      return(
        tibble(
          request = loc,
          AdminAreaName = "IsAddress==F",
          LocalityName = "IsAddress==F",
          precision = "IsAddress==F",
          text = "IsAddress==F",
          name = "IsAddress==F",
          pos = NA,
          lon = NA,
          lat = NA
        )
      )
    }
  }

  # Build the request URL: commas are removed and spaces become "+".
  query <- gsub(" ", "+", gsub(",", "", location))
  url_string <- URLencode(
    paste0("http://geocode-maps.yandex.ru/1.x/?geocode=", query)
  )

  xmlText <- paste(readLines(url_string), "\n", collapse = "")
  xml_data <- xmlToList(xmlParse(xmlText, asText = TRUE))

  # `$` on a list picks the first element with that name, so only the first
  # featureMember of the reply is inspected (same as the original code).
  geo_object <- xml_data$GeoObjectCollection$featureMember$GeoObject
  meta <- geo_object$metaDataProperty$GeocoderMetaData
  admin_area <- meta$AddressDetails$Country$AdministrativeArea

  # A missing node comes back as NULL; turn it into NA so the column survives.
  or_na <- function(value) if (is.null(value)) NA_character_ else value

  pos <- or_na(geo_object$Point$pos)
  # `pos` is split on whitespace: first part -> lon, second part -> lat.
  lon <- as.numeric(gsub(pattern = "(.+)\\s+(.+)", replacement = "\\1", x = pos))
  lat <- as.numeric(gsub(pattern = "(.+)\\s+(.+)", replacement = "\\2", x = pos))

  tibble(
    request = loc,
    AdminAreaName = or_na(admin_area$AdministrativeAreaName),
    LocalityName = or_na(admin_area$SubAdministrativeArea$Locality$LocalityName),
    precision = or_na(meta$precision),
    text = or_na(meta$text),
    name = or_na(geo_object$name),
    pos = pos,
    lon = lon,
    lat = lat
  )
}

# stop() raises an error, so a plain source() of this file does not run
# anything below this line.
# NOTE(review): presumably a deliberate guard so that the experiments below are
# only executed by hand - confirm.
stop()
#### Attempt at parallelising ####
# Preparation for the parallel run
gc()
# One worker fewer than the number of detected cores.
nworkers <- detectCores() - 1
registerDoParallel(nworkers)
getDoParWorkers()

# Register a separate logger for each worker
# http://stackoverflow.com/questions/38828344/how-to-log-when-using-foreach-print-or-futile-logger
# loginit() points the futile.logger appender at the given log file.
loginit <- function(logfile) flog.appender(appender.file(logfile))
# NOTE(review): `%do%` evaluates sequentially in the current R session, so this
# loop calls loginit() nworkers times here rather than once on each registered
# worker - confirm whether `%dopar%` was intended.
# `common_log_name` is not defined in the visible part of the file.
foreach(input=rep(common_log_name, nworkers),
.packages='futile.logger') %do% loginit(input)


# mtrace()/mtrace.off() switch tracing of geoYandex on and off around the run
# (presumably from the `debug` package - TODO confirm).
mtrace(geoYandex)
# `Adr` is not defined in the visible part of the file.
z <- Adr
# Time the whole run; each element of `z` is geocoded and its result logged,
# and the per-element results are collected into the list `l`.
# NOTE(review): this loop also uses `%do%`, i.e. it runs sequentially despite
# the doParallel backend registered above.
system.time(
l <- foreach(
x = z,
.combine = list,
.multicombine = TRUE,
.packages = c("rjson", "XML", "tibble")
) %do% {
res <- geoYandex(x, IsAddressFilter = T)
flog.info(paste0("address = ", x, " result = ", capture.output(str(res))))
res
})
mtrace.off()
# Release the parallel backend (switch foreach back to sequential execution)
registerDoSEQ() # http://stackoverflow.com/questions/25097729/un-register-a-doparallel-cluster


# List of test addresses ----------------
# Character vector of sample address strings; many entries are exact duplicates
# or differ only in spacing, punctuation or word order.
# NOTE(review): the non-ASCII text appears mis-decoded (looks like Windows-1251
# Cyrillic read as Latin-1) - confirm the file encoding before editing.
addr <-
c(
"áðÿíñêàÿ îáëàñòü, ã. äÿòüêîâî, óë. ëåíèíà, ä. 166",
"áðÿíñêàÿ îáëàñòü, ã. äÿòüêîâî, óë. ëåíèíà, ä. 166",
"ã.äÿòüêîâî, óë.ëåíèíà, ä.166",
"ã.äÿòüêîâî, óë.ëåíèíà, ä.166, áðÿíñêàÿ îáë.",
"ã.äÿòüêîâî, óë.ëåíèíà, ä.166 2",
"ã. äÿòüêîâî,óë. ëåíèíà, ä.166",
"ã.äÿòüêîâî, óë.ëåíèíà, ä.166",
"ã.äÿòüêîâî, óë.ëåíèíà, ä.166, áðÿíñêàÿ îáë.",
"ã.äÿòüêîâî, óë.ëåíèíà, ä.166 2",
"ã.ëîáíÿ, óë.ãîðüêîãî, ä.104",
"ã.êîòåëüíèêè, ìêð í ñèëèêàò, ä.29",
"ìîñêîâñêàÿ îáëàñòü, ëþáåðåöêèé ðàéîí, ã. êîòåëüíèêè, ìêð. ñèëèêàò, äîì 29",
"ìîñêîâñêàÿ îáëàñòü, ëþáåðåöêèé ðàéîí, ã. êîòåëüíèêè, ìêð. ñèëèêàò, äîì 29",
"ìî, ã.ëîáíÿ, óë.ãîðüêîãî, ä.104",
"ìîñêîâñêàÿ îáëàñòü, ëþáåðåöêèé ðàéîí, ïîñåëîê êîòåëüíèêè, ìêð í ñèëèêàò, ä.29",
"ã.ëîáíÿ, óë.ãîðüêîãî, ä.104",
"ã.ëîáíÿ, óë.ãîðüêîãî, ä.104",
"ã.ëîáíÿ, óë.ãîðüêîãî, ä.104",
"ã.êîòåëüíèêè, ìêð í ñèëèêàò, ä.29",
"ã.ìîñêâà, óë.ñóäîñòðîèòåëüíàÿ, ä.1",
"ã. ìîñêâà, ñóäîñòðîèòåëüíàÿ óëèöà, ä. 1",
"ã. ìîñêâà, óë. ñóäîñòðîèòåëüíàÿ, ä. 1",
"ã.ìîñêâà, óë. ñóäîñòðîèòåëüíàÿ, ä.1",
"ã.ìîñêâà, óë.ñóäîñòðîèòåëüíàÿ, ä.1",
"ã. ìîñêâà, óë. ñóäîñòðîèòåëüíàÿ, ä. 1",
"ã.ìîñêâà, óë.ñóäîñòðîèòåëüíàÿ, ä.1",
"ã. ìîñêâà, ñóäîñòðîèòåëüíàÿ óëèöà, ä. 1",
"ã.ìîñêâà, óë.ñóäîñòðîèòåëüíàÿ, ä.1",
"ìî, äìèòðîâñêèé ðàéîí, ä. ïîäîñèíêè",
"ìî, äìèòðîâñêèé ðàéîí, ä. ïîäîñèíêè",
"ï.ïîäîñèíêè, ã.ï.äìèòðîâ",
"ï.ïîäîñèíêè, ã.ï.äìèòðîâ",
"äìèòðîâñêèé ð í, ï.ïîäîñèíêè",
"äìèòðîâñêèé ð í, ä.ïîäîñèíêè, ã.ï.äìèòðîâ",
"äìèòðîâñêèé ð í, ï.ïîäîñèíêè",
"ìîñêîâñêàÿ îáëàñòü, äìèòðîâñêèé ð í, ï.ïîäîñèíêè",
"ïîäîëüñêèé ð í, ñ.âîðîíîâî, ìàãàçèí"
)

0 comments on commit b5e0895

Please sign in to comment.