Skip to content

Commit

Permalink
Парсинг патентов с fips.ru
Browse files Browse the repository at this point in the history
  • Loading branch information
iMissile committed Nov 8, 2016
1 parent f1930f1 commit b646c6b
Show file tree
Hide file tree
Showing 3 changed files with 82 additions and 1 deletion.
2 changes: 1 addition & 1 deletion 47 sitewhere/think_tank.R
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ stop()

# --------------------------------------------------------------------------------------------------------
req <- try({
curl_fetch_memory("1https://raw.githubusercontent.com/iot-rus/Moscow-Lab/master/weather.txt")
curl_fetch_memory("https://raw.githubusercontent.com/iot-rus/Moscow-Lab/master/weather.txt")
# status_code == 200
# class(try-error)
})
Expand Down
13 changes: 13 additions & 0 deletions 54 FIPS/54 FIPS.Rproj
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
Version: 1.0

RestoreWorkspace: Default
SaveWorkspace: Default
AlwaysSaveHistory: Default

EnableCodeIndexing: Yes
UseSpacesForTab: Yes
NumSpacesForTab: 2
Encoding: CP1251

RnwWeave: Sweave
LaTeX: pdfLaTeX
68 changes: 68 additions & 0 deletions 54 FIPS/get_all_patents.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
library(ggplot2) #load first! (Wickham)
library(lubridate) #load second!
library(dplyr)
library(tidyr)
library(readr)
library(stringr)
library(jsonlite)
library(magrittr)
library(curl)
library(httr)
library(jsonlite)
library(xml2)
library(rvest)
library(iterators)
library(foreach)
library(doParallel)
library(future)
library(microbenchmark)
library(futile.logger)

# Logging setup: send futile.logger output to the file FIPS.log and set the
# threshold to TRACE before announcing the start of the run.
flog.appender(appender.file("FIPS.log"))
flog.threshold(TRACE)
flog.info("============= Parsing started ===============")


# The "magic" URL for reading one portion (50 items) of the documents with IPC = G05B was taken from the browser debugger and looks like this:
# http://www1.fips.ru/wps/portal/!ut/p/c5/jY7LDoIwFES_hS-4l2dhWYhpC4hgYhA2pCENYngYVBZ-vbByJTqznJyZgRIWD3JuG_lox0F2cIbSqYSgKY-Yjij2LlIvC0Pqu8h2-pIXThUwyi0SI7LkGKCwfMvgzDdQmP_Q-EUUf9D5-nZ7fc03-hM-9goKKMhn55AQD2lsRzpPXJN5NuSTuo_PqVaQ1bK-qFjNqktlo-DWn854JS9KNe0NbQg1dw!!/?beanMethod=doRestoreQuery&queryId=2737608&doSearch=true&pageNumber=2&selectedDBs=RUPATABRU%3BRUPATAP%3BRUPAT_NEW%3BRUPMAB%3BRUPM_NEW%3BIMPIN&fromUserId=514
# where pageNumber varies from 0 to 74 (3743 documents in total).
# The response is JSON whose result -> hitlist field holds the HTML with the list of documents for that portion.
# JSON viewers (presumably used to inspect the response by hand):
# http://www.jsoneditoronline.org/
# http://codebeautify.org/jsonviewer

# The request URL is split around the page number: req_str1 ends with
# "pageNumber=" and req_str2 carries the remaining query parameters.
req_str1 <- "http://www1.fips.ru/wps/portal/!ut/p/c5/jY7LDoIwFES_hS-4l2dhWYhpC4hgYhA2pCENYngYVBZ-vbByJTqznJyZgRIWD3JuG_lox0F2cIbSqYSgKY-Yjij2LlIvC0Pqu8h2-pIXThUwyi0SI7LkGKCwfMvgzDdQmP_Q-EUUf9D5-nZ7fc03-hM-9goKKMhn55AQD2lsRzpPXJN5NuSTuo_PqVaQ1bK-qFjNqktlo-DWn854JS9KNe0NbQg1dw!!/?beanMethod=doRestoreQuery&queryId=2737608&doSearch=true&pageNumber="
req_str2 <- "&selectedDBs=RUPATABRU%3BRUPATAP%3BRUPAT_NEW%3BRUPMAB%3BRUPM_NEW%3BIMPIN&fromUserId=514"

# Download the search-result pages one by one and collect the raw HTML hit
# list of every page into the character vector `all_docs`.
#
# For each page number `n` the request URL is req_str1 + n + req_str2. A page
# whose download fails (curl error or non-200 status) is logged and contributes
# NULL, which `.combine = "c"` drops, so one bad page does not abort the run.
#
# NOTE(review): only page 0 is requested (0:0); widen the range to cover the
# remaining pages once the parsing below is complete.
all_docs <-
  foreach(n = iter(0:0), .packages = "futile.logger", .combine = "c") %do% {
    url <- str_c(req_str1, n, req_str2, collapse = "")
    # silent = TRUE: the failure is reported through the logger below instead
    # of being printed to stderr by try().
    resp <- try(curl_fetch_memory(url), silent = TRUE)

    # inherits() is used because the class vector has a different length for
    # a try-error and for a regular curl response.
    fetch_failed <- inherits(resp, "try-error")
    if (fetch_failed || resp$status_code != 200) {
      # Log a compact description only: the error message for a failed call,
      # the HTTP status otherwise (never the raw response body).
      detail <- if (fetch_failed) {
        conditionMessage(attr(resp, "condition"))
      } else {
        paste0("HTTP status ", resp$status_code)
      }
      flog.error(paste0(
        "Error fetching page ", n,
        ". Class(resp) = ", paste(class(resp), collapse = ", ")
      ))
      flog.error(paste0("resp = ", detail))
      # Signal that the data for this page could not be obtained.
      elem <- NULL
    } else {
      # Process the content of the response.
      flog.info(paste0("Parsing page ", n))
      htext <- fromJSON(rawToChar(resp$content))

      elem <- htext$result$SearchResult$hitList
      if (is.null(elem)) {
        # Unexpected JSON layout: nothing to parse, nothing to return.
        flog.error(paste0(
          "Page ", n, ": result$SearchResult$hitList is missing in the response"
        ))
      } else {
        m <- read_html(elem, options = c("RECOVER"))
        # Next step (work in progress): assemble the pieces into a data.frame.
        # NOTE(review): dvIndex is computed but not used yet; the loop still
        # returns the raw hit-list HTML.
        dvIndex <- html_nodes(m, xpath = "//div[@class='dvIndex']") %>%
          magrittr::extract(-1) %>% # drop the first row, it is the table header
          html_text()
        # iconv(from = "UTF8", to = "windows-1251")
        # browser()
        # m %>% html_nodes('div')
      }
    }

    elem
  }

# Overwrite out.txt with the collected hit lists.
write(all_docs, file = "out.txt", append = FALSE)
flog.info("Output file generated")

0 comments on commit b646c6b

Please sign in to comment.