Skip to content
This repository has been archived by the owner on Dec 17, 2018. It is now read-only.

Commit

Permalink
features: change charts
Browse files Browse the repository at this point in the history
  • Loading branch information
will4906 committed Mar 7, 2018
1 parent 501afb1 commit 2f1f0ec
Show file tree
Hide file tree
Showing 16 changed files with 100 additions and 206 deletions.
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,9 @@

### Data Visualization

![地图](images/demo1.png)
![地图](images/demo2.png)

![柱状图](images/demo1.png)

### License

Expand Down
Binary file added captcha.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
8 changes: 2 additions & 6 deletions config/base_settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,14 +27,10 @@
# Path of the database file
DATABASE_NAME = os.path.join(OUTPUT_GROUP_PATH, 'Patent.db')
# Path of the generated Excel file
EXCEL_NAME = os.path.join(OUTPUT_GROUP_PATH, '专利.xlsx')
# Path of the generated diagram file
DIAGRAM_NAME = os.path.join(OUTPUT_GROUP_PATH, 'diagram.html')
# Path of the generated charts file
CHARTS_NAME = os.path.join(OUTPUT_GROUP_PATH, 'charts.html')
# Log file name
LOG_FILENAME = os.path.join(OUTPUT_GROUP_PATH, "PatentCrawler.log")
# Template directory; changing it is not recommended
TEMPLATE_PATH = os.path.join(BASE_PATH, 'res', 'template')
# Template file path; it may be extended or changed later, changing it is not recommended
TEMPLATE_NAME = os.path.join(TEMPLATE_PATH, 'template.html')
# Path of the captcha model
CAPTCHA_MODEL_NAME = os.path.join(BASE_PATH, 'res', 'captcha', 'sipoknn.job')

Expand Down
2 changes: 1 addition & 1 deletion config/config.ini
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ password=
[proxy]
;程序的代理决定使用https://github.com/jhao104/proxy_pool的代理池作为代理方式,若使用者有更好的方式请自行修改proxy_url

;是否使用代理
;是否使用代理[False, True]
use_proxy=True
;代理url,若use_proxy为False则忽略此项
proxy_url=http://127.0.0.1:5010/get
Expand Down
1 change: 1 addition & 0 deletions config/query_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from entity.query_item import SipoItem, DateSelect, And, ItemGroup, Or, Not

# List of SipoItem query definitions; each item is built from keyword filters.
# NOTE(review): presumably every entry is run as one search by the crawler -
# confirm against the code that consumes QUERY_LIST.
QUERY_LIST = [
# SipoItem(proposer='深圳大学', inventor='陈思平'),
SipoItem(abstract='人工智能'),
]

Expand Down
4 changes: 2 additions & 2 deletions crawler/spiders/patent.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from controller.url_config import *
from crawler.items import SipoCrawlerItem
from service.item_collection import resolve_data
from visual.map_charts import ChinaMap
from visual import create_charts

logger = Logger(__name__)

Expand Down Expand Up @@ -159,5 +159,5 @@ def parse_related_info(self, response):

# NOTE(review): presumably Scrapy's spider-closed callback - confirm against the
# enclosing spider class; `reason` is passed to the logger at the end.
def closed(self, reason):
# Chart generation is gated on the database file existing and on both
# 'data' and 'chart' being listed in OUTPUT_ITEMS.
if os.path.exists(DATABASE_NAME) and 'data' in OUTPUT_ITEMS and 'chart' in OUTPUT_ITEMS:
# NOTE(review): this view lists two chart calls; ChinaMap().create() looks like
# the removed line and create_charts() its replacement - confirm against the
# committed file.
ChinaMap().create()
create_charts()
logger.info(reason)
Binary file modified images/demo1.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added images/demo2.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
6 changes: 1 addition & 5 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
@author: will4906
"""
import configparser
import shutil
import os
import sys

Expand All @@ -15,7 +14,6 @@
from config import account_settings as account
from config import base_settings as base
from config.base_settings import *
# from visual.map_charts import ChinaMap
from entity.models import Patents
from service.log import init_log

Expand All @@ -28,15 +26,13 @@ def init_config():
base.check_proxy(cfg)
base.check_request(cfg)
base.check_output(cfg)
# print(base.TIMEOUT)


# Prepare the output locations: create OUTPUT_PATH and OUTPUT_GROUP_PATH when
# they do not exist yet.
def init_base_path():
if os.path.exists(OUTPUT_PATH) is False:
os.mkdir(OUTPUT_PATH)
if os.path.exists(OUTPUT_GROUP_PATH) is False:
os.mkdir(OUTPUT_GROUP_PATH)
# Copies the template file to DIAGRAM_NAME.
# NOTE(review): indentation is lost in this view, so it is unclear whether the
# copy runs only when the group directory was just created - confirm.
shutil.copy(TEMPLATE_NAME, DIAGRAM_NAME)


def init_data_base():
Expand All @@ -59,7 +55,7 @@ def init_data_base():
init_config()
init_base_path()
init_data_base()
# print(base.OUTPUT_ITEMS)

if 'log' in base.OUTPUT_ITEMS:
cmdline.execute(("scrapy crawl Patent -s LOG_FILE=" + LOG_FILENAME).split())
else:
Expand Down
8 changes: 8 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,17 @@ click==6.7
constantly==15.1.0
cryptography==2.1.4
cssselect==1.0.3
echarts-countries-pypkg==0.1.3
future==0.16.0
hyperlink==17.3.1
idna==2.6
incremental==17.5.0
Jinja2==2.10
jupyter-echarts-pypkg==0.0.11
lml==0.0.2
Logbook==1.2.1
lxml==4.1.1
MarkupSafe==1.0
numpy==1.14.0
parsel==1.3.1
peewee==3.0.11
Expand All @@ -23,6 +29,8 @@ pyasn1==0.4.2
pyasn1-modules==0.2.1
pycparser==2.18
PyDispatcher==2.0.5
pyecharts==0.3.3
pyecharts-jupyter-installer==0.0.3
pyOpenSSL==17.5.0
pypiwin32==220
queuelib==1.4.2
Expand Down
35 changes: 0 additions & 35 deletions res/echarts/echarts-all.js

This file was deleted.

20 changes: 0 additions & 20 deletions res/echarts/echarts.js

This file was deleted.

71 changes: 0 additions & 71 deletions res/template/template.html

This file was deleted.

4 changes: 2 additions & 2 deletions service/proxy.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

from logbook import Logger
from requests import ConnectTimeout, ReadTimeout
from requests.exceptions import ProxyError
from requests.exceptions import RequestException

import controller as ctrl
import requests
Expand Down Expand Up @@ -91,7 +91,7 @@ def wrapper(*args, **kwargs):
try:
resp = func(*args, **kwargs)
return resp
except (ReadTimeout, ConnectTimeout, ConnectionError, ProxyError):
except RequestException:
update_proxy()
raise Exception('函数重试5次,仍无法成功')
return wrapper
Expand Down
84 changes: 80 additions & 4 deletions visual/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,85 @@
Created on 2017/3/19
@author: will4906
绘图模块
"""
import os

from datetime import datetime
from logbook import Logger
from pyecharts import Map, Page, Bar

from config.base_settings import CHARTS_NAME
from entity.models import Patents


logger = Logger(__name__)


def create_charts():
    """Build every chart, collect them on one page and render it to CHARTS_NAME.

    The province map is added first and the yearly bar chart second; a log
    message is written once the page has been rendered.
    """
    charts_page = Page()
    # Builders are called in display order, one at a time.
    for build_chart in (create_map, create_year_bar):
        charts_page.add(build_chart())
    charts_page.render(CHARTS_NAME)
    logger.info("图表绘制完成")


def create_map():
    """Build the map chart of patent counts per Chinese province.

    For every province name, counts the ``Patents`` rows whose
    ``proposer_address`` matches ``%<province>%`` and plots the counts on a
    China map whose visual range spans 0 to the largest count.

    :return: the configured ``Map`` chart (not rendered yet).
    """
    province_list = [
        '北京', '天津', '上海', '重庆', '河北', '河南', '云南', '辽宁', '黑龙江', '湖南', '安徽', '山东',
        '新疆', '江苏', '浙江', '江西', '湖北', '广西', '甘肃', '山西', '内蒙古', '陕西', '吉林', '福建',
        '贵州', '广东', '青海', '西藏', '四川', '宁夏', '海南', '台湾', '香港', '澳门',
    ]
    # One COUNT query per province; the address only has to contain the name.
    value_list = [
        Patents.select().where(Patents.proposer_address ** ('%' + province + '%')).count()
        for province in province_list
    ]
    # Counts are never negative, so this equals the running maximum seeded with 0.
    max_value = max(value_list, default=0)

    # Named china_map so the builtin ``map`` is not shadowed.
    china_map = Map("专利省份分布地图", width=1200, height=600)
    china_map.add(
        '', province_list, value_list,
        maptype='china', is_visualmap=True, is_label_show=True,
        visual_text_color='#000', visual_range=[0, max_value],
    )
    return china_map


def _count_patents_by_year(field_name):
    """Count patents per calendar year of one date column.

    :param field_name: name of a ``Patents`` field whose values are parsed
        with the ``'%Y.%m.%d'`` format, e.g. ``'request_date'``.
    :return: ``Counter`` mapping the year (int) to the number of patents.
    :raises ValueError: if a non-empty value does not match ``'%Y.%m.%d'``.
    """
    from collections import Counter

    year_counts = Counter()
    for row in Patents.select(getattr(Patents, field_name)).dicts():
        raw_date = row.get(field_name)
        # Rows without a date (empty string or NULL) are left out of the count.
        if not raw_date:
            continue
        year_counts[datetime.strptime(raw_date, '%Y.%m.%d').year] += 1
    return year_counts


def create_year_bar():
    """Build the bar chart of patents per year (application vs. publication).

    Both series are plotted against one shared, sorted list of years, with 0
    for years in which a series has no patents. The chart has a single
    category axis and values are matched to it by position, so giving each
    series its own year list would misalign the bars whenever the two sets
    of years differ.

    :return: the configured ``Bar`` chart (not rendered yet).
    """
    request_counts = _count_patents_by_year('request_date')
    publish_counts = _count_patents_by_year('publish_date')

    # Union of all years seen in either column, in ascending order.
    year_list = sorted(set(request_counts) | set(publish_counts))

    bar = Bar("专利年份分布", width=1200, height=600)
    bar.add('申请', year_list, [request_counts.get(year, 0) for year in year_list])
    bar.add('公告', year_list, [publish_counts.get(year, 0) for year in year_list])
    return bar
59 changes: 0 additions & 59 deletions visual/map_charts.py

This file was deleted.

0 comments on commit 2f1f0ec

Please sign in to comment.