forked from makcyun/eastmoney_spider
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy patheastmoney_crawler2.py
216 lines (184 loc) · 6.92 KB
/
eastmoney_crawler2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
"""
e.g: http://data.eastmoney.com/bbsj/201806/lrb.html
"""
import requests
import re
from multiprocessing import Pool
import json
import csv
import pandas as pd
import os
import time
# 设置文件保存在D盘eastmoney文件夹下
file_path = 'D:\\eastmoney'
if not os.path.exists(file_path):
os.mkdir(file_path)
os.chdir(file_path)
# 1 设置表格爬取时期
def set_table():
print('*' * 80)
print('\t\t\t\t东方财富网报表下载')
print('作者:高级农民工 2018.10.10')
print('--------------')
# 1 设置财务报表获取时期
year = int(float(input('请输入要查询的年份(四位数2007-2018):\n')))
# int表示取整,里面加float是因为输入的是str,直接int会报错,float则不会
# https://stackoverflow.com/questions/1841565/valueerror-invalid-literal-for-int-with-base-10
while (year < 2007 or year > 2018):
year = int(float(input('年份数值输入错误,请重新输入:\n')))
quarter = int(float(input('请输入小写数字季度(1:1季报,2-年中报,3:3季报,4-年报):\n')))
while (quarter < 1 or quarter > 4):
quarter = int(float(input('季度数值输入错误,请重新输入:\n')))
# 转换为所需的quarter 两种方法,2表示两位数,0表示不满2位用0补充,
# http://www.runoob.com/python/att-string-format.html
quarter = '{:02d}'.format(quarter * 3)
# quarter = '%02d' %(int(month)*3)
# 确定季度所对应的最后一天是30还是31号
if (quarter == '06') or (quarter == '09'):
day = 30
else:
day = 31
date = '{}-{}-{}' .format(year, quarter, day)
# print('date:', date) # 测试日期 ok
# 2 设置财务报表种类
tables = int(
input('请输入查询的报表种类对应的数字(1-业绩报表;2-业绩快报表:3-业绩预告表;4-预约披露时间表;5-资产负债表;6-利润表;7-现金流量表): \n'))
dict_tables = {1: '业绩报表', 2: '业绩快报表', 3: '业绩预告表',
4: '预约披露时间表', 5: '资产负债表', 6: '利润表', 7: '现金流量表'}
dict = {1: 'YJBB', 2: 'YJKB', 3: 'YJYG',
4: 'YYPL', 5: 'ZCFZB', 6: 'LRB', 7: 'XJLLB'}
category = dict[tables]
# js请求参数里的type,第1-4个表的前缀是'YJBB20_',后3个表是'CWBB_'
# 设置set_table()中的type、st、sr、filter参数
if tables == 1:
category_type = 'YJBB20_'
st = 'latestnoticedate'
sr = -1
filter = "(securitytypecode in ('058001001','058001002'))(reportdate=^%s^)" %(date)
elif tables == 2:
category_type = 'YJBB20_'
st = 'ldate'
sr = -1
filter = "(securitytypecode in ('058001001','058001002'))(rdate=^%s^)" %(date)
elif tables == 3:
category_type = 'YJBB20_'
st = 'ndate'
sr = -1
filter=" (IsLatest='T')(enddate=^2018-06-30^)"
elif tables == 4:
category_type = 'YJBB20_'
st = 'frdate'
sr = 1
filter = "(securitytypecode ='058001001')(reportdate=^%s^)" %(date)
else:
category_type = 'CWBB_'
st = 'noticedate'
sr = -1
filter = '(reportdate=^%s^)' % (date)
category_type = category_type + category
# print(category_type)
# 设置set_table()中的filter参数
yield{
'date':date,
'category':dict_tables[tables],
'category_type':category_type,
'st':st,
'sr':sr,
'filter':filter
}
# 2 设置表格爬取起始页数
def page_choose(page_all):
# 选择爬取页数范围
start_page = int(input('请输入下载起始页数:\n'))
nums = input('请输入要下载的页数,(若需下载全部则按回车):\n')
print('*' * 80)
# 判断输入的是数值还是回车空格
if nums.isdigit():
end_page = start_page + int(nums)
elif nums == '':
end_page = int(page_all.group(1))
else:
print('页数输入错误')
# 返回所需的起始页数,供后续程序调用
yield{
'start_page': start_page,
'end_page': end_page
}
# 3 表格正式爬取
def get_table(date, category_type,st,sr,filter,page):
# 参数设置
params = {
# 'type': 'CWBB_LRB',
'type': category_type, # 表格类型
'token': '70f12f2f4f091e459a279469fe49eca5',
'st': st,
'sr': sr,
'p': page,
'ps': 50, # 每页显示多少条信息
'js': 'var LFtlXDqn={pages:(tp),data: (x)}',
'filter': filter,
# 'rt': 51294261 可不用
}
url = 'http://dcfm.eastmoney.com/em_mutisvcexpandinterface/api/js/get?'
# print(url)
response = requests.get(url, params=params).text
# print(response)
# 确定页数
pat = re.compile('var.*?{pages:(\d+),data:.*?')
page_all = re.search(pat, response)
print(page_all.group(1)) # ok
# 提取{},json.loads出错
# pattern = re.compile('var.*?data: \[(.*)]}', re.S)
# 提取出list,可以使用json.dumps和json.loads
pattern = re.compile('var.*?data: (.*)}', re.S)
items = re.search(pattern, response)
# 等价于
# items = re.findall(pattern,response)
# print(items[0])
data = items.group(1)
data = json.loads(data)
# data = json.dumps(data,ensure_ascii=False)
return page_all, data,page
# 写入表头
# 方法1 借助csv包,最常用
def write_header(data,category):
with open('{}.csv' .format(category), 'a', encoding='utf_8_sig', newline='') as f:
headers = list(data[0].keys())
# print(headers) # 测试 ok
writer = csv.writer(f)
writer.writerow(headers)
def write_table(data,page,category):
print('\n正在下载第 %s 页表格' % page)
# 写入文件方法1
for d in data:
with open('{}.csv' .format(category), 'a', encoding='utf_8_sig', newline='') as f:
w = csv.writer(f)
w.writerow(d.values())
def main(date, category_type,st,sr,filter,page):
func = get_table(date, category_type,st,sr,filter,page)
data = func[1]
page = func[2]
write_table(data,page,category)
if __name__ == '__main__':
# 获取总页数,确定起始爬取页数
for i in set_table():
date = i.get('date')
category = i.get('category')
category_type = i.get('category_type')
st = i.get('st')
sr = i.get('sr')
filter = i.get('filter')
constant = get_table(date,category_type,st,sr,filter, 1)
page_all = constant[0]
for i in page_choose(page_all):
start_page = i.get('start_page')
end_page = i.get('end_page')
# 写入表头
write_header(constant[1],category)
start_time = time.time() # 下载开始时间
# 爬取表格主程序
for page in range(start_page, end_page):
main(date,category_type,st,sr,filter, page)
end_time = time.time() - start_time # 结束时间
print('下载完成')
print('下载用时: {:.1f} s' .format(end_time))