threadtry.py
import threading
import urllib.request
import urllib.parse
import gzip
import json  # parse the JSON payloads Zhihu returns
import time
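
# Multithreaded scraper for Zhihu follower lists: each worker thread POSTs to
# the ProfileFollowersListV2 endpoint for one (user, offset-range) task read
# from d:\pyRead.txt and writes the fetched fragments to a per-task file.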
#########################################################
def READZHIHU(def_NULL,def_HashID,def_ID,def_Begin,def_End,def_FileName,def_NoName):
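    """Fetch one slice of a Zhihu user's follower list in a worker thread.

    Parameter roles are inferred from how the input file is parsed below:
    def_HashID is the profile's hash_id, def_ID a readable user id,
    def_Begin/def_End the follower offsets to scan, and def_FileName the
    output file; def_NULL and def_NoName appear to be unused placeholders.
    """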
    global ThreadCount
    global ErrorCount
    ThreadCount += 1  # shared counter mutated without a lock; tolerable under CPython's GIL but not strictly thread-safe
    time_run = time.time()  # record the start time so we can enforce a timeout
    Run_Force_END = 300  # hard cap: a thread may run for at most 300 s
    #print("NULL = " + def_NULL)
    #print("def_HashID = " + def_HashID)
    #print("def_Begin = " + def_Begin)
    #print("def_End = " + def_End)
    #print("def_FileName = " + def_FileName)
    #print("def_NoName = " + def_NoName)
    _xsrf = '70d29039e3e8481387e68ca8b5d24c55'  # Huang Zhonghua's _xsrf token, copied from his browser session
    hashid = def_HashID
    save = ''  # accumulates everything this thread fetches
    PostHeader = {
        'Host': 'www.zhihu.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0',
        'Accept': '*/*',
        'Accept-Language': 'zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3',
        'Accept-Encoding': 'gzip, deflate',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'X-Requested-With': 'XMLHttpRequest',
        'Cookie': 'q_c1=084fdb2c2b2043cd927c0d05e53c30f0|1382849503000|1382849503000; __utma=51854390.1660221720.1382715035.1383140776.1383148468.21; __utmz=51854390.1383148468.21.8.utmcsr=zhihu.com|utmccn=(referral)|utmcmd=referral|utmcct=/people/gayscript; zata=zhihu.com.084fdb2c2b2043cd927c0d05e53c30f0.82107; _ga=GA1.2.1660221720.1382715035; __utmv=51854390.100-1|2=registration_date=20130127=1^3=entry_date=20130127=1; q_c0="NTc1Mjk3OTkxMmM1NzU1N2MzZGQ5ZTMzMzRmNWVlMDR8MW9xU3hPdDF4U29BQlc4Qg==|1382855898|fa049faa61650e88122b85ede1872ddc5492ec3d"; _xsrf=70d29039e3e8481387e68ca8b5d24c55; __utmc=51854390; zatb=zhihu.com; __utmb=51854390.2.10.1383148468',
        'Connection': 'keep-alive',
        'Pragma': 'no-cache',
        'Cache-Control': 'no-cache'
    }
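    # Note: the Cookie and _xsrf values above are tied to one (long-expired)
    # browser session; running this today would require substituting fresh
    # values captured from your own logged-in Zhihu session.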
    k = {'msg': [1, 2, 3, 4, 5, 1, 2, 3, 4, 5]}  # fallback so len(k['msg']) still advances Begin by 10 when a response is not gzip-encoded
    Begin = int(def_Begin)
    End = int(def_End)
    #print("Read " + def_ID + "'s followers, range from " + str(Begin) + " to " + str(End))
    #print("File name is " + str(def_FileName))
    filecontent = ['begin']
    charstring = ''
    try:
        # The elapsed-time check may not cap socket waits exactly, so the
        # 300 s limit is best-effort rather than precise.
        while (Begin < End) and ((time.time() - time_run) < Run_Force_END):
            url = "http://www.zhihu.com/node/ProfileFollowersListV2"
            # POST body: method=next&params={...}&_xsrf=...; quote() escapes
            # '=' and '&' as well, so the replace() calls restore the separators.
            date1 = 'method=next&params={"hash_id":"' + hashid + '","order_by":"created","offset":' + str(Begin) + '}' + '&_xsrf=' + _xsrf
            date1 = urllib.parse.quote(date1).replace("%3D", "=").replace("%26", "&")
            date1 = date1.encode(encoding="utf-8")
            #print("date1 =", date1)
            Post = urllib.request.Request(url, data=date1, headers=PostHeader)
            f = urllib.request.urlopen(Post)
            if f.info().get('Content-Encoding') == 'gzip':
                k = gzip.decompress(f.read())
                k = k.decode('gb18030')
                k = json.loads(k)
                filecontent = k.get('msg')[:]
                charstring += "".join(filecontent)
            else:
                filecontent = ""
            f.close()
            # Each batch holds at least 10 followers, so Begin always advances.
            Begin += len(k.get('msg'))
            #print(str(Begin) + " people have been read")
    except Exception:
        ErrorCount += 1
        print("Error at " + def_ID + " part " + def_FileName + " good luck")
    save = charstring
    #print(def_ID + " has been read")
    # write everything this thread collected, in the same gb18030 encoding
    # used to decode the responses
    with open(def_FileName, 'wb') as file:
        file.write(save.encode('gb18030'))
    if (time.time() - time_run) > Run_Force_END:
        print("**************************************************")  # this thread hit the 300 s cap
    #print(def_FileName + " has been written")
    ThreadCount -= 1
    return
#########################################################
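# Main: build one worker thread per task. Each line of d:\pyRead.txt holds
# '$'-separated records, and each record's '##'-separated fields are passed
# positionally as READZHIHU's arguments.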
ThreadCount = 0
ErrorCount = 0
thread = []
f = open(r"d:\pyRead.txt", "r", encoding='gb18030')
lines = f.readlines(100000)
while lines:
    for line in lines:
        #print(line)
        buffer = line.split("\n")
        line = "".join(buffer) + "##"
        hang = line.split("$")
        for content in hang:
            paramter = content.split("##")
            thread.append(threading.Thread(target=READZHIHU, args=paramter))
    lines = f.readlines(100000)
i = 0
print("Threads created; total thread count is " + str(len(thread)))
for k in thread:
    i += 1
    # Throttle: each Zhihu response packet is ~26 KB at a full 20 people per
    # packet, yet multithreaded downloads averaged only ~110 KB/s, i.e. about
    # six concurrent reads (~120 records/s). Because duplication in the data
    # is very high (300 MB of raw content compresses to under 15 MB), it pays
    # to open extra threads even if some fail; the goal is to sweep the
    # server's content once, not to download every record intact. In practice
    # it turned out to depend only on network speed: after switching to a
    # wired connection it flew. On a 10 Mbps link you can open ~2000 threads
    # (with assorted timeouts); on Wi-Fi, keep it to about 100 :)
    if ThreadCount < 200:
        k.start()
    else:
        while ThreadCount >= 200:
            time.sleep(5)
        k.start()
    if i % 100 == 0:
        print(str(i) + " threads have been started")
print("All Error is"+str(ErrorCount))
while ThreadCount>1 :
print("Process nearly end,Now there are "+str(ThreadCount)+"thread running ")
time.sleep(5)
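
# A minimal alternative sketch (an assumption, not part of the original run):
# the same <=200-thread throttle could be expressed with the standard
# library's concurrent.futures, avoiding the busy-wait loop and the unlocked
# ThreadCount counter:
#
#   from concurrent.futures import ThreadPoolExecutor
#   with ThreadPoolExecutor(max_workers=200) as pool:
#       for paramter in tasks:  # 'tasks' (hypothetical) = the parsed records
#           pool.submit(READZHIHU, *paramter)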