#!/usr/bin/env python2
# -*- coding: utf-8 -*-
# crawler-dictionary v1.0
# Author: mnemonic
# date: 27/02/2017
# based on cewl from Sam ([email protected]) - http://0xdeadcode.se
# dependencies: python2, python2-lxml
#
# Crawls the a-z word lists on dictionary.com and appends every word it finds,
# lowercased, to the output file.
import threading, Queue, urllib2, StringIO, re, sys, os, optparse, inspect, signal
from string import lowercase
from lxml import html
reload(sys)
sys.setdefaultencoding("latin-1")  # Python 2 hack: change the default codec used for implicit str/unicode conversions

firstlayerqueue = Queue.Queue()
wordqueue = Queue.Queue()

class Crawl(threading.Thread):
    """Worker thread: fetch one list page from the URL queue, extract its words and push them to the word queue."""
    def __init__(self, firstlayerqueue, wordqueue):
        threading.Thread.__init__(self)
        self.firstlayerqueue = firstlayerqueue
        self.wordqueue = wordqueue

    def run(self):
        self.url = self.firstlayerqueue.get()
        self.success = False
        # print 'IN THREAD: ' + self.url
        while not self.success:  # keep retrying until the page is fetched
            try:
                self.req = urllib2.Request(self.url, headers={'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0"})  # :)
                self.con = urllib2.urlopen(self.req)
                self.data = self.con.read()
                print 'THREAD: ' + self.url + ' ( Ok )'
                self.success = True
            except:
                print 'THREAD: ' + self.url + ' ( read error ) - retrying...'
                self.success = False
        self.data = self.getContent(self.data)
        self.wordqueue.put(self.data)
        self.firstlayerqueue.task_done()

    def getContent(self, data):
        # pull the dictionary entries out of the page; they sit in <span class="word"> elements
        self.content = []
        self.data = data
        self.skip = True
        self.tree = html.fromstring(self.data)
        self.content = self.tree.xpath('//span[@class="word"]/text()')
        # print self.content
        return list(set(self.content))

def writeWords():
    # drain the word queue and append every word, lowercased, to the output file
    global outputfile, words, wordqueue
    while 1:
        data = wordqueue.get()
        for line in data:
            try:
                #line_encoded = line.encode('ISO-8859-1')
                line_encoded = line.encode('UTF-8') # might want to uncomment $
            except:
                continue
            f = open(outputfile, 'a')
            f.write(line_encoded.lower() + '\n')
            f.close()
            words += 1
        if wordqueue.empty():
            break

##################
def handler(signum, frame): # http://stackoverflow.com/questions/1112343/how-do-i-capture-sigint-in-python
    # ctrl+c: flush whatever is left in the word queue before exiting
    global words, outputfile
    if not wordqueue.empty():
        print '\nHold on cowboy, let me finish the running threads and dump the words into %s' % outputfile
        writeWords()
    print 'Done. Wrote %i words into %s' % (words, outputfile)
    quit()

signal.signal(signal.SIGINT, handler)
###################
filename = os.path.split(inspect.getfile(inspect.currentframe()))
parser = optparse.OptionParser(filename[1] + ' <args>\n\n' +
                               'Dictionary Wordlist Generator by mnemonic\n\n' +
                               'Example: python ' + filename[1] + ' -o wordlist.txt -t 5\n\n' +
                               'ctrl+c to break\n\nI suggest doing something like this to clean the wordlist from duplicates:' +
                               ' sort -u wordlist.txt >> n_wordlist.txt')
parser.add_option('-t', dest='nrthreads', type='int', help='Amount of threads')
parser.add_option('-o', dest='outputfile', type='string', help='File to write output to')
(options, args) = parser.parse_args()

nrthreads = options.nrthreads
starturl = 'http://www.dictionary.com/list/'
outputfile = options.outputfile

if starturl is None or outputfile is None or nrthreads is None:
    parser.print_help()  # print_help() writes to stdout itself; printing its return value would add a stray "None"
    quit(0)

words = 0
# for every letter a-z, read page 1 of its list to find out how many pages exist,
# then queue every page URL for the crawler threads
for char in lowercase:
    url = starturl + char + '/1'
    req = urllib2.Request(url, headers={'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0"})  # :)
    success = False
    while not success:
        try:
            print 'Reading number of pages on ' + url,
            con = urllib2.urlopen(req)
            data = con.read()
            pages = re.search(ur'/(\d+)">Last ›</a>', data, re.UNICODE)
            if pages:
                for page in xrange(1, int(pages.group(1)) + 1):
                    firstlayerqueue.put(starturl + char + '/' + str(page))
                    # print starturl + char + '/' + str(page)
                print ' ( ' + pages.group(1) + ' )'
            success = True
        except urllib2.URLError:
            print ' ( read error ) - retrying...'
            success = False

while 1: # generate first crawl content
    thread = Crawl(firstlayerqueue, wordqueue)
    thread.daemon = True
    thread.start()
    if thread.isAlive():
        break

int_count = 0
while words > -1:
    if firstlayerqueue.empty():
        # nothing left to crawl: dump the collected words and stop the loop
        writeWords()
        print '\nWrote %i words to %s. Queue empty.' % (words, outputfile)
        words = -1
    if not firstlayerqueue.empty():
        # spawn up to nrthreads crawler threads for the next batch of pages
        alivethread = 0
        for i in range(nrthreads):
            if not firstlayerqueue.empty():
                alivethread += 1
                thread = Crawl(firstlayerqueue, wordqueue)
                thread.daemon = True
                thread.start()
        for i in range(alivethread):
            thread.join(5)  # wait up to 5 s per started thread (note: this joins the most recently started thread)
        int_count += 1
        if int_count == 2:
            print 'Joined %i threads. Queue size: %i' % (alivethread, firstlayerqueue.qsize())
            int_count = 0
    continue
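
# ---------------------------------------------------------------------------
# Usage sketch (comments only, so the script stays executable as-is). This just
# restates the OptionParser help text above; the thread count and file names are
# the example values from that help text, not requirements:
#
#   python2 crawler-dictionary -t 5 -o wordlist.txt
#   sort -u wordlist.txt >> n_wordlist.txt   # remove duplicate words afterwards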