#!/usr/bin/env python
# -*- coding: utf-8 -*-
# crawler-dicionarioinformal v1.0
# Author: mnemonic
# date DDMMYY: 12/10/2020
# date 06/03/2023: v1.2 - updated to work with python3
# based on cewl from Sam ([email protected]) - http://0xdeadcode.se
# dependencies: python3, python-lxml
import threading, queue, urllib.request, urllib.error, urllib.parse, io, re, sys, os, optparse, inspect, signal
from string import ascii_lowercase
from lxml import html
import importlib
importlib.reload(sys)  # leftover from the python2 setdefaultencoding hack; effectively a no-op under python3
# sys.setdefaultencoding("latin-1")

firstlayerqueue = queue.Queue()  # result-page URLs still waiting to be crawled
wordqueue = queue.Queue()        # lists of words scraped from each page
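# Worker thread: takes one result-page URL from firstlayerqueue, fetches it
# (retrying on read errors), extracts the dictionary entries and pushes the
# resulting word list onto wordqueue.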
class Crawl(threading.Thread):
    def __init__(self, firstlayerqueue, wordqueue):
        threading.Thread.__init__(self)
        self.firstlayerqueue = firstlayerqueue
        self.wordqueue = wordqueue

    def run(self):
        self.url = self.firstlayerqueue.get()
        self.success = False
        self.url = self.url + '/'
        # print('IN THREAD: ' + self.url)
        while not self.success:
            try:
                self.req = urllib.request.Request(self.url, headers={'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0"})  # :)
                self.con = urllib.request.urlopen(self.req)
                self.data = self.con.read()
                print('THREAD: ' + self.url + ' ( Ok )')
                self.success = True
            except Exception:
                print('THREAD: ' + self.url + ' ( read error ) - retrying...')
                self.url = self.url.lower()
                self.success = False
        self.data = self.getContent(self.data)
        self.wordqueue.put(self.data)
        self.firstlayerqueue.task_done()

    def getContent(self, data):
        # dictionary entries appear as <a class="popup-ajax"> links;
        # return their de-duplicated link texts
        self.content = []
        self.data = data
        self.skip = True
        self.tree = html.fromstring(self.data)
        self.content = self.tree.xpath('//a[@class="popup-ajax"]/text()')
        # print(self.content)
        return list(set(self.content))
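# Drain wordqueue and append every collected word, lowercased, to the output file.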
def writeWords():
    global outputfile, words, wordqueue
    f = open(outputfile, 'a')
    while not wordqueue.empty():
        data = wordqueue.get()
        for line in data:
            try:
                #line_encoded = line.encode('ISO-8859-1')
                #line_encoded = line.encode('UTF-8') # might want to uncomment $
                f.write(line.lower() + '\n')
                words += 1
            except Exception:
                continue
    f.close()
##################
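# SIGINT handler: on ctrl+c, flush whatever words have been collected so far
# to the output file before exiting.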
def handler(signum, frame):  # http://stackoverflow.com/questions/1112343/how-do-i-capture-sigint-in-python
    global words, outputfile
    if not wordqueue.empty():
        print('\nHold on cowboy, let me finish the running threads and dump the words into %s' % outputfile)
        writeWords()
        print('Done. Wrote %i words into %s' % (words, outputfile))
    quit()

signal.signal(signal.SIGINT, handler)
###################
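# Command-line interface: -t sets the number of crawler threads, -o the
# wordlist output file; the start URL is fixed to dicionarioinformal.com.br.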
filename = os.path.split(inspect.getfile(inspect.currentframe()))
parser = optparse.OptionParser(filename[1] + ' <args>\n\n' +
                               'Dicionario Informal Wordlist Generator by mnemonic\n\n' +
                               'Example: python ' + filename[1] + ' -o wordlist.txt -t 5\n\n' +
                               'ctrl+c to break\n\nI suggest doing something like this to clean the wordlist from duplicates:' +
                               ' sort -u wordlist.txt >> n_wordlist.txt')
parser.add_option('-t', dest='nrthreads', type='int', help='Amount of threads')
parser.add_option('-o', dest='outputfile', type='string', help='File to write output to')
(options, args) = parser.parse_args()
nrthreads = options.nrthreads
starturl = 'http://www.dicionarioinformal.com.br/letra/'
outputfile = options.outputfile
if outputfile is None or nrthreads is None:
    parser.print_help()
    quit(0)
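# For every letter A-Z, read the index page once to learn how many result
# pages exist, then queue one URL per page for the crawler threads.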
words = 0
for char in ascii_lowercase:
    char = char.upper()
    url = starturl + char + '/'
    req = urllib.request.Request(url, headers={'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0"})  # :)
    success = False
    while not success:
        try:
            print('Reading number of pages on ' + starturl + char, end='')
            con = urllib.request.urlopen(req)
            data = con.read()
            data = data.decode('ISO-8859-1')  # encoding may vary!
            pages = re.search(r'<p>(\d+) páginas - (\d+) Definições</p>', data, re.UNICODE)
            if pages:
                for page in range(1, int(pages.group(1)) + 1):
                    firstlayerqueue.put(starturl + char + '/' + str(page))
                    # print(starturl + char + '/' + str(page))
                print(' ( ' + pages.group(1) + ' )')
            success = True
        except urllib.error.URLError:
            print(' ( read error ) - retrying...')
            success = False
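# Start crawler threads in batches of up to nrthreads until the page queue is
# drained, then dump the collected words and exit.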
while 1:  # generate first crawl content
    thread = Crawl(firstlayerqueue, wordqueue)
    thread.daemon = True
    thread.start()
    if thread.is_alive():
        break

int_count = 0
while words > -1:
    if firstlayerqueue.empty():
        # page queue drained: flush everything collected so far and stop
        writeWords()
        print('\nWrote %i words to %s. Queue empty.' % (words, outputfile))
        words = -1
    if not firstlayerqueue.empty():
        # start up to nrthreads crawlers, one per queued page, then wait for each of them
        alivethread = 0
        threads = []
        for i in range(nrthreads):
            if not firstlayerqueue.empty():
                alivethread += 1
                thread = Crawl(firstlayerqueue, wordqueue)
                thread.daemon = True
                thread.start()
                threads.append(thread)
        for t in threads:
            t.join(5)
        int_count += 1
        if int_count == 2:
            print('Joined %i threads. Queue size: %i' % (alivethread, firstlayerqueue.qsize()))
            int_count = 0
    continue