#!/usr/bin/env python
# -*- coding: utf-8 -*-
# crawler-dicionarioinformal v1.0
# Author: mnemonic
# date DDMMYY: 12/10/2020
# date 06/03/2023: v1.2 - updated to work with python3
# based on cewl from Sam ([email protected]) - http://0xdeadcode.se
# dependencies: python3, python-lxml
import threading, queue, urllib.request, urllib.error, urllib.parse, io, re, sys, os, optparse, inspect, signal
from string import ascii_lowercase
from lxml import html
import importlib
importlib.reload(sys)  # leftover from the python2 setdefaultencoding hack; effectively a no-op under python3
# sys.setdefaultencoding("latin-1")

firstlayerqueue = queue.Queue()  # result-page URLs still waiting to be crawled
wordqueue = queue.Queue()        # lists of words scraped from each page
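# Worker thread: takes one result-page URL from firstlayerqueue, fetches it
# (retrying on read errors), extracts the dictionary entries and pushes the
# resulting word list onto wordqueue.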
class Crawl(threading.Thread):
    def __init__(self, firstlayerqueue, wordqueue):
        threading.Thread.__init__(self)
        self.firstlayerqueue = firstlayerqueue
        self.wordqueue = wordqueue

    def run(self):
        self.url = self.firstlayerqueue.get()
        self.success = False
        self.url = self.url + '/'
        # print('IN THREAD: ' + self.url)
        while not self.success:
            try:
                self.req = urllib.request.Request(self.url, headers={'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0"})  # :)
                self.con = urllib.request.urlopen(self.req)
                self.data = self.con.read()
                print('THREAD: ' + self.url + ' ( Ok )')
                self.success = True
            except Exception:
                print('THREAD: ' + self.url + ' ( read error ) - retrying...')
                self.url = self.url.lower()
                self.success = False
        self.data = self.getContent(self.data)
        self.wordqueue.put(self.data)
        self.firstlayerqueue.task_done()

    def getContent(self, data):
        # dictionary entries appear as <a class="popup-ajax"> links;
        # return their de-duplicated link texts
        self.content = []
        self.data = data
        self.skip = True
        self.tree = html.fromstring(self.data)
        self.content = self.tree.xpath('//a[@class="popup-ajax"]/text()')
        # print(self.content)
        return list(set(self.content))
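# Drain wordqueue and append every collected word, lowercased, to the output file.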
def writeWords():
    global outputfile, words, wordqueue
    f = open(outputfile, 'a')
    while not wordqueue.empty():
        data = wordqueue.get()
        for line in data:
            try:
                #line_encoded = line.encode('ISO-8859-1')
                #line_encoded = line.encode('UTF-8') # might want to uncomment $
                f.write(line.lower() + '\n')
                words += 1
            except Exception:
                continue
    f.close()
##################
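# SIGINT handler: on ctrl+c, flush whatever words have been collected so far
# to the output file before exiting.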
def handler(signum, frame):  # http://stackoverflow.com/questions/1112343/how-do-i-capture-sigint-in-python
    global words, outputfile
    if not wordqueue.empty():
        print('\nHold on cowboy, let me finish the running threads and dump the words into %s' % outputfile)
        writeWords()
        print('Done. Wrote %i words into %s' % (words, outputfile))
    quit()

signal.signal(signal.SIGINT, handler)
###################
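# Command-line interface: -t sets the number of crawler threads, -o the
# wordlist output file; the start URL is fixed to dicionarioinformal.com.br.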
filename = os.path.split(inspect.getfile(inspect.currentframe()))
parser = optparse.OptionParser(filename[1] + ' <args>\n\n' +
                               'Dicionario Informal Wordlist Generator by mnemonic\n\n' +
                               'Example: python ' + filename[1] + ' -o wordlist.txt -t 5\n\n' +
                               'ctrl+c to break\n\nI suggest doing something like this to clean the wordlist from duplicates:' +
                               ' sort -u wordlist.txt >> n_wordlist.txt')
parser.add_option('-t', dest='nrthreads', type='int', help='Amount of threads')
parser.add_option('-o', dest='outputfile', type='string', help='File to write output to')
(options, args) = parser.parse_args()
nrthreads = options.nrthreads
starturl = 'http://www.dicionarioinformal.com.br/letra/'
outputfile = options.outputfile
if outputfile is None or nrthreads is None:
    parser.print_help()
    quit(0)
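# For every letter A-Z, read the index page once to learn how many result
# pages exist, then queue one URL per page for the crawler threads.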
words = 0
for char in ascii_lowercase:
    char = char.upper()
    url = starturl + char + '/'
    req = urllib.request.Request(url, headers={'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0"})  # :)
    success = False
    while not success:
        try:
            print('Reading number of pages on ' + starturl + char, end='')
            con = urllib.request.urlopen(req)
            data = con.read()
            data = data.decode('ISO-8859-1')  # encoding may vary!
            pages = re.search(r'<p>(\d+) páginas - (\d+) Definições</p>', data, re.UNICODE)
            if pages:
                for page in range(1, int(pages.group(1)) + 1):
                    firstlayerqueue.put(starturl + char + '/' + str(page))
                    # print(starturl + char + '/' + str(page))
                print(' ( ' + pages.group(1) + ' )')
            success = True
        except urllib.error.URLError:
            print(' ( read error ) - retrying...')
            success = False
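# Start crawler threads in batches of up to nrthreads until the page queue is
# drained, then dump the collected words and exit.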
while 1:  # generate first crawl content
    thread = Crawl(firstlayerqueue, wordqueue)
    thread.daemon = True
    thread.start()
    if thread.is_alive():
        break

int_count = 0
while words > -1:
    if firstlayerqueue.empty():
        # page queue drained: flush everything collected so far and stop
        writeWords()
        print('\nWrote %i words to %s. Queue empty.' % (words, outputfile))
        words = -1
    if not firstlayerqueue.empty():
        # start up to nrthreads crawlers, one per queued page, then wait for each of them
        alivethread = 0
        threads = []
        for i in range(nrthreads):
            if not firstlayerqueue.empty():
                alivethread += 1
                thread = Crawl(firstlayerqueue, wordqueue)
                thread.daemon = True
                thread.start()
                threads.append(thread)
        for t in threads:
            t.join(5)
        int_count += 1
        if int_count == 2:
            print('Joined %i threads. Queue size: %i' % (alivethread, firstlayerqueue.qsize()))
            int_count = 0
    continue