spider_from_mangabz.py
import urllib.request
import random
import bs4
import re
import os
import time
import mangabz
import socket

# Set a global socket timeout of 20s: avoids incomplete downloads
# without getting stuck in an endless wait.
socket.setdefaulttimeout(20)

"""
The image-download code is adapted from
https://blog.csdn.net/jclian91/article/details/77513289,
which solves the problem of the crawler hanging. Many thanks.
"""


def getHead():  # Return a randomly chosen User-Agent string
    my_headers = [
        "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/537.75.14",
        "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Win64; x64; Trident/6.0)"
    ]
    random_header = random.choice(my_headers)
    return random_header


def askUrl(url):  # Fetch the page and return its HTML as text
    req = urllib.request.Request(url)
    req.add_header("User-Agent", getHead())
    req.add_header("GET", url)
    response = urllib.request.urlopen(req)
    html = response.read().decode('utf-8')
    return html


def GetInfoFromHtml(html):
    # Parse the comic's main page for each chapter's link, name and page
    # count, save the introduction, and create/enter the comic folder.
    soup = bs4.BeautifulSoup(html, "html.parser")
    url = []
    name = []
    page = []
    num = 0
    comic_soup = str(soup.find('p', attrs={"class": "detail-info-title"}))
    comic_name = re.findall(findComicName, comic_soup)
    # Strip characters that are illegal in folder names, plus whitespace
    comic_name = re.sub(r'[\/:*?"<>|\s]', '', str(comic_name[0]))
    introduction_soup = str(soup.find('p', attrs={"class": "detail-info-content"}))
    introduction = re.findall(findComicIntroduction, introduction_soup)
    if not os.path.exists(comic_name):
        os.mkdir(comic_name)
        os.chdir(comic_name)
        print("Folder created... done")
        # The .txt file does not have to exist beforehand; it is created here
        with open('introduction.txt', 'w') as file_handle:
            file_handle.write(introduction[0])
        print("Introduction written... done")
    else:
        os.chdir(comic_name)  # Enter the existing comic folder
        print("Entering existing folder")
        with open('introduction.txt', 'a') as file_handle:
            file_handle.write(introduction[0])
        print("Introduction written... done")
    for item in soup.find_all(attrs={"class": "detail-list-form-item"}):
        a = re.findall(findEntrance, str(item))
        url.append(str(baseurl2 + a[0][0] + '/'))
        name.append(str(a[0][1]))
        page.append(int(a[0][3]))  # group 4 is the bare page count, without the trailing 'P'
        num += 1
    return (url, name, page, num)


def CreatDireAndReadyDown(url, name, page, num):
    # Create a folder for each chapter and download its pages into it.
    # If a run fails part-way, the start/end of this range can be adjusted
    # manually to resume.
    for i in range(0, num):
        print(url[i], "target %s" % name[i], "%d images in total" % page[i],
              "emulating js to collect image URLs...")
        path = re.sub(r'[\/:*?"<>|]', '-', name[i])
        if not os.path.exists(path):
            os.mkdir(path)
        os.chdir(path)
        dealMainUrl(url[i], page[i])
        time.sleep(2)
        print("%s downloaded" % name[i])
        os.chdir(os.path.pardir)


def dealMainUrl(mainUrl, page):
    # First collect all image URLs into a list, then download each one.
    mangabz.urllist = []
    man = mangabz.Mangabz(mainUrl)
    man.run()
    print("Image URLs loaded, starting download...")
    num = 1
    for i in range(0, page):
        opener = urllib.request.build_opener()
        opener.addheaders = [('User-Agent', getHead()), ('Referer', mainUrl)]
        urllib.request.install_opener(opener)
        # urllib.request.urlretrieve(mangabz.urllist[i], "%d.jpg" % num, timeout=5)  # this call can hang indefinitely
        try:
            urllib.request.urlretrieve(mangabz.urllist[i], "%d.jpg" % num)
        except socket.timeout:
            # Retry up to 5 times on timeout before giving up on this image
            count = 1
            while count <= 5:
                try:
                    urllib.request.urlretrieve(mangabz.urllist[i], "%d.jpg" % num)
                    break
                except socket.timeout:
                    err_info = 'Reloading for %d time' % count if count == 1 else 'Reloading for %d times' % count
                    print(err_info)
                    count += 1
            if count > 5:
                print("download job failed!")
        print(str(num) + " %s downloaded" % mangabz.urllist[i])
        num = num + 1


if __name__ == '__main__':
    findEntrance = re.compile(r'<a class="detail-list-form-item.*href="(.*?)/".*>(.*?) .*<span>((.*?)P)</span>.*')
    findComicName = re.compile(r'<p class="detail-info-title">(.*?)</p>')
    findComicIntroduction = re.compile(r'<p class="detail-info-content">(.*?)</p>')
    baseurl = input("Input the URL you want to download (such as http://www.mangabz.com/577bz/)\n"
                    "Pay attention to the ending '/' : \n")
    baseurl2 = "http://www.mangabz.com"
    html = askUrl(baseurl)
    url, name, page, num = GetInfoFromHtml(html)
    CreatDireAndReadyDown(url, name, page, num)
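
The script imports a companion mangabz module that is not shown here; dealMainUrl() relies on it to gather the image URLs for one chapter. The stub below is only a sketch of the interface this script assumes, inferred from how the code above uses it: the Mangabz class, its run() method, and the module-level urllist come from the script, while everything inside the stub bodies is an assumption, not the real implementation.

# mangabz.py -- minimal stub of the interface spider_from_mangabz.py expects.
# Inferred from usage only; the real module must actually collect the image
# URLs (e.g. by emulating the site's JavaScript page viewer), which this
# placeholder does not do.

urllist = []  # dealMainUrl() resets this list, then reads one URL per page


class Mangabz:
    def __init__(self, main_url):
        # main_url is the chapter URL passed in by dealMainUrl()
        self.main_url = main_url

    def run(self):
        # The real implementation is expected to fill the module-level
        # `urllist` with one image URL per page of the chapter.
        global urllist
        urllist = []  # placeholder: real code would append image URLs here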