spider_from_mangabz.py
import urllib.request
import random
import bs4
import re
import os
import time
import mangabz
import socket

# Set a global socket timeout of 20s: avoids incomplete downloads
# without getting stuck in an endless wait.
socket.setdefaulttimeout(20)

"""
The image-download code is adapted from
https://blog.csdn.net/jclian91/article/details/77513289,
which solves the problem of the crawler hanging. Many thanks.
"""


def getHead():  # Return a randomly chosen User-Agent string
    my_headers = [
        "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/537.75.14",
        "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Win64; x64; Trident/6.0)"
    ]
    random_header = random.choice(my_headers)
    return random_header


def askUrl(url):  # Fetch the page and return its HTML as text
    req = urllib.request.Request(url)
    req.add_header("User-Agent", getHead())
    req.add_header("GET", url)
    response = urllib.request.urlopen(req)
    html = response.read().decode('utf-8')
    return html


def GetInfoFromHtml(html):
    # Parse the comic's main page for each chapter's link, name and page
    # count, save the introduction, and create/enter the comic folder.
    soup = bs4.BeautifulSoup(html, "html.parser")
    url = []
    name = []
    page = []
    num = 0
    comic_soup = str(soup.find('p', attrs={"class": "detail-info-title"}))
    comic_name = re.findall(findComicName, comic_soup)
    # Strip characters that are illegal in folder names, plus whitespace
    comic_name = re.sub(r'[\/:*?"<>|\s]', '', str(comic_name[0]))
    introduction_soup = str(soup.find('p', attrs={"class": "detail-info-content"}))
    introduction = re.findall(findComicIntroduction, introduction_soup)
    if not os.path.exists(comic_name):
        os.mkdir(comic_name)
        os.chdir(comic_name)
        print("Folder created... done")
        # The .txt file does not have to exist beforehand; it is created here
        with open('introduction.txt', 'w') as file_handle:
            file_handle.write(introduction[0])
        print("Introduction written... done")
    else:
        os.chdir(comic_name)  # Enter the existing comic folder
        print("Entering existing folder")
        with open('introduction.txt', 'a') as file_handle:
            file_handle.write(introduction[0])
        print("Introduction written... done")
    for item in soup.find_all(attrs={"class": "detail-list-form-item"}):
        a = re.findall(findEntrance, str(item))
        url.append(str(baseurl2 + a[0][0] + '/'))
        name.append(str(a[0][1]))
        page.append(int(a[0][3]))  # group 4 is the bare page count, without the trailing 'P'
        num += 1
    return (url, name, page, num)


def CreatDireAndReadyDown(url, name, page, num):
    # Create a folder for each chapter and download its pages into it.
    # If a run fails part-way, the start/end of this range can be adjusted
    # manually to resume.
    for i in range(0, num):
        print(url[i], "target %s" % name[i], "%d images in total" % page[i],
              "emulating js to collect image URLs...")
        path = re.sub(r'[\/:*?"<>|]', '-', name[i])
        if not os.path.exists(path):
            os.mkdir(path)
        os.chdir(path)
        dealMainUrl(url[i], page[i])
        time.sleep(2)
        print("%s downloaded" % name[i])
        os.chdir(os.path.pardir)


def dealMainUrl(mainUrl, page):
    # First collect all image URLs into a list, then download each one.
    mangabz.urllist = []
    man = mangabz.Mangabz(mainUrl)
    man.run()
    print("Image URLs loaded, starting download...")
    num = 1
    for i in range(0, page):
        opener = urllib.request.build_opener()
        opener.addheaders = [('User-Agent', getHead()), ('Referer', mainUrl)]
        urllib.request.install_opener(opener)
        # urllib.request.urlretrieve(mangabz.urllist[i], "%d.jpg" % num, timeout=5)  # this call can hang indefinitely
        try:
            urllib.request.urlretrieve(mangabz.urllist[i], "%d.jpg" % num)
        except socket.timeout:
            # Retry up to 5 times on timeout before giving up on this image
            count = 1
            while count <= 5:
                try:
                    urllib.request.urlretrieve(mangabz.urllist[i], "%d.jpg" % num)
                    break
                except socket.timeout:
                    err_info = 'Reloading for %d time' % count if count == 1 else 'Reloading for %d times' % count
                    print(err_info)
                    count += 1
            if count > 5:
                print("download job failed!")
        print(str(num) + " %s downloaded" % mangabz.urllist[i])
        num = num + 1


if __name__ == '__main__':
    findEntrance = re.compile(r'<a class="detail-list-form-item.*href="(.*?)/".*>(.*?) .*<span>((.*?)P)</span>.*')
    findComicName = re.compile(r'<p class="detail-info-title">(.*?)</p>')
    findComicIntroduction = re.compile(r'<p class="detail-info-content">(.*?)</p>')
    baseurl = input("Input the URL you want to download (such as http://www.mangabz.com/577bz/)\n"
                    "Pay attention to the ending '/' : \n")
    baseurl2 = "http://www.mangabz.com"
    html = askUrl(baseurl)
    url, name, page, num = GetInfoFromHtml(html)
    CreatDireAndReadyDown(url, name, page, num)
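
The script imports a companion mangabz module that is not shown here; dealMainUrl() relies on it to gather the image URLs for one chapter. The stub below is only a sketch of the interface this script assumes, inferred from how the code above uses it: the Mangabz class, its run() method, and the module-level urllist come from the script, while everything inside the stub bodies is an assumption, not the real implementation.

# mangabz.py -- minimal stub of the interface spider_from_mangabz.py expects.
# Inferred from usage only; the real module must actually collect the image
# URLs (e.g. by emulating the site's JavaScript page viewer), which this
# placeholder does not do.

urllist = []  # dealMainUrl() resets this list, then reads one URL per page


class Mangabz:
    def __init__(self, main_url):
        # main_url is the chapter URL passed in by dealMainUrl()
        self.main_url = main_url

    def run(self):
        # The real implementation is expected to fill the module-level
        # `urllist` with one image URL per page of the chapter.
        global urllist
        urllist = []  # placeholder: real code would append image URLs here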