ScanAndDownload.py
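Scans a user-supplied web page for links to PDF files and downloads each relative PDF link it finds, saving the files locally as 1.pdf, 2.pdf, and so on.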
from bs4 import BeautifulSoup
import urllib.request
# For extracting the registered domain from the input URL
import tldextract
from urllib.parse import urlparse
# from urlparse import urlparse  # Python 2 equivalent
a = []        # every href found on the page
o = []        # hrefs that contain ".pdf"
o2 = []       # relative .pdf hrefs (no http/https scheme)
UrlList = []  # host + path strings ready to download
inputUrl = input("Enter Url:")
resp = urllib.request.urlopen(inputUrl)
# Example pages to try:
#   https://www.publicationprinters.com/clients.html
#   http://ilp.mit.edu/webpub.jsp
#   http://www.msrit.edu/department/ise.html
soup = BeautifulSoup(resp, 'html.parser', from_encoding=resp.info().get_param('charset'))
# Collect every hyperlink on the page
for link in soup.find_all('a', href=True):
    a.append(link['href'])
#print(type(a[0]))
# Keep only the links that point at PDF files
for item in a:
    if ".pdf" in item:
        o.append(str(item))
print(o)
# Keep only relative links; absolute http/https links are skipped
for item in o:
    if not item.startswith(("http://", "https://")):
        o2.append(str(item))
# Rebuild the page's host name from the input URL (e.g. www.example.com)
ext = tldextract.extract(inputUrl)
domain_name = ext.domain + '.' + ext.suffix
MainUrl = ext.subdomain + '.' + domain_name if ext.subdomain else domain_name
#print(o)
# Join the host name and each relative path into a downloadable URL
for item in o2:
    if item[0] == "/":
        DownloadableUrl = MainUrl + item
    else:
        DownloadableUrl = MainUrl + "/" + item
    UrlList.append(DownloadableUrl)
#print(UrlList)
# Download every collected URL, saving the files as 1.pdf, 2.pdf, ...
# (UrlList entries carry no scheme, so "http://" is prepended here)
k = 1
for item in UrlList:
    urllib.request.urlretrieve("http://" + item, str(k) + ".pdf")
    print(item)
    k = k + 1
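# The lines below are leftover experimentation with urlparse, kept commented out for reference.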
#parsed_uri = urlparse("http://ilp.mit.edu/webpub.jsp" )
#print (type(parsed_uri))
#resultUrl = "{uri.scheme}://{uri.netloc}/".format(uri=parsed_uri)
#result=str(resultUrl)
#print (result)
#print(type(result))
#q="http://ilp-www.mit.edu"
#print(type(q))
#for item in o:
# if result not in item:
# print(type(item))
# k.append(item)
#print (o)
#print ("\n\n\n")
#print (k)