# Parsing HTML.py
# One way to parse HTML is to use regular expressions to repeatedly search for
# and extract substrings that match a particular pattern, e.g.:
# <h1>The First Page</h1>
# <p>
# If you like, you can switch to the
# <a href="http://dr-chuck.com/page2.htm">
# Second Page</a>
# </p>
# We can construct a well-formed regular expression to match and extract the link values.
import urllib.request, urllib.parse, urllib.error
import re
import ssl  # The ssl library allows the program to access web sites that strictly enforce HTTPS

# Ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

url = input('Enter - ')
html = urllib.request.urlopen(url, context=ctx).read()  # read() returns the HTML source as a bytes object, not an HTTP response object
links = re.findall(b'href="(http[s]?://.*?)"', html)  # findall() returns a list of every match, keeping only the link text between the double quotes
for link in links:
    print(link.decode())
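
# A minimal offline sketch (the sample HTML below is made up, no network
# needed) showing what the pattern above does and does not match:
sample = b'<p><a href="https://example.com/a">A</a> <a href="page2.htm">B</a></p>'
print(re.findall(b'href="(http[s]?://.*?)"', sample))
# Prints [b'https://example.com/a'] - the relative link "page2.htm" is not
# matched, one reason regular expressions alone can miss valid links.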
# Parsing HTML using BeautifulSoup - if we only use regular expressions,
# we might either miss some valid links or end up with bad data
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl

# Ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

url = input('Enter - ')  # The program prompts for the web address
html = urllib.request.urlopen(url, context=ctx).read()  # The program reads the data
soup = BeautifulSoup(html, 'html.parser')  # The data is passed to the BeautifulSoup parser

# Retrieve all of the anchor tags and print the href attribute of each one
tags = soup('a')
for tag in tags:
    print(tag.get('href', None))
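
# An illustrative sketch (not part of the original code): many hrefs are
# relative, and urllib.parse.urljoin - from the urllib.parse module already
# imported above - resolves them against the page URL into absolute links
for tag in tags:
    href = tag.get('href', None)
    if href is not None:
        print(urllib.parse.urljoin(url, href))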
# Pull out various parts of each tag:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import ssl

# Ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

url = input('Enter - ')
html = urlopen(url, context=ctx).read()
soup = BeautifulSoup(html, 'html.parser')

# Retrieve all of the anchor tags
tags = soup('a')
for tag in tags:
    # Look at the parts of a tag: its URL, contents, and attributes
    print('TAG:', tag)
    print('URL:', tag.get('href', None))
    print('Contents:', tag.contents[0])  # assumes the anchor is non-empty; an empty <a></a> would raise IndexError
    print('Attrs:', tag.attrs)
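
# A safer sketch for the contents line above (an illustrative alternative,
# not the original approach): get_text() returns an empty string for an
# empty anchor instead of raising IndexError
for tag in tags:
    print('Text:', tag.get_text().strip())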