forked from james-see/python-examples
-
Notifications
You must be signed in to change notification settings - Fork 0
/
html2plaintext-example.py
35 lines (30 loc) · 1.27 KB
/
html2plaintext-example.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
"""Example on how to get plaintext from html using python's beautiful soup."""
# Author: James Campbell
# Date: 2015 05 19
# Date Updated: 2 July 2019
from bs4 import BeautifulSoup
def cleanMe(html):
"""Clean html into text only for-real."""
soup = BeautifulSoup(html, "lxml") # create a new bs4 object from html
for script in soup(["script", "style"]): # remove all javascript & css
script.extract()
# get text
text = soup.get_text()
# break into lines and remove leading and trailing space on each
lines = (line.strip() for line in text.splitlines())
# break multi-headlines into a line each
chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
# drop blank lines
text = '\n'.join(chunk for chunk in chunks if chunk)
return text
testhtml = """
<!DOCTYPE HTML>\n<head>\n<title>THIS IS AN EXAMPLE by @jamescampbell</title>
\n<meta author='jamescampbell'>\n<style>a {font-family:arial;}</style>
\n</head><body>\n<h1>Hello World</h1>\n<p>I hope you enjoy this example.
</p></body>
"""
print('\n\n[*-*]Before html with text:\n------------------')
print(testhtml)
print('------------------\n\n\n\n[*-*]After cleanMe() function:\n------------')
print(cleanMe(testhtml))
print('-------------------\n\n')