-
Notifications
You must be signed in to change notification settings - Fork 0
/
crawler.py
109 lines (101 loc) · 4.11 KB
/
crawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import json
import smtplib
from email.header import Header
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
class Postman:
    """Send the "NTCE website updated" notification e-mail over SMTP.

    ``Postman()`` with no arguments uses the built-in account settings; every
    setting can be overridden with a keyword argument.
    """

    # NOTE(review): the default password below is stored in plain text in the
    # source file. It should be rotated and supplied from outside the code
    # (environment variable, secrets store) via the ``mail_pass`` argument.

    # Seconds to wait on the SMTP socket before giving up, so a dead server
    # cannot hang the script forever.
    SMTP_TIMEOUT = 30

    def __init__(
        self,
        mail_host="smtp.126.com",
        mail_user="cy0240",
        mail_pass="VLSHXPAOYNGSQTBK",
        sender="[email protected]",
        mail_port=25,
    ):
        """Store the SMTP connection settings.

        Args:
            mail_host: SMTP server host name.
            mail_user: Login name for the SMTP server.
            mail_pass: Password for ``mail_user``.
            sender: Envelope sender address passed to ``sendmail``.
            mail_port: SMTP server port.
        """
        self.mail_host = mail_host
        self.mail_port = mail_port
        self.mail_user = mail_user
        self.mail_pass = mail_pass
        self.sender = sender

    @staticmethod
    def _build_message():
        """Return the fixed notification message as a MIME multipart object."""
        message = MIMEMultipart()
        body = MIMEText(
            "The content of NTCE websites has been updated. Please check it out!"
            "\nThe link is https://ntce.neea.edu.cn/ ."
            "\n"
            "\nIf any question about this crawler occurs, you could just reply to this e-mail. I will git in touch with you in a week, hopefully.",
            "plain",
            "utf-8",
        )
        message.attach(body)
        message["From"] = "Eddie He<[email protected]>"
        message["Subject"] = Header("Update Notification of NTCE Website", "utf-8")
        message["To"] = "You"
        return message

    def send(self, destinations):
        """E-mail the update notification to every address in *destinations*.

        Args:
            destinations: A list of recipient addresses, or a single address
                given as a string.

        Returns:
            None. The outcome is reported on stdout; SMTP errors and
            connection-level errors are caught, not raised.
        """
        # smtplib treats a bare string as one recipient; wrap it so the count
        # and the per-recipient lines below report addresses, not characters.
        if isinstance(destinations, str):
            destinations = [destinations]

        message = self._build_message()
        try:
            # The context manager sends QUIT and closes the socket even when
            # login or sendmail raises.
            with smtplib.SMTP(
                self.mail_host, self.mail_port, timeout=self.SMTP_TIMEOUT
            ) as smtp:
                smtp.login(self.mail_user, self.mail_pass)
                smtp.sendmail(self.sender, destinations, message.as_string())
                print(
                    "The e-mails have been sent to {} receivers.".format(
                        len(destinations)
                    )
                )
                for destination in destinations:
                    print("The e-mails have been sent to {} .".format(destination))
        except (smtplib.SMTPException, OSError):
            # OSError covers refused connections, DNS failures and timeouts,
            # which are not always reported as SMTPException.
            print("Error: can not send e-mails.")
class Spider:
    """Detect whether the first entry of the NTCE news list has changed."""

    @staticmethod
    def _load(url, timeout):
        """Download *url* and return the page parsed with ``html.parser``."""
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        response.encoding = response.apparent_encoding
        return BeautifulSoup(response.text, "html.parser")

    @staticmethod
    def _status_of(err):
        """Return the HTTP status attached to a requests error, if any."""
        return getattr(err.response, "status_code", None)

    def fetch(self, timeout=10):
        """Return the issue time of the first item in the "考试动态" list.

        Loads the NTCE home page, follows the link whose text is "考试动态",
        and reads the ``ReportIDIssueTime`` span of the first ``<li>`` inside
        ``ul#first_data`` of ``div.listdiv`` on that page.

        Args:
            timeout: Seconds to wait for each HTTP request.

        Returns:
            The issue-time text as a ``str``, or ``None`` when a request fails
            or an expected element is missing (a message is printed).
        """
        url = "https://ntce.neea.edu.cn"
        try:
            home = self._load(url, timeout)
        except requests.RequestException as err:
            print(
                "Error occurs when applying request.get(https://ntce.neea.edu.cn)! \nr.status_code: ",
                self._status_of(err),
            )
            print("Reason: ", err)
            return None

        label = home.find(string="考试动态")
        anchor = label.parent if label is not None else None
        ksdt = anchor.get("href") if anchor is not None else None
        if not ksdt:
            print('Error: can not find the "考试动态" link on the home page.')
            return None

        # urljoin copes with rooted, relative and absolute hrefs alike.
        news_url = urljoin(url, ksdt)
        print("url + ksdt: ", news_url)
        try:
            listing = self._load(news_url, timeout)
        except requests.RequestException as err:
            print(
                "Error occurs when applying request.get(ksdt)! \nr2.status_code: ",
                self._status_of(err),
            )
            print("Reason: ", err)
            return None

        try:
            listdiv = listing.find("div", class_="listdiv")
            first_data = listdiv.find("ul", id="first_data")
            newest = first_data.find_all("li")[0]
            issue_time = newest.find("span", id="ReportIDIssueTime").string
        except (AttributeError, IndexError):
            # A find() returned None or the list was empty: the page layout
            # is not the one this scraper expects.
            issue_time = None
        if issue_time is None:
            print("Error: can not find ReportIDIssueTime in the news list page.")
            return None

        first_ReportIDIssueTime = str(issue_time)
        print("first_ReportIDIssueTime from NTCE: ", first_ReportIDIssueTime)
        return first_ReportIDIssueTime

    def verify(self, first_ReportIDIssueTime):
        """Compare a fetched issue time with the one stored on disk.

        Reads ``./ReportIDIssueTime.json``. When the fetched value differs
        from the stored ``first_ReportIDIssueTime`` entry, the file is
        rewritten with the new value.

        Args:
            first_ReportIDIssueTime: The value returned by ``fetch``; ``None``
                means the fetch did not produce a value.

        Returns:
            ``True`` when there is nothing new to report (the values match,
            or no value was fetched); ``False`` when the stored value was
            replaced.
        """
        if first_ReportIDIssueTime is None:
            # A failed fetch must not overwrite the stored value or be
            # reported as an update.
            print("No ReportIDIssueTime fetched; stored value left unchanged.")
            return True

        with open("./ReportIDIssueTime.json", "r") as f:
            ReportIDIssueTime = json.load(f)
        print(
            "first_ReportIDIssueTime from json: ",
            ReportIDIssueTime["first_ReportIDIssueTime"],
        )
        if first_ReportIDIssueTime == ReportIDIssueTime["first_ReportIDIssueTime"]:
            print("No update!")
            return True

        ReportIDIssueTime["first_ReportIDIssueTime"] = first_ReportIDIssueTime
        # Serialize before opening in "w" mode so a serialization error
        # cannot leave a truncated file behind.
        payload = json.dumps(ReportIDIssueTime)
        with open("./ReportIDIssueTime.json", "w") as f2:
            f2.write(payload)
        print("Content updated!")
        return False
if __name__ == "__main__":
    # Script entry point: fetch the newest issue time, compare it with the
    # stored one, and e-mail the addresses in ./receivers.json on a change.
    spider = Spider()
    first_ReportIDIssueTime = spider.fetch()
    if first_ReportIDIssueTime is None:
        # fetch() yields None when it could not read the site; that is not an
        # update, so neither the stored value nor the receivers are touched.
        print("Fetch failed; no notification sent.")
    elif spider.verify(first_ReportIDIssueTime) is not True:
        # The with-statement closes the file; no explicit close() is needed.
        with open("./receivers.json", "r") as file:
            receivers = json.load(file)
        postman = Postman()
        postman.send(receivers["addresses"])