Update versions and add some experiment reports
W1ndys committed Nov 6, 2024
1 parent 53efdf6 commit 018383c
Showing 8 changed files with 343 additions and 16 deletions.
9 changes: 5 additions & 4 deletions package-lock.json

Some generated files are not rendered by default.

4 changes: 2 additions & 2 deletions package.json
```diff
@@ -8,7 +8,7 @@
     "server": "hexo server"
   },
   "hexo": {
-    "version": "7.1.1"
+    "version": "7.2.0"
   },
   "dependencies": {
     "hexo": "^7.0.0",
@@ -29,7 +29,7 @@
     "hexo-renderer-pug": "^3.0.0",
     "hexo-renderer-stylus": "^3.0.1",
     "hexo-server": "^3.0.0",
-    "hexo-theme-butterfly": "^4.13.0",
+    "hexo-theme-butterfly": "^5.2.2",
     "hexo-theme-landscape": "^1.0.0",
     "hexo-wordcount": "^6.0.1"
   },
```
10 changes: 5 additions & 5 deletions pnpm-lock.yaml

Some generated files are not rendered by default.

15 changes: 11 additions & 4 deletions source/_posts/0xGame2024wp.md
```diff
@@ -1,9 +1,16 @@
 ---
-title: "0xGame2024wp"
-tags: [CTF, WP, 0xGame2024]
-categories: [CTF, WP, 0xGame2024]
-cover: 'https://ctf.njupt.edu.cn/wp-content/uploads/2020/09/2020092712314778.png'
+title: 0xGame2024wp
+tags:
+  - CTF
+  - WP
+  - 0xGame2024
+categories:
+  - CTF
+  - WP
+  - 0xGame2024
 abbrlink: 2fabe180
 date: 2024-10-06 17:48:47
+cover: https://ctf.njupt.edu.cn/wp-content/uploads/2020/09/2020092712314778.png
 ---
+
 NJUPT's freshman CTF; `week1` is still doable even for a senile old-timer like me.
```
```diff
@@ -0,0 +1,9 @@
+---
+title: 2024山东省网络安全技能大赛九州信泰杯wp
+tags: []
+categories: []
+cover: 'https://t.mwm.moe/fj?20241027222243818'
+abbrlink: 36d0dbb
+date: 2024-10-27 22:22:43
+---
+
```
@@ -0,0 +1,206 @@
---
title: Python实验——曲阜师范大学新闻内容爬取
tags:
- Python
categories:
- Python
cover: >-
  https://th.bing.com/th/id/OIP.dJToM1TiZiJA0GYwzDHwjQHaHY?w=179&h=180&c=7&r=0&o=5&pid=1.7
abbrlink: 6d67d75
date: 2024-11-06 08:59:32
---

## Takeaways (a few quick notes)

At its core, the script uses the requests and BeautifulSoup libraries and extracts the news content by locating tags.

The real crux is locating the tags. Inspecting the page source shows that the news titles, summaries, and so on each sit in their own `div` tags, so we can use the `find` method to walk down to the right container and the `find_all` method to collect every matching tag.
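Here is a minimal sketch of that find/find_all pattern, using the same class names as the full script below (it assumes the list page is reachable and keeps none of the error handling):

```py
import requests
from bs4 import BeautifulSoup

# Fetch the news list page and parse it.
response = requests.get("https://www.qfnu.edu.cn/news/xxyw.htm")
response.encoding = "utf-8"
soup = BeautifulSoup(response.text, "html.parser")

# find() returns the first matching tag (or None); find_all() returns a list.
lb_ul = soup.find("ul", class_="lb-ul")
if lb_ul is not None:
    for li in lb_ul.find_all("li"):
        title = li.find("div", class_="lb-ul-tt txt-elise")
        if title is not None:
            print(title.text.strip())
```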

Liberal use of `print` statements makes debugging easy, and the script came together step by step.

One more issue: the links scraped from the page are relative links of the form `../...`, so they have to be converted before being joined to the site root; slicing or the `replace` method both work.
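For example (the href value here is made up for illustration; the standard library's `urljoin` is an alternative to the manual slicing used in the script):

```py
from urllib.parse import urljoin

link = "../info/1011/12345.htm"  # hypothetical href taken from a list item

# Manual approach used in the script: cut the leading "..", drop any
# remaining "../" segments, then prepend the site root.
absolute = f"https://www.qfnu.edu.cn{link[2:].replace('../', '')}"

# Standard-library alternative: resolve the link against the page it
# came from, letting urljoin handle the ".." segments.
absolute_alt = urljoin("https://www.qfnu.edu.cn/news/xxyw.htm", link)

print(absolute)      # https://www.qfnu.edu.cn/info/1011/12345.htm
print(absolute_alt)  # same URL
```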

Another pitfall I hit: the tags shown in the browser's F12 DevTools sometimes differ from the raw page source, because DevTools shows the DOM after JavaScript has run. Some `print` debugging is needed to find the tags that are actually present in the HTML that requests receives.
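One quick way to compare, sketched here (the output filename is arbitrary): dump exactly what the scraper received and diff it against what DevTools shows.

```py
import requests

response = requests.get("https://www.qfnu.edu.cn/news/xxyw.htm")
response.encoding = "utf-8"

# Save exactly what requests received; any tag visible in DevTools but
# missing from this file was inserted later by JavaScript.
with open("raw_page.html", "w", encoding="utf-8") as f:
    f.write(response.text)
```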

## Source code

```py
import requests
from bs4 import BeautifulSoup


def scrape_news(url):
    response = requests.get(url)
    response.encoding = "utf-8"

    soup = BeautifulSoup(response.text, "html.parser")
    # Locate the body
    body = soup.body
    if body is not None:
        # Locate the outer wrapper div
        div = body.find("div", class_="wrap")
        if div is not None:
            # Locate the container div
            div_container = div.find("div", class_="container clearfix")
            if div_container is not None:
                # Locate the right-hand list column
                div_pull_right = div_container.find(
                    "div", class_="pull-right list-right"
                )
                if div_pull_right is not None:
                    # Locate the news list ul
                    lb_ul = div_pull_right.find("ul", class_="lb-ul")
                    if lb_ul is not None:
                        # Collect every li (one per news item)
                        li = lb_ul.find_all("li")
                        result = ""
                        for item in li:
                            title = item.find("div", class_="lb-ul-tt txt-elise").text
                            date = item.find("div", class_="lb-ul-date").text
                            content = item.find("div", class_="lb-ul-p").text
                            link = item.find("a", class_="clearfix img-hide").get(
                                "href"
                            )
                            # Convert the relative ../ link to an absolute URL
                            link = f"https://www.qfnu.edu.cn{link[2:].replace('../', '')}"
                            result += f"Title: {title}\nDate: {date}\nContent: {content}\nLink: {link}\n\n"
                        return result
                    else:
                        print(f"No ul with class 'lb-ul' found; got: {lb_ul}")
                        return
                else:
                    print(
                        f"No div with class 'pull-right list-right' found; got: {div_pull_right}"
                    )
                    return
            else:
                print(
                    f"No div with class 'container clearfix' found; got: {div_container}"
                )
                return
        else:
            print(f"No div with class 'wrap' found; got: {div}")
            return
    else:
        print(f"No body found; got: {body}")
        return


# Get the total number of news pages
def get_news_page_count(url):
    response = requests.get(url)
    response.encoding = "utf-8"

    soup = BeautifulSoup(response.text, "html.parser")
    # Locate the body
    body = soup.body
    if body is not None:
        # Locate the outer wrapper div
        div = body.find("div", class_="wrap")
        if div is not None:
            # Locate the container div
            div_container = div.find("div", class_="container clearfix")
            if div_container is not None:
                # Locate the right-hand list column
                div_pull_right = div_container.find(
                    "div", class_="pull-right list-right"
                )
                # print(div_pull_right)
                if div_pull_right is not None:
                    # Locate the pagination box
                    div_page_box = div_pull_right.find(
                        "div", class_="page-box text-center wow fadeInUp"
                    )
                    if div_page_box is not None:
                        div_pb_sys_common = div_page_box.find(
                            "div", class_="pb_sys_common"
                        )
                        if div_pb_sys_common is not None:
                            span_p_pages = div_pb_sys_common.find(
                                "span", class_="p_pages"
                            )
                            if span_p_pages is not None:
                                span_p_no = span_p_pages.find("span", class_="p_no")
                                if span_p_no is not None:
                                    a = span_p_no.find("a")
                                    if a is not None:
                                        # The href looks like xxyw/<n>.htm; <n> is the page count
                                        return int(
                                            a.get("href")
                                            .replace(".htm", "")
                                            .replace("xxyw/", "")
                                        )
                        else:
                            print(
                                f"No div with class 'pb_sys_common' found; got: {div_pb_sys_common}"
                            )
                            return 0
                    else:
                        print(
                            f"No div with class 'page-box text-center wow fadeInUp' found; got: {div_page_box}"
                        )
                        return 0
                else:
                    print(
                        f"No div with class 'pull-right list-right' found; got: {div_pull_right}"
                    )
                    return 0
            else:
                print(
                    f"No div with class 'container clearfix' found; got: {div_container}"
                )
                return 0
        else:
            print(f"No div with class 'wrap' found; got: {div}")
            return 0
    else:
        print(f"No body found; got: {body}")
        return 0


# Save text to a file
def save_to_txt(data, filename):
    with open(filename, "w", encoding="utf-8") as f:
        f.write(data)


print("About to scrape the news front page")

news_index = scrape_news("https://www.qfnu.edu.cn/news/xxyw.htm")
if news_index:
    save_to_txt(news_index, "news_index.txt")

print("Front page done")

print("About to scrape page 2 and onward")

page_count = get_news_page_count("https://www.qfnu.edu.cn/news/xxyw.htm")

if page_count:
    print(f"Detected page count: {page_count}")
else:
    print("Could not detect the page count")
    exit()

if page_count > 0:
    for i in range(page_count, 0, -1):
        print(f"About to scrape page {i}")
        news = scrape_news(f"https://www.qfnu.edu.cn/news/xxyw/{i}.htm")
        if news:
            save_to_txt(news, f"news_{i}.txt")

```

## Screenshots

![img](https://picx.zhimg.com/80/v2-a63a76b8e5698fefb5b966cf7bc54598.png)

As you can see, all 700-odd pages were scraped in just a few minutes.

![img](https://pic1.zhimg.com/80/v2-370198b0ab7a10f1a912259d4091df7d.png)

![img](https://picx.zhimg.com/80/v2-5ee870e74b65fd0a23a07bf4aabc3b08.png)

Opening a few at random to spot-check the output.

I scraped one page per output file; if you want to use the data, a short Python script can merge the files, as sketched below.
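A minimal merge sketch under my own assumptions (the output filename is arbitrary, and files are simply concatenated in numeric page order):

```py
from pathlib import Path


def page_number(path):
    # news_index.txt (and the output file) sort first; news_<n>.txt sorts by n.
    suffix = path.stem.split("_", 1)[1]
    return int(suffix) if suffix.isdigit() else -1


files = sorted(Path(".").glob("news_*.txt"), key=page_number)
with open("news_all.txt", "w", encoding="utf-8") as out:
    for part in files:
        if part.name == "news_all.txt":
            continue  # skip the merged output if the script is re-run
        out.write(part.read_text(encoding="utf-8"))
```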