-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 6a474bd
Showing
15 changed files
with
724 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
# AuroraWebSpider | ||
为在离线环境下部份无趣的CTF比赛提供相应的数(现)据(学)支(现)持(卖) | ||
1. 定时循环抓取博客链接及外链 | ||
2. 缓存为离线html文件 | ||
3. 提供相应的检索功能 | ||
|
||
TODO: | ||
1. 抓取对应的git文件 | ||
2. 缓存部份附件/PDF | ||
3. 删除无效网页/缓存 | ||
|
||
**本项目仅供学习参考,请勿将服务公开,并且在授权后进行相关操作** | ||
在项目测试中,使用了部份师傅的博客进行测试,感谢他们的支持。 | ||
|
||
## 效果如图 | ||
### 站点界面 | ||
 | ||
### 筛选/搜索界面 | ||
 | ||
### 缓存界面 | ||
 | ||
|
||
### Web Server | ||
可视化前端 | ||
```shell
python app.py
```
|
||
### Spider Server | ||
此服务用于递归爬取网站内容
```shell
python spider.py
```
|
||
### Singlefile Server | ||
此服务用于静态页面抓取 | ||
```shell
npm install -g "single-file-cli"
python Singlefile.py
```
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,160 @@ | ||
from flask import Flask, request,render_template,redirect,url_for,send_from_directory | ||
from db import * | ||
from datetime import datetime | ||
from bs4 import BeautifulSoup | ||
from flask_compress import Compress | ||
from flask_httpauth import HTTPBasicAuth | ||
import socket | ||
|
||
|
||
# Generous default timeout so slow upstream fetches don't hang forever.
socket.setdefaulttimeout(120)
app = Flask(__name__)
# Let clients cache static responses for one hour.
app.config['SEND_FILE_MAX_AGE_DEFAULT'] = 3600
# NOTE(review): `os` is only in scope here via `from db import *` —
# consider importing it explicitly in this file.
app.config['STATIC_FOLDER'] = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'static')
auth = HTTPBasicAuth()
# Enable gzip/deflate compression of all responses.
compress = Compress(app)
|
||
@auth.verify_password
def verify_password(username, password):
    """HTTP Basic auth check: any username, single shared password.

    Uses a constant-time comparison so the password length/prefix cannot
    be probed via response timing (the original used ``==``).
    NOTE(review): the username is ignored and the password is hard-coded
    in source — move it to configuration/environment.
    """
    import hmac  # local import keeps this fix self-contained
    supplied = (password or "").encode("utf-8", "surrogateescape")
    return hmac.compare_digest(supplied, b'MakeAuroraGreatAgain')
|
||
|
||
@app.route('/static/<path:filename>')
def static_files(filename):
    """Serve a file out of the configured static folder."""
    folder = app.config['STATIC_FOLDER']
    return send_from_directory(folder, filename)
def isnas(variable):
    """Return True when *variable* is a non-empty, all-alphanumeric string
    whose cased characters are all lowercase (i.e. a safe cache-key shape).

    Note: a purely numeric string fails because ``str.islower`` is False
    when there are no cased characters.
    """
    if not variable.isalnum():
        return False
    return variable.islower()
class BreakLoop(Exception):
    """Control-flow exception used to escape the nested link-rewrite loops."""
    pass


# URL prefixes tried when matching a raw href against a known site URL.
startwith = ["https://", "http://", ""]
|
||
@app.route('/show', methods=['GET'])
def show():
    """Serve a cached HTML page, rewriting known links to other cached pages.

    Query args:
        url: cache key (lowercase alphanumeric, the MD5 of the page URL).

    Returns the cached HTML with an informational banner injected, or a
    redirect to the list view when the key is missing, invalid, or the
    cache file cannot be read.
    """
    url = request.args.get("url")
    # Reject missing or non-[a-z0-9] keys.  BUGFIX: the original wrote
    # ``isnas == False`` (comparing the *function* to False), so invalid
    # keys were never rejected; the isnas() check also blocks path
    # traversal since the key is joined into a filesystem path below.
    if url is None or url == "None" or not isnas(url):
        return redirect(url_for('list'))
    file_path = os.path.join("html", url + ".html")
    try:
        with open(file_path, "r") as f:
            data = f.read()
    except OSError:
        # Cache file missing/unreadable: fall back to the list view
        # (previously this raised and produced a 500).
        return redirect(url_for('list'))
    try:
        cachetime = datetime.fromtimestamp(os.path.getmtime(file_path)).strftime('%Y/%m/%d %H:%M')
        db = sqliteDB()
        # Keep only rows with both a URL and a cache key.  The original
        # removed items from the list while iterating it, which skips
        # elements; build a filtered list instead.
        items = [item for item in db.list_raw() if item[0] and item[1]]
        banner = "Aurora 离线缓存"
        for item in items:
            if item[1] == url:
                banner = f"网站/SITE:{item[2]} 网址/URL:{item[0]} 缓存时间/CACHETIME:{cachetime}"
        print(banner)
        soup = BeautifulSoup(data, 'html.parser')
        for tag in soup.find_all():
            try:
                href = tag['href']
                try:
                    # Rewrite hrefs that point at an already-cached page so
                    # navigation stays inside the offline mirror.
                    for rep in items:
                        for start in startwith:
                            if href == start + rep[0]:
                                href = f"/show?url={rep[1]}"
                                raise BreakLoop
                except BreakLoop:
                    tag['href'] = href
            except Exception:
                # Tag has no href attribute — leave it untouched.
                pass

        # Inject a fixed banner bar at the top of the page body.
        banner_div = soup.new_tag('div', attrs={'class': 'aurora_banner'})
        banner_div['style'] = 'width: 100%; height: 30px; background-color: #007aff; color: white; text-align: center; position: fixed; top: 0; left: 0; z-index: 9999;'
        banner_div.string = banner
        body_tag = soup.body
        body_tag.insert(0, banner_div)

        data = soup.prettify()
    except Exception:
        # Banner/link rewriting is best-effort: on any failure serve the
        # raw cached page unchanged.
        pass
    return data
|
||
# NOTE(review): dead code — the /rebuild route below is disabled via a
# module-level string literal; either implement it or delete this block.
'''
@app.route('/rebuild', methods=['GET'])
def rebuild():
    return redirect(url_for('list'))
'''
|
||
@app.route('/site', methods=['GET'])
def site():
    """Render the site-management page listing every configured site."""
    db = sqliteDB()
    all_sites = db.list_site("")
    return render_template('site.html', items=all_sites)
|
||
@app.route('/edit', methods=['GET'])
@auth.login_required
def edit():
    """Render the edit form, blank for a new site or pre-filled from the DB.

    Requires HTTP Basic auth.
    """
    db = sqliteDB()
    url = request.args.get("url")
    if url is None:
        # No URL given: present an empty form row.
        rows = [("", "", "", "")]
    else:
        rows = db.list_site(url)
    return render_template('edit.html', items=rows[0])
|
||
@app.route('/update', methods=['GET'])
@auth.login_required
def update():
    """Toggle a site's enabled state by flipping the sign of its interval.

    Requires HTTP Basic auth.  Redirects back to the site list.
    """
    db = sqliteDB()
    url = request.args.get("url")
    if url is None:
        return redirect(url_for('site'))
    row = db.list_site(url)[0]
    # A negative interval encodes "disabled"; negating toggles the state.
    db.insert_site(row[0], row[1], -row[2])
    return redirect(url_for('site'))
|
||
@app.route('/del', methods=['GET'])
@auth.login_required
def delsite():
    """Delete a site entry by URL and return to the site list.

    Requires HTTP Basic auth.
    """
    db = sqliteDB()
    target = request.args.get("url")
    if target is not None:
        db.del_site(target)
    return redirect(url_for('site'))
|
||
@app.route('/add', methods=['GET'])
@auth.login_required
def add():
    """Add or overwrite a site entry from query parameters.

    Query args:
        site: display name of the site.
        url: site URL (primary key).
        interval: crawl interval; defaults to 30 when missing/invalid.

    Requires HTTP Basic auth.  Redirects back to the site list.
    """
    db = sqliteDB()
    site = request.args.get("site")
    url = request.args.get("url")
    interval = request.args.get("interval")
    try:
        # int(None) raises TypeError, int("abc") raises ValueError —
        # both fall back to the default (the original only caught
        # ValueError after a separate None check).
        interval = int(interval)
    except (TypeError, ValueError):
        interval = 30  # default crawl interval
    # BUGFIX: the original tested ``url != ""`` which is True for None,
    # so omitting a parameter inserted a None row into the site table.
    if url and site:
        db.insert_site(url, site, interval)
    return redirect(url_for('site'))
|
||
@app.route('/list', methods=['GET'])
@app.route('/', methods=['GET'])
def list():
    """Search and list cached pages.

    Query args:
        search: free-text query ("" lists nothing).
        type: field filter passed straight to the DB layer (default "0").
    """
    db = sqliteDB()
    # Normalise absent parameters to their defaults.  (Local renamed from
    # ``type`` to avoid shadowing the builtin.)
    query = request.args.get("search")
    field = request.args.get("type")
    query = "" if query is None else query
    field = "0" if field is None else field

    results = db.list_html(query, field)
    page_count = db.list_page()
    return render_template('list.html', pages=page_count, items=results)
|
||
|
||
# Entrypoint: serve on all interfaces, port 5002.  debug=False is
# deliberate — the Werkzeug debugger must never face a network.
if __name__ == '__main__':
    app.run(host="0.0.0.0", port=5002, debug=False)
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
[ | ||
{ | ||
"time": "2024/1/1 00:00", | ||
"name": "Aurora\u95e8\u6237", | ||
"url": "https://www.szu.moe" | ||
} | ||
] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,96 @@ | ||
import os | ||
import sqlite3 | ||
from whoosh.index import create_in,open_dir | ||
from whoosh.fields import * | ||
from whoosh.qparser import QueryParser, MultifieldParser | ||
from whoosh.query import compound, Term | ||
class sqliteDB:
    """Thin wrapper around the local ``sqlite.db`` SQLite database.

    Two tables are maintained:
      * ``html`` — one row per cached page (``url`` is the primary key);
        ``content`` holds the cache-file key, ``summary`` the page text.
      * ``site`` — one row per crawled site with its crawl ``interval``.
    """

    def __init__(self) -> None:
        """Open (creating if necessary) sqlite.db and ensure the schema exists."""
        is_new = not os.path.exists('sqlite.db')
        self.db = sqlite3.connect('sqlite.db')
        if is_new:
            print("Building DataBase...")
            self.make()
            print("Building Database Succeed.")
        else:
            # make() is idempotent (CREATE TABLE IF NOT EXISTS).
            self.make()
            print("Connect Database Succeed.")
        # NOTE(review): extension loading is enabled but never used by this
        # class — consider removing it; it widens the attack surface.
        self.db.enable_load_extension(True)
        self.db.execute('PRAGMA temp_store=MEMORY;')
        # journal_mode=MEMORY trades crash-safety for speed.
        self.db.execute('PRAGMA journal_mode=MEMORY;')
        self.db.execute('PRAGMA auto_vacuum=INCREMENTAL;')

    def make(self) -> None:
        """Create the ``html`` and ``site`` tables if they do not exist."""
        self.db.execute('''CREATE TABLE IF NOT EXISTS html(
                url VARCHAR(256) PRIMARY KEY,
                site VARCHAR(256),
                title TEXT,
                summary TEXT,
                content TEXT,
                updated TIMESTAMP DEFAULT CURRENT_TIMESTAMP);''')
        self.db.execute('''CREATE TABLE IF NOT EXISTS site(
                url VARCHAR(256) PRIMARY KEY,
                site VARCHAR(256),
                interval INT,
                updated TIMESTAMP DEFAULT CURRENT_TIMESTAMP);''')
        self.db.commit()

    def insert_html(self, url, site, title=None, summary=None, content=None):
        """Insert or overwrite the cached-page row for *url*."""
        self.db.execute(
            "REPLACE INTO html (url, site, title, summary, content) VALUES (?, ?, ?, ?, ?)",
            (url, site, title, summary, content))
        self.db.commit()

    def insert_site(self, url, site, interval):
        """Insert or overwrite a site row.

        If the site already exists under a different display name, cached
        pages are relabelled first so ``html.site`` stays consistent.
        """
        cursor = self.db.execute("SELECT site FROM site WHERE url = ?", (url,))
        for row in cursor:
            self.db.execute("UPDATE html SET site=? WHERE site=?", (site, row[0]))
        self.db.execute(
            "REPLACE INTO site (url, site, interval) VALUES (?, ?, ?)",
            (url, site, interval))
        self.db.commit()

    def del_site(self, url):
        """Delete the site row for *url* (its cached pages are kept)."""
        self.db.execute("DELETE FROM site WHERE url=?", (url,))
        self.db.commit()

    def random_html(self):
        """Return (url, site, title) of one random not-yet-cached page,
        or None when every page already has a summary."""
        cursor = self.db.execute(
            "SELECT url,site,title FROM html WHERE summary IS NULL ORDER BY RANDOM() LIMIT 1")
        for row in cursor:
            return row[0], row[1], row[2]

    def show_html(self, url):
        """Return (url, site, title, content) for *url*, or None if absent."""
        cursor = self.db.execute(
            "SELECT url,site,title,content FROM html WHERE url=? LIMIT 1", (url,))
        for row in cursor:
            return row[0], row[1], row[2], row[3]

    def list_html(self, summary, type):
        """Search cached pages with a LIKE pattern built from *summary*.

        *type* selects the matched field(s): "0" summary/title/url,
        "1" title, "2" site, "3" url, "4" summary.  Returns [] for an
        empty query or an unknown *type* — the original raised
        UnboundLocalError when *type* was anything else.
        """
        if summary == "":
            return []
        pattern = f"%{summary}%"
        select = "SELECT url,site,title,content FROM html WHERE "
        if type == "0":
            cursor = self.db.execute(
                select + "summary like ? OR title like ? OR url like ?",
                (pattern, pattern, pattern))
        elif type == "1":
            cursor = self.db.execute(select + "title like ?", (pattern,))
        elif type == "2":
            cursor = self.db.execute(select + "site like ?", (pattern,))
        elif type == "3":
            cursor = self.db.execute(select + "url like ?", (pattern,))
        elif type == "4":
            cursor = self.db.execute(select + "summary like ?", (pattern,))
        else:
            return []
        return [(r[0], r[1], r[2], r[3]) for r in cursor]

    def list_site(self, url=""):
        """Return [(url, site, interval, updated), ...] for every site,
        or just the row matching *url* when one is given."""
        if url == "":
            cursor = self.db.execute(
                "SELECT url,site,interval,updated FROM site WHERE 1")
        else:
            cursor = self.db.execute(
                "SELECT url,site,interval,updated FROM site WHERE url=?", (url,))
        return [(r[0], r[1], r[2], r[3]) for r in cursor]

    def list_page(self):
        """Return the number of pages whose snapshot has completed
        (``summary`` set)."""
        cursor = self.db.execute(
            "SELECT count(*) FROM html WHERE summary IS NOT NULL")
        for row in cursor:
            return row[0]

    def list_raw(self):
        """Return [(url, content, site, title), ...] for every cached page."""
        cursor = self.db.execute(
            "SELECT url,content,site,title FROM html WHERE 1")
        return [(r[0], r[1], r[2], r[3]) for r in cursor]
|
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
beautifulsoup4==4.12.3 | ||
Flask==3.0.1 | ||
flask_compress==1.14 | ||
Flask_HTTPAuth==4.8.0 | ||
selenium==4.17.2 | ||
whoosh==2.7.4 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,73 @@ | ||
import sys | ||
import subprocess | ||
from bs4 import BeautifulSoup | ||
|
||
import hashlib | ||
import time | ||
from db import * | ||
|
||
def extract_text(html):
    """Strip all markup from *html* and return its text, newlines removed."""
    soup = BeautifulSoup(html, 'html.parser')
    plain = soup.get_text(separator='')
    # Drop CRLF first, then any remaining bare LF.
    for newline in ("\r\n", "\n"):
        plain = plain.replace(newline, "")
    return plain
|
||
def calc_md5(string):
    """Return the hex MD5 digest of *string* (UTF-8 encoded).

    Used only as a cache-file naming key, not for security.
    """
    return hashlib.md5(string.encode('utf-8')).hexdigest()
|
||
def singlefile(url):
    """Snapshot *url* with the single-file CLI.

    Returns a tuple ``(text, html)`` where *text* is the page's visible
    text and *html* the full snapshot; returns ``("", "")`` on any
    failure (missing binary, fetch error, parse error).
    """
    # Pick the Chrome binary per platform.  The paths stay quoted because
    # single-file parses its option values shell-style.
    if sys.platform.startswith('darwin'):
        chrome_path = '"/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"'
    elif sys.platform.startswith('win'):
        # BUGFIX: raw string — the original used a normal string containing
        # "\P", "\G", "\A", "\c", which are invalid escape sequences.
        chrome_path = r'"C:\Program Files\Google\Chrome\Application\chrome.exe"'
    else:
        chrome_path = '"/usr/bin/google-chrome"'
    command = [
        'single-file',
        '--block-scripts=false',
        '--browser-executable-path=' + chrome_path,
        '--browser-width=1600',
        '--browser-height=900',
        '--compress-CSS=true',
        '--browser-ignore-insecure-certs=true',
        # NOTE(review): hard-coded proxy endpoint (and, previously,
        # commented-out credentials) belong in configuration, not source.
        '--http-proxy-server="http://local-lab.hz2016.cn:7890"',
        '--save-original-urls=true',
        '--dump-content=true',
        url
    ]
    try:
        # List argv with shell=False: *url* cannot inject shell syntax.
        result = subprocess.run(command, capture_output=True, text=True)
        return extract_text(result.stdout), result.stdout
    except Exception:
        # Narrowed from a bare ``except:`` so KeyboardInterrupt/SystemExit
        # still propagate; any runtime failure yields the empty snapshot.
        return "", ""
|
||
# Worker loop: repeatedly pick a not-yet-cached page, snapshot it with
# single-file, write the HTML to disk, and record the result in the DB.
db=sqliteDB()
while True:
    try:
        try:
            # random_html() returns None when nothing is pending, so the
            # unpack raises TypeError — back off for five minutes.
            url,site,title=db.random_html()
        except:
            time.sleep(300)
            continue
        print(site,url,title,end="")
        # summary = visible page text, content = full HTML snapshot.
        summary,content=singlefile("https://"+url)
        folder_name = "html"
        if not os.path.exists(folder_name):
            os.makedirs(folder_name)
        # Cache filename is the MD5 of the page URL.
        content_md5=calc_md5(url)
        file_path=os.path.join(folder_name,content_md5+".html")
        with open(file_path,"w") as f:
            f.write(content)
        # The MD5 key is stored in html.content for later lookup by /show.
        db.insert_html(url, site, title, summary,content_md5)
        print(" SUCCESS")
    except:
        # NOTE(review): bare except silently swallows every error
        # (including KeyboardInterrupt) — consider narrowing and logging.
        pass
|
Oops, something went wrong.