Skip to content

Commit

Permalink
First commit
Browse files Browse the repository at this point in the history
  • Loading branch information
huangzheng2016 committed Jan 31, 2024
0 parents commit 6a474bd
Show file tree
Hide file tree
Showing 15 changed files with 724 additions and 0 deletions.
40 changes: 40 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# AuroraWebSpider
为在离线环境下部份无趣的CTF比赛提供相应的数(现)据(学)支(现)持(卖)
1. 定时循环抓取博客链接及外链
2. 缓存为离线html文件
3. 提供相应的检索功能

TODO:
1. 抓取对应的git文件
2. 缓存部份附件/PDF
3. 删除无效网页/缓存

**本项目仅供学习参考,请勿将服务公开,并且在授权后进行相关操作**
在项目测试中,使用了部份师傅的博客进行测试,感谢他们的支持。

## 效果如图
### 站点界面
![](./picture/1.png)
### 筛选/搜索界面
![](./picture/2.png)
### 缓存界面
![](./picture/3.png)

### Web Server
可视化前端
```shell
python app.py
```

### Spider Server
此服务用于递归爬取网站内容
```shell
python spider.py
```

### Singlefile Server
此服务用于静态页面抓取
```shell
npm install -g "single-file-cli"
python singlefile.py
```
160 changes: 160 additions & 0 deletions app.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
import os
import socket
from datetime import datetime

from bs4 import BeautifulSoup
from flask import Flask, request, render_template, redirect, url_for, send_from_directory
from flask_compress import Compress
from flask_httpauth import HTTPBasicAuth

from db import *


socket.setdefaulttimeout(120)
app = Flask(__name__)
app.config['SEND_FILE_MAX_AGE_DEFAULT'] = 3600
app.config['STATIC_FOLDER'] = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'static')
auth = HTTPBasicAuth()
compress = Compress(app)

@auth.verify_password
def verify_password(username, password):
if password == 'MakeAuroraGreatAgain':
return True
return False


@app.route('/static/<path:filename>')
def static_files(filename):
return send_from_directory(app.config['STATIC_FOLDER'], filename)
def isnas(variable):
return variable.isalnum() and variable.islower()
class BreakLoop(Exception):
pass
startwith = ["https://","http://",""]

@app.route('/show', methods=['GET'])
def show():
url=request.args.get("url")
if url == "None" or isnas == False:
return redirect(url_for('list'))
file_path = os.path.join("html",url+".html")
with open(file_path,"r") as f:
data=f.read()
try:
cachetime = datetime.fromtimestamp(os.path.getmtime(file_path)).strftime('%Y/%m/%d %H:%M')
db = sqliteDB()
items = db.list_raw()
banner="Aurora 离线缓存"
for item in items:
if item[0] is None or item[1] is None or item[0] == "" or item[1] == "":
items.remove(item)
elif item[1]==url:
banner=f"网站/SITE:{item[2]} 网址/URL:{item[0]} 缓存时间/CACHETIME:{cachetime}"
print(banner)
soup = BeautifulSoup(data, 'html.parser')
tags = soup.find_all()
for tag in tags:
try:
href = tag['href']
try:
for rep in items:
for start in startwith:
if href == start+rep[0]:
href = f"/show?url={rep[1]}"
raise BreakLoop
except BreakLoop:
tag['href'] = href

except:
pass

banner_div = soup.new_tag('div', attrs={'class': 'aurora_banner'})
banner_div['style'] = 'width: 100%; height: 30px; background-color: #007aff; color: white; text-align: center; position: fixed; top: 0; left: 0; z-index: 9999;'
banner_div.string = banner
body_tag = soup.body
body_tag.insert(0, banner_div)

data = soup.prettify()
except:
pass
return data

'''
@app.route('/rebuild', methods=['GET'])
def rebuild():
return redirect(url_for('list'))
'''

@app.route('/site', methods=['GET'])
def site():
db = sqliteDB()
items=db.list_site("")
return render_template('site.html',items=items)

@app.route('/edit', methods=['GET'])
@auth.login_required
def edit():
db = sqliteDB()
url=request.args.get("url")
if url is None:
items=[("","","","")]
else:
items=db.list_site(url)
return render_template('edit.html',items=items[0])

@app.route('/update', methods=['GET'])
@auth.login_required
def update():
db = sqliteDB()
url=request.args.get("url")
if url is None:
return redirect(url_for('site'))
items=db.list_site(url)
item=items[0]
db.insert_site(item[0],item[1],-item[2])
return redirect(url_for('site'))

@app.route('/del', methods=['GET'])
@auth.login_required
def delsite():
db = sqliteDB()
url=request.args.get("url")
if url is None:
return redirect(url_for('site'))
db.del_site(url)
return redirect(url_for('site'))

@app.route('/add', methods=['GET'])
@auth.login_required
def add():
db = sqliteDB()
site=request.args.get("site")
url=request.args.get("url")
interval=request.args.get("interval")
if interval is None:
interval=30
try:
interval = int(interval)
except ValueError:
interval = 30
if url!="" and site!="":
db.insert_site(url, site, interval)
return redirect(url_for('site'))

@app.route('/list', methods=['GET'])
@app.route('/', methods=['GET'])
def list():
db = sqliteDB()
summary=request.args.get("search")
type=request.args.get("type")
if summary is None:
summary=""
if type is None:
type="0"

items=db.list_html(summary,type)
pages=db.list_page()
return render_template('list.html',pages=pages, items=items)


if __name__ == '__main__':
app.run(host="0.0.0.0", port=5002, debug=False)

7 changes: 7 additions & 0 deletions data.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
[
{
"time": "2024/1/1 00:00",
"name": "Aurora\u95e8\u6237",
"url": "https://www.szu.moe"
}
]
96 changes: 96 additions & 0 deletions db.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
import os
import sqlite3
from whoosh.index import create_in,open_dir
from whoosh.fields import *
from whoosh.qparser import QueryParser, MultifieldParser
from whoosh.query import compound, Term
class sqliteDB:
def __init__(self) -> None:
if(os.path.exists('sqlite.db')==False):
self.db = sqlite3.connect('sqlite.db')
print("Building DataBase...")
self.make()
print("Building Database Succeed.")
else:
self.db = sqlite3.connect('sqlite.db')
self.make()
print("Connect Database Succeed.")
self.db.enable_load_extension(True)
self.db.execute('PRAGMA temp_store=MEMORY;')
self.db.execute('PRAGMA journal_mode=MEMORY;')
self.db.execute('PRAGMA auto_vacuum=INCREMENTAL;')

def make(self) -> None:
self.db.execute('''CREATE TABLE IF NOT EXISTS html(
url VARCHAR(256) PRIMARY KEY,
site VARCHAR(256),
title TEXT,
summary TEXT,
content TEXT,
updated TIMESTAMP DEFAULT CURRENT_TIMESTAMP);''')
self.db.execute('''CREATE TABLE IF NOT EXISTS site(
url VARCHAR(256) PRIMARY KEY,
site VARCHAR(256),
interval INT,
updated TIMESTAMP DEFAULT CURRENT_TIMESTAMP);''')

self.db.commit()

def insert_html(self, url, site, title=None, summary=None, content=None):

self.db.execute("REPLACE INTO html (url, site, title, summary, content) VALUES (?, ?, ?, ?, ?)",(url, site, title, summary, content))
self.db.commit()

def insert_site(self, url, site, interval):
cursor = self.db.execute("SELECT site FROM site WHERE url = ?",(url,))
for i in cursor:
self.db.execute("UPDATE html SET site=? WHERE site=?",(site, i[0]))
self.db.execute("REPLACE INTO site (url, site, interval) VALUES (?, ?, ?)",(url, site, interval))
self.db.commit()

def del_site(self,url):
self.db.execute("DELETE FROM site WHERE url=?",(url,))
self.db.commit()
def random_html(self):
cursor = self.db.execute("SELECT url,site,title FROM html WHERE summary IS NULL ORDER BY RANDOM() LIMIT 1")
for i in cursor:
return i[0],i[1],i[2]

def show_html(self,url):
cursor = self.db.execute("SELECT url,site,title,content FROM html WHERE url=? LIMIT 1",(url,))
for i in cursor:
return i[0],i[1],i[2],i[3]

def list_html(self,summary,type):
if summary=="":
return []
else:
summary=f"%{summary}%"
if type=="0":
cursor = self.db.execute("SELECT url,site,title,content FROM html WHERE summary like ? OR title like ? OR url like ?",(summary,summary,summary))
elif type=="1":
cursor = self.db.execute("SELECT url,site,title,content FROM html WHERE title like ?",(summary,))
elif type=="2":
cursor = self.db.execute("SELECT url,site,title,content FROM html WHERE site like ?",(summary,))
elif type=="3":
cursor = self.db.execute("SELECT url,site,title,content FROM html WHERE url like ?",(summary,))
elif type=="4":
cursor = self.db.execute("SELECT url,site,title,content FROM html WHERE summary like ?",(summary,))
return [(i[0],i[1],i[2],i[3]) for i in cursor]

def list_site(self,url=""):
if url=="":
cursor = self.db.execute("SELECT url,site,interval,updated FROM site WHERE 1")
else:
cursor = self.db.execute("SELECT url,site,interval,updated FROM site WHERE url=?",(url,))
return [(i[0],i[1],i[2],i[3]) for i in cursor]

def list_page(self):
cursor = self.db.execute("SELECT count(*) FROM html WHERE summary IS NOT NULL")
for i in cursor:
return i[0]

def list_raw(self):
cursor = self.db.execute("SELECT url,content,site,title FROM html WHERE 1")
return [(i[0],i[1],i[2],i[3]) for i in cursor]

Binary file added picture/1.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added picture/2.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added picture/3.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
6 changes: 6 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
beautifulsoup4==4.12.3
Flask==3.0.1
flask_compress==1.14
Flask_HTTPAuth==4.8.0
selenium==4.17.2
whoosh==2.7.4
73 changes: 73 additions & 0 deletions singlefile.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
import sys
import subprocess
from bs4 import BeautifulSoup

import hashlib
import time
from db import *

def extract_text(html):
soup = BeautifulSoup(html, 'html.parser')
text = soup.get_text(separator='')
return text.replace("\r\n","").replace("\n","")

def calc_md5(string):
md5_hash = hashlib.md5()
md5_hash.update(string.encode('utf-8'))
return md5_hash.hexdigest()

def singlefile(url):
chrome_path = '""'
if sys.platform.startswith('darwin'):
chrome_path = '"/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"'
elif sys.platform.startswith('win'):
chrome_path = '"C:\Program Files\Google\Chrome\Application\chrome.exe"'
else:
chrome_path = '"/usr/bin/google-chrome"'
command = [
'single-file',
'--block-scripts=false',
'--browser-executable-path=' + chrome_path,
'--browser-width=1600',
'--browser-height=900',
'--compress-CSS=true',
'--browser-ignore-insecure-certs=true',
'--http-proxy-server="http://local-lab.hz2016.cn:7890"',
#'--http-proxy-username="hz2016"',
#'--http-proxy-password="hz20162333"',
'--save-original-urls=true',
#'--max-resource-size=50',
#'--browser-wait-delay=1000',
#'--browser-load-max-time=60000',
#'--load-deferred-images-max-idle-time=10000',
'--dump-content=true',
url
]
try:
result = subprocess.run(command, capture_output=True, text=True)
return extract_text(result.stdout),result.stdout
except:
return "",""

db=sqliteDB()
while True:
try:
try:
url,site,title=db.random_html()
except:
time.sleep(300)
continue
print(site,url,title,end="")
summary,content=singlefile("https://"+url)
folder_name = "html"
if not os.path.exists(folder_name):
os.makedirs(folder_name)
content_md5=calc_md5(url)
file_path=os.path.join(folder_name,content_md5+".html")
with open(file_path,"w") as f:
f.write(content)
db.insert_html(url, site, title, summary,content_md5)
print(" SUCCESS")
except:
pass

Loading

0 comments on commit 6a474bd

Please sign in to comment.