Skip to content

Commit

Permalink
First commit
Browse files Browse the repository at this point in the history
  • Loading branch information
huangzheng2016 committed Jan 31, 2024
0 parents commit 6a474bd
Show file tree
Hide file tree
Showing 15 changed files with 724 additions and 0 deletions.
40 changes: 40 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# AuroraWebSpider
为在离线环境下部份无趣的CTF比赛提供相应的数(现)据(学)支(现)持(卖)
1. 定时循环抓取博客链接及外链
2. 缓存为离线html文件
3. 提供相应的检索功能

TODO:
1. 抓取对应的git文件
2. 缓存部份附件/PDF
3. 删除无效网页/缓存

**本项目仅供学习参考,请勿将服务公开,并且在授权后进行相关操作**
在项目测试中,使用了部份师傅的博客进行测试,感谢他们的支持。

## 效果如图
### 站点界面
![](./picture/1.png)
### 筛选/搜索界面
![](./picture/2.png)
### 缓存界面
![](./picture/3.png)

### Web Server
可视化前端
```shell
python app.py
```

### Spider Server
此服务用于递归爬取网站内容
```shell
python spider.py
```

### Singlefile Server
此服务用于静态页面抓取
```shell
npm install -g "single-file-cli"
python singlefile.py
```
160 changes: 160 additions & 0 deletions app.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
import os
import socket
from datetime import datetime

from bs4 import BeautifulSoup
from flask import Flask, request, render_template, redirect, url_for, send_from_directory
from flask_compress import Compress
from flask_httpauth import HTTPBasicAuth

from db import *


socket.setdefaulttimeout(120)
app = Flask(__name__)
app.config['SEND_FILE_MAX_AGE_DEFAULT'] = 3600
app.config['STATIC_FOLDER'] = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'static')
auth = HTTPBasicAuth()
compress = Compress(app)

@auth.verify_password
def verify_password(username, password):
if password == 'MakeAuroraGreatAgain':
return True
return False


@app.route('/static/<path:filename>')
def static_files(filename):
return send_from_directory(app.config['STATIC_FOLDER'], filename)
def isnas(variable):
return variable.isalnum() and variable.islower()
class BreakLoop(Exception):
pass
startwith = ["https://","http://",""]

@app.route('/show', methods=['GET'])
def show():
url=request.args.get("url")
if url == "None" or isnas == False:
return redirect(url_for('list'))
file_path = os.path.join("html",url+".html")
with open(file_path,"r") as f:
data=f.read()
try:
cachetime = datetime.fromtimestamp(os.path.getmtime(file_path)).strftime('%Y/%m/%d %H:%M')
db = sqliteDB()
items = db.list_raw()
banner="Aurora 离线缓存"
for item in items:
if item[0] is None or item[1] is None or item[0] == "" or item[1] == "":
items.remove(item)
elif item[1]==url:
banner=f"网站/SITE:{item[2]} 网址/URL:{item[0]} 缓存时间/CACHETIME:{cachetime}"
print(banner)
soup = BeautifulSoup(data, 'html.parser')
tags = soup.find_all()
for tag in tags:
try:
href = tag['href']
try:
for rep in items:
for start in startwith:
if href == start+rep[0]:
href = f"/show?url={rep[1]}"
raise BreakLoop
except BreakLoop:
tag['href'] = href

except:
pass

banner_div = soup.new_tag('div', attrs={'class': 'aurora_banner'})
banner_div['style'] = 'width: 100%; height: 30px; background-color: #007aff; color: white; text-align: center; position: fixed; top: 0; left: 0; z-index: 9999;'
banner_div.string = banner
body_tag = soup.body
body_tag.insert(0, banner_div)

data = soup.prettify()
except:
pass
return data

'''
@app.route('/rebuild', methods=['GET'])
def rebuild():
return redirect(url_for('list'))
'''

@app.route('/site', methods=['GET'])
def site():
db = sqliteDB()
items=db.list_site("")
return render_template('site.html',items=items)

@app.route('/edit', methods=['GET'])
@auth.login_required
def edit():
db = sqliteDB()
url=request.args.get("url")
if url is None:
items=[("","","","")]
else:
items=db.list_site(url)
return render_template('edit.html',items=items[0])

@app.route('/update', methods=['GET'])
@auth.login_required
def update():
db = sqliteDB()
url=request.args.get("url")
if url is None:
return redirect(url_for('site'))
items=db.list_site(url)
item=items[0]
db.insert_site(item[0],item[1],-item[2])
return redirect(url_for('site'))

@app.route('/del', methods=['GET'])
@auth.login_required
def delsite():
db = sqliteDB()
url=request.args.get("url")
if url is None:
return redirect(url_for('site'))
db.del_site(url)
return redirect(url_for('site'))

@app.route('/add', methods=['GET'])
@auth.login_required
def add():
db = sqliteDB()
site=request.args.get("site")
url=request.args.get("url")
interval=request.args.get("interval")
if interval is None:
interval=30
try:
interval = int(interval)
except ValueError:
interval = 30
if url!="" and site!="":
db.insert_site(url, site, interval)
return redirect(url_for('site'))

@app.route('/list', methods=['GET'])
@app.route('/', methods=['GET'])
def list():
db = sqliteDB()
summary=request.args.get("search")
type=request.args.get("type")
if summary is None:
summary=""
if type is None:
type="0"

items=db.list_html(summary,type)
pages=db.list_page()
return render_template('list.html',pages=pages, items=items)


if __name__ == '__main__':
app.run(host="0.0.0.0", port=5002, debug=False)

7 changes: 7 additions & 0 deletions data.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
[
{
"time": "2024/1/1 00:00",
"name": "Aurora\u95e8\u6237",
"url": "https://www.szu.moe"
}
]
96 changes: 96 additions & 0 deletions db.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
import os
import sqlite3
from whoosh.index import create_in,open_dir
from whoosh.fields import *
from whoosh.qparser import QueryParser, MultifieldParser
from whoosh.query import compound, Term
class sqliteDB:
def __init__(self) -> None:
if(os.path.exists('sqlite.db')==False):
self.db = sqlite3.connect('sqlite.db')
print("Building DataBase...")
self.make()
print("Building Database Succeed.")
else:
self.db = sqlite3.connect('sqlite.db')
self.make()
print("Connect Database Succeed.")
self.db.enable_load_extension(True)
self.db.execute('PRAGMA temp_store=MEMORY;')
self.db.execute('PRAGMA journal_mode=MEMORY;')
self.db.execute('PRAGMA auto_vacuum=INCREMENTAL;')

def make(self) -> None:
self.db.execute('''CREATE TABLE IF NOT EXISTS html(
url VARCHAR(256) PRIMARY KEY,
site VARCHAR(256),
title TEXT,
summary TEXT,
content TEXT,
updated TIMESTAMP DEFAULT CURRENT_TIMESTAMP);''')
self.db.execute('''CREATE TABLE IF NOT EXISTS site(
url VARCHAR(256) PRIMARY KEY,
site VARCHAR(256),
interval INT,
updated TIMESTAMP DEFAULT CURRENT_TIMESTAMP);''')

self.db.commit()

def insert_html(self, url, site, title=None, summary=None, content=None):

self.db.execute("REPLACE INTO html (url, site, title, summary, content) VALUES (?, ?, ?, ?, ?)",(url, site, title, summary, content))
self.db.commit()

def insert_site(self, url, site, interval):
cursor = self.db.execute("SELECT site FROM site WHERE url = ?",(url,))
for i in cursor:
self.db.execute("UPDATE html SET site=? WHERE site=?",(site, i[0]))
self.db.execute("REPLACE INTO site (url, site, interval) VALUES (?, ?, ?)",(url, site, interval))
self.db.commit()

def del_site(self,url):
self.db.execute("DELETE FROM site WHERE url=?",(url,))
self.db.commit()
def random_html(self):
cursor = self.db.execute("SELECT url,site,title FROM html WHERE summary IS NULL ORDER BY RANDOM() LIMIT 1")
for i in cursor:
return i[0],i[1],i[2]

def show_html(self,url):
cursor = self.db.execute("SELECT url,site,title,content FROM html WHERE url=? LIMIT 1",(url,))
for i in cursor:
return i[0],i[1],i[2],i[3]

def list_html(self,summary,type):
if summary=="":
return []
else:
summary=f"%{summary}%"
if type=="0":
cursor = self.db.execute("SELECT url,site,title,content FROM html WHERE summary like ? OR title like ? OR url like ?",(summary,summary,summary))
elif type=="1":
cursor = self.db.execute("SELECT url,site,title,content FROM html WHERE title like ?",(summary,))
elif type=="2":
cursor = self.db.execute("SELECT url,site,title,content FROM html WHERE site like ?",(summary,))
elif type=="3":
cursor = self.db.execute("SELECT url,site,title,content FROM html WHERE url like ?",(summary,))
elif type=="4":
cursor = self.db.execute("SELECT url,site,title,content FROM html WHERE summary like ?",(summary,))
return [(i[0],i[1],i[2],i[3]) for i in cursor]

def list_site(self,url=""):
if url=="":
cursor = self.db.execute("SELECT url,site,interval,updated FROM site WHERE 1")
else:
cursor = self.db.execute("SELECT url,site,interval,updated FROM site WHERE url=?",(url,))
return [(i[0],i[1],i[2],i[3]) for i in cursor]

def list_page(self):
cursor = self.db.execute("SELECT count(*) FROM html WHERE summary IS NOT NULL")
for i in cursor:
return i[0]

def list_raw(self):
cursor = self.db.execute("SELECT url,content,site,title FROM html WHERE 1")
return [(i[0],i[1],i[2],i[3]) for i in cursor]

Binary file added picture/1.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added picture/2.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added picture/3.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
6 changes: 6 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
beautifulsoup4==4.12.3
Flask==3.0.1
flask_compress==1.14
Flask_HTTPAuth==4.8.0
selenium==4.17.2
whoosh==2.7.4
73 changes: 73 additions & 0 deletions singlefile.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
import sys
import subprocess
from bs4 import BeautifulSoup

import hashlib
import time
from db import *

def extract_text(html):
soup = BeautifulSoup(html, 'html.parser')
text = soup.get_text(separator='')
return text.replace("\r\n","").replace("\n","")

def calc_md5(string):
md5_hash = hashlib.md5()
md5_hash.update(string.encode('utf-8'))
return md5_hash.hexdigest()

def singlefile(url):
chrome_path = '""'
if sys.platform.startswith('darwin'):
chrome_path = '"/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"'
elif sys.platform.startswith('win'):
chrome_path = '"C:\Program Files\Google\Chrome\Application\chrome.exe"'
else:
chrome_path = '"/usr/bin/google-chrome"'
command = [
'single-file',
'--block-scripts=false',
'--browser-executable-path=' + chrome_path,
'--browser-width=1600',
'--browser-height=900',
'--compress-CSS=true',
'--browser-ignore-insecure-certs=true',
'--http-proxy-server="http://local-lab.hz2016.cn:7890"',
#'--http-proxy-username="hz2016"',
#'--http-proxy-password="hz20162333"',
'--save-original-urls=true',
#'--max-resource-size=50',
#'--browser-wait-delay=1000',
#'--browser-load-max-time=60000',
#'--load-deferred-images-max-idle-time=10000',
'--dump-content=true',
url
]
try:
result = subprocess.run(command, capture_output=True, text=True)
return extract_text(result.stdout),result.stdout
except:
return "",""

db=sqliteDB()
while True:
try:
try:
url,site,title=db.random_html()
except:
time.sleep(300)
continue
print(site,url,title,end="")
summary,content=singlefile("https://"+url)
folder_name = "html"
if not os.path.exists(folder_name):
os.makedirs(folder_name)
content_md5=calc_md5(url)
file_path=os.path.join(folder_name,content_md5+".html")
with open(file_path,"w") as f:
f.write(content)
db.insert_html(url, site, title, summary,content_md5)
print(" SUCCESS")
except:
pass

Loading

0 comments on commit 6a474bd

Please sign in to comment.