Skip to content

Commit

Permalink
refactor: tidy codes
Browse files Browse the repository at this point in the history
Signed-off-by: Jack Cherng <[email protected]>
  • Loading branch information
jfcherng committed Mar 31, 2022
1 parent 3e88b19 commit 5392ba4
Show file tree
Hide file tree
Showing 7 changed files with 92 additions and 133 deletions.
1 change: 0 additions & 1 deletion .python-version

This file was deleted.

58 changes: 27 additions & 31 deletions caac_package/Crawler.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
from .ProjectConfig import ProjectConfig
from .TaskQueue import TaskQueue
from concurrent.futures import ThreadPoolExecutor
from pyquery import PyQuery as pq
from typing import Iterable, List, Optional
import cloudscraper
import codecs
import lxml
import lxml.etree
import os
import re
import sqlite3
Expand All @@ -18,7 +20,7 @@ class Crawler:
collegeListUrl = ""
resultDir = ""

def __init__(self, year, apply_stage, projectBaseUrl=""):
def __init__(self, year: int, apply_stage: str, projectBaseUrl: str = "") -> None:
self.year = year
self.apply_stage = apply_stage
self.resultDir = ProjectConfig.getCrawledResultDir(self.year, self.apply_stage)
Expand All @@ -41,7 +43,7 @@ def __init__(self, year, apply_stage, projectBaseUrl=""):
# -------------- #
self.collegeListUrl = self.projectBaseUrl + "collegeList.htm"

def run(self, showMessage=False):
def run(self, showMessage: bool = False) -> None:
# prepare the result directory
os.makedirs(self.resultDir, exist_ok=True)

Expand All @@ -53,8 +55,8 @@ def run(self, showMessage=False):
if showMessage:
print(f"[Crawler] Files are stored in: {self.resultDir}")

def fetchAndSaveCollegeList(self):
departmentLists = []
def fetchAndSaveCollegeList(self) -> List[str]:
departmentLists: List[str] = []

# the user may give a wrong URL in the last run
# in that case, we overwrite the old file and run again
Expand All @@ -66,49 +68,43 @@ def fetchAndSaveCollegeList(self):
links = pq(content)("a")

for link in links.items():
href = link.attr("href")
href = str(link.attr("href"))
if "common/" in href or "extra/" in href:
departmentLists.append(href)

return departmentLists

def fetchAndSaveDepartmentLists(self, filepaths):
departmentApplys = []
def fetchAndSaveDepartmentLists(self, filepaths: Iterable[str]) -> List[str]:
departmentApplys: List[str] = []

def workerFetchPage(filepath):
def workerFetchPage(filepath: str) -> None:
content = self.fetchAndSavePage(self.projectBaseUrl + filepath, overwrite=False, log=True)
links = pq(content)("a")
for link in links.items():
href = link.attr("href")
href = str(link.attr("href"))
if "apply/" in href:
for prefix in ["common/", "extra/"]:
if prefix in filepath:
departmentApplys.append(self.simplifyUrl(prefix + href))
break

taskQueue = TaskQueue(num_workers=ProjectConfig.CRAWLER_WORKER_NUM)

for filepath in filepaths:
taskQueue.add_task(workerFetchPage, filepath=filepath)

taskQueue.join()
with ThreadPoolExecutor(max_workers=ProjectConfig.CRAWLER_WORKER_NUM) as executor:
for filepath in filepaths:
executor.submit(workerFetchPage, filepath=filepath)

return departmentApplys

def fetchAndSaveDepartmentApplys(self, filepaths):
def workerFetchPage(filepath):
def fetchAndSaveDepartmentApplys(self, filepaths: Iterable[str]) -> None:
def workerFetchPage(filepath: str) -> None:
self.fetchAndSavePage(self.projectBaseUrl + filepath, overwrite=False, log=True)

taskQueue = TaskQueue(num_workers=ProjectConfig.CRAWLER_WORKER_NUM)

for filepath in filepaths:
taskQueue.add_task(workerFetchPage, filepath=filepath)

taskQueue.join()
with ThreadPoolExecutor(max_workers=ProjectConfig.CRAWLER_WORKER_NUM) as executor:
for filepath in filepaths:
executor.submit(workerFetchPage, filepath=filepath)

print("[crawler_caac] Finish crawling.")

def fetchAndSavePage(self, url, overwrite=True, log=False):
def fetchAndSavePage(self, url: str, overwrite: bool = True, log: bool = False) -> str:
"""fetch and save a page depending on its URL"""

filepath = url.replace(self.projectBaseUrl, "")
Expand All @@ -122,11 +118,11 @@ def fetchAndSavePage(self, url, overwrite=True, log=False):
if log is True:
print(f"[Fetch] {url}")

content = self.getPage(url)
content = self.getPage(url) or ""
self.writeFile(filepathAbsolute, content)
return content

def generateDb(self):
def generateDb(self) -> None:
"""generate a db file from crawled html files"""

dbFilepath = ProjectConfig.getCrawledDbFile(self.year, self.apply_stage)
Expand Down Expand Up @@ -249,7 +245,7 @@ def generateDb(self):
print("[crawler_caac] DB Gen: done.")

@classmethod
def getPage(self, url):
def getPage(cls, url: str, retry_s: float = 3.0) -> Optional[str]:
"""get a certain web page"""

while True:
Expand All @@ -268,9 +264,9 @@ def getPage(self, url):
return None

# fail to fetch the page, let's sleep for a while
time.sleep(1)
time.sleep(retry_s)

def writeFile(self, filename, content="", mode="w", codec="utf-8"):
def writeFile(self, filename: str, content: str = "", mode: str = "w", codec: str = "utf-8") -> None:
"""write content to an external file"""

# create directory if the directory does not exist yet
Expand All @@ -281,7 +277,7 @@ def writeFile(self, filename, content="", mode="w", codec="utf-8"):
with codecs.open(filename, mode, codec) as f:
f.write(content)

def simplifyUrl(self, url):
def simplifyUrl(self, url: str) -> str:
url = re.sub(r"(^|/)./", r"\1", url)
url = re.sub(r"(?<!:)/{2,}", r"/", url)

Expand Down
100 changes: 47 additions & 53 deletions caac_package/LookupDb.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from typing import Any, Dict, Iterable, Optional
import argparse
import os
import pandas as pd
import sqlite3
Expand All @@ -6,7 +8,7 @@
class LookupDb:

# db handle
conn = None
conn: Optional[sqlite3.Connection] = None

# universityMap = {
# '001': '國立臺灣大學',
Expand All @@ -20,7 +22,7 @@ class LookupDb:
# }
departmentMap = {}

def __init__(self, dbFilepath):
def __init__(self, dbFilepath: str) -> None:
if not os.path.isfile(dbFilepath):
raise Exception(f"DB file does not exist: {dbFilepath}")

Expand Down Expand Up @@ -50,14 +52,14 @@ def loadDb(self):

return self.universityMap, self.departmentMap

def lookupByAdmissionIds(self, admissionIds):
def lookupByAdmissionIds(self, admissionIds: Iterable[str]) -> Dict[str, Any]:
results = {
# '准考證號': [ '系所編號', ... ],
# ...
}

assert self.conn
for admissionId in admissionIds:

cursor = self.conn.execute(
"""
SELECT departmentId
Expand All @@ -72,7 +74,8 @@ def lookupByAdmissionIds(self, admissionIds):

return results

def lookupByDepartmentIds(self, departmentIds):
def lookupByDepartmentIds(self, departmentIds: Iterable[str]) -> Dict[str, Any]:
assert self.conn
cursor = self.conn.execute(
"""
SELECT admissionId
Expand All @@ -87,30 +90,29 @@ def lookupByDepartmentIds(self, departmentIds):

return self.lookupByAdmissionIds(admissionIds)

def writeOutSieveResult(self, outputFile, lookupResult, args):
def writeOutSieveResult(
self,
outputFile: str,
lookupResult: Dict[str, Any],
args: argparse.Namespace,
) -> None:
# output the results (xlsx)
with pd.ExcelWriter(outputFile, engine="xlsxwriter") as writer:
workbook = writer.book

# fmt: off
cellFormat = workbook.add_format({
'align': 'left',
'valign': 'vcenter',
'text_wrap': True,
'font_size': 9,
})
# fmt: on
cellFormat = workbook.add_format(
{
"align": "left",
"valign": "vcenter",
"text_wrap": True,
"font_size": 9,
}
)

worksheet = workbook.add_worksheet("第一階段-篩選結果(甄選委員會)")
worksheet.freeze_panes(1, 1)

# fmt: off
worksheet.write_row(
0, 0,
[ '准考證號', '校名與系所' ],
cellFormat
)
# fmt: on
worksheet.write_row(0, 0, ["准考證號", "校名與系所"], cellFormat)

rowCnt = 1
for admissionId, departmentIds in lookupResult.items():
Expand All @@ -123,17 +125,16 @@ def writeOutSieveResult(self, outputFile, lookupResult, args):
universityId = departmentId[:3]
applieds.append(f"{self.universityMap[universityId]}\n{self.departmentMap[departmentId]}")

# fmt: off
worksheet.write_row(
rowCnt, 0,
[int(admissionId), *applieds],
cellFormat
)
# fmt: on
worksheet.write_row(rowCnt, 0, [int(admissionId), *applieds], cellFormat)

rowCnt += 1

def writeOutSieveResultNthuEe(self, outputFile, lookupResult, args):
def writeOutSieveResultNthuEe(
self,
outputFile: str,
lookupResult: Dict[str, Any],
args: argparse.Namespace,
) -> None:
def nthuSort(departmentId):
universityId = departmentId[:3]

Expand All @@ -146,7 +147,7 @@ def nthuSort(departmentId):
return departmentId

# list unique
ArgsDepartmentIds = list(set(filter(len, args.departmentIds.split(","))))
ArgsDepartmentIds = list(set(filter(None, args.departmentIds.split(","))))

# let's do some post processes
# - we only want to show departments that are not in args.departmentIds
Expand All @@ -161,30 +162,29 @@ def nthuSort(departmentId):

self.writeOutSieveResult(outputFile, postProcessedResults, args)

def writeOutEntranceResult(self, outputFile, lookupResult, args):
def writeOutEntranceResult(
self,
outputFile: str,
lookupResult: Dict[str, Any],
args: argparse.Namespace,
) -> None:
# output the results (xlsx)
with pd.ExcelWriter(outputFile, engine="xlsxwriter") as writer:
workbook = writer.book

# fmt: off
cellFormat = workbook.add_format({
'align': 'left',
'valign': 'vcenter',
'text_wrap': True,
'font_size': 9,
})
# fmt: on
cellFormat = workbook.add_format(
{
"align": "left",
"valign": "vcenter",
"text_wrap": True,
"font_size": 9,
}
)

worksheet = workbook.add_worksheet("第二階段-分發結果(甄選委員會)")
worksheet.freeze_panes(1, 1)

# fmt: off
worksheet.write_row(
0, 0,
[ '准考證號', '分發結果' ],
cellFormat
)
# fmt: on
worksheet.write_row(0, 0, ["准考證號", "分發結果"], cellFormat)

rowCnt = 1
for admissionId, departmentIds in lookupResult.items():
Expand All @@ -197,12 +197,6 @@ def writeOutEntranceResult(self, outputFile, lookupResult, args):
universityId = departmentId[:3]
applieds.append(f"{self.universityMap[universityId]}\n{self.departmentMap[departmentId]}")

# fmt: off
worksheet.write_row(
rowCnt, 0,
[int(admissionId), *applieds],
cellFormat
)
# fmt: on
worksheet.write_row(rowCnt, 0, [int(admissionId), *applieds], cellFormat)

rowCnt += 1
8 changes: 4 additions & 4 deletions caac_package/ProjectConfig.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,17 +26,17 @@ class ProjectConfig:
CRAWLED_DB_FILENAME = "sqlite3.db"

@classmethod
def getCrawledResultDir(self, year, apply_stage):
def getCrawledResultDir(cls, year: int, apply_stage: str) -> str:
"""Get the crawled result directory for a sepecific year/stage."""

year = Year.taiwanize(year)

return os.path.join(self.CRAWLER_RESULT_DIR.format(year), f"stage_{apply_stage}")
return os.path.join(cls.CRAWLER_RESULT_DIR.format(year), f"stage_{apply_stage}")

@classmethod
def getCrawledDbFile(self, year, apply_stage):
def getCrawledDbFile(cls, year: int, apply_stage: str) -> str:
"""Get the crawled db file for a sepecific year/stage."""

year = Year.taiwanize(year)

return os.path.join(self.getCrawledResultDir(year, apply_stage), self.CRAWLED_DB_FILENAME)
return os.path.join(cls.getCrawledResultDir(year, apply_stage), cls.CRAWLED_DB_FILENAME)
26 changes: 0 additions & 26 deletions caac_package/TaskQueue.py

This file was deleted.

Loading

0 comments on commit 5392ba4

Please sign in to comment.