-
Notifications
You must be signed in to change notification settings - Fork 23
/
Copy pathbuild.py
85 lines (68 loc) · 2.67 KB
/
build.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import os
import sys
import csv
import subprocess
import json
from pathlib import Path
def load_state(state_file):
    """Return the previously saved processing state.

    Args:
        state_file: Path (str or path-like) of the JSON state file.

    Returns:
        The decoded JSON content of ``state_file``, or an empty dict when
        the file does not exist.
    """
    # A missing file simply means nothing has been processed yet.
    if not os.path.exists(state_file):
        return {}

    with open(state_file, 'r') as handle:
        return json.load(handle)
def save_state(state_file, state):
    """Persist the processing state as indented JSON.

    The state is serialized in memory first and written to a sibling
    ``<state_file>.tmp`` file that is then moved over ``state_file``.  This
    way a serialization error or an interruption during the write never
    leaves ``state_file`` empty or half-written.

    Args:
        state_file: Destination path (str or path-like) of the JSON file.
        state: JSON-serializable object to store.

    Raises:
        TypeError: If ``state`` is not JSON-serializable; ``state_file`` is
            left untouched in that case.
        OSError: If the temporary file cannot be written or moved into place.
    """
    # Serialize before touching the disk so a bad value cannot truncate the
    # existing state file.
    payload = json.dumps(state, indent=4)

    # Same directory as the target so the final rename stays on one
    # filesystem; a leftover temp file is simply overwritten by the next save.
    tmp_path = os.fspath(state_file) + ".tmp"
    with open(tmp_path, 'w', encoding='utf-8') as f:
        f.write(payload)
    os.replace(tmp_path, state_file)
def process_author(author):
    """Run the matching processor script for each unprocessed CSV row.

    Reads ``authors/<author>/published_content.csv`` (columns ``URL``,
    ``Kind`` and optionally ``SubKind``).  For every row it invokes
    ``processors/<Kind>_processor.py`` with the current Python interpreter,
    passing the URL, the ``downloads/<author>`` directory and the sub-kind
    as arguments.  Rows whose processor exits successfully are recorded in
    ``authors/<author>/state.json`` so that later runs skip them.

    Args:
        author: Name of the author's sub-directory under ``authors/`` and
            ``downloads/``.

    Returns:
        None.  Progress and per-row errors are printed to stdout.

    Raises:
        KeyError: If the CSV header has no ``URL`` or ``Kind`` column.
    """
    # Define paths
    author_dir = Path(f"authors/{author}")
    download_dir = Path(f"downloads/{author}")
    state_file = author_dir / "state.json"
    content_csv = author_dir / "published_content.csv"

    # Ensure directories exist
    download_dir.mkdir(parents=True, exist_ok=True)

    # Load previous state
    state = load_state(state_file)

    # Read CSV
    if not content_csv.exists():
        print(f"No published_content.csv found for author {author}.")
        return

    with open(content_csv, 'r', newline='', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            # DictReader fills cells missing from a short row with None, so
            # normalize to '' before stripping; a missing URL/Kind *column*
            # still raises KeyError as a loud configuration error.
            url = (row['URL'] or '').strip()
            kind = (row['Kind'] or '').strip()
            subkind = (row.get('SubKind') or '').strip()  # Read SubKind

            if not url:  # Skip empty URL
                print(f"Skipping empty URL for kind {kind}.")
                continue

            # Check if already processed
            state_key = f"{url}_{kind}_{subkind}"  # Include SubKind in state key
            if state.get(state_key):
                print(f"Skipping {url} as it has already been processed.")
                continue

            processor_script = Path(f"processors/{kind}_processor.py")
            if not processor_script.exists():
                print(f"No processor found for kind {kind}. Skipping.")
                continue

            print(f"Processing {url} with kind {kind} and subkind {subkind}...")
            try:
                # Argument list (no shell) keeps CSV values from being
                # interpreted by a shell.
                subprocess.run(
                    [sys.executable, str(processor_script), url, str(download_dir), subkind],
                    check=True,
                )
            except subprocess.CalledProcessError as e:
                # A failing processor only skips this row; it stays
                # unrecorded so the next run retries it.
                print(f"Error processing {url} with kind {kind}: {e}")
            else:
                print(f"Successfully processed {url}.")
                # Update state after every success so progress survives an
                # interruption later in the loop.
                state[state_key] = {"url": url, "kind": kind, "subkind": subkind}
                save_state(state_file, state)
if __name__ == "__main__":
    # Script entry point: expects exactly one positional argument, the
    # author whose content should be built.
    cli_args = sys.argv[1:]
    if len(cli_args) != 1:
        print("Usage: build.py <author>")
        sys.exit(1)

    author_name = cli_args[0]
    print(f"Building content for author: {author_name}")

    # Any failure is reported on stdout; the script does not re-raise.
    try:
        process_author(author_name)
    except Exception as exc:
        print(f"An error occurred while processing author {author_name}: {exc}")