-
Notifications
You must be signed in to change notification settings - Fork 30
/
Copy pathcrawl.py
78 lines (67 loc) · 2.38 KB
/
crawl.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import json
import os
import shutil
import tempfile
from bs4 import BeautifulSoup
from slugify import slugify
from constants import GOOGLE_DOC_INDEX_ID, GOOGLE_DOC_URL_REGEXP
from export import (
download_document,
download_drawings,
prepare_docx,
convert,
)
def get_index() -> dict:
    """
    Find all Google Docs links in documentation index

    Downloads the index document as HTML, scans every hyperlink in it and
    records each distinct document ID matched by GOOGLE_DOC_URL_REGEXP
    together with a Markdown file name derived from the link text.
    The resulting mapping is also written to ``index.json`` in the current
    working directory.

    Returns:
        Mapping of Google Doc ID to output file name. The index document
        itself is always present, mapped to ``index.md``.
    """
    document_map = {GOOGLE_DOC_INDEX_ID: 'index.md'}
    index_html = download_document(GOOGLE_DOC_INDEX_ID, fmt='html')
    soup = BeautifulSoup(index_html, 'html.parser')
    # href=True skips <a> tags that carry no href attribute (for example
    # named bookmark anchors); subscripting those raises KeyError.
    for elem in soup.find_all('a', href=True):
        match = GOOGLE_DOC_URL_REGEXP.search(elem['href'])
        if not match:
            continue
        document_id = match.group(1)
        if document_id in document_map:
            # Keep the first title seen for a document linked more than once
            continue
        if document_id == '1Ia4zYmkB6I6IbWPRlcZYYuMS1ZI55T99dp9LiMJqXCE':
            # Temporarily not available
            continue
        document_title = elem.get_text()
        if document_title == 'H':
            # Bad markup here
            document_title = 'How Inventories Work'
        document_slug = slugify(document_title, separator='_')
        filename = f'{document_slug}.md'
        document_map[document_id] = filename
    with open('index.json', 'w') as index_json:
        json.dump(document_map, index_json, indent=4)
    print(f'Found {len(document_map)} documents')
    return document_map
def main():
    """
    Export and convert all found documents

    Clears previous output from the ``docs`` directory (the ``js``, ``css``
    and ``api_reference`` entries are kept), then for every document
    returned by get_index() downloads the document and its drawings into
    temporary storage, prepares it with prepare_docx() and passes the
    result to convert() with an output path under ``docs``.
    """
    # The output directory may not exist yet (e.g. on a first run);
    # os.listdir() would raise FileNotFoundError without it.
    os.makedirs('docs', exist_ok=True)
    # Remove old docs before converting
    for filename in os.listdir('docs'):
        if filename in ('js', 'css', 'api_reference'):
            continue
        filepath = os.path.join('docs', filename)
        # shutil.rmtree() refuses symbolic links, so only real directories
        # go through it; files and links of any kind are unlinked.
        if os.path.isdir(filepath) and not os.path.islink(filepath):
            shutil.rmtree(filepath)
        else:
            os.remove(filepath)
    documents = get_index()
    for document_id, filename in documents.items():
        print(f'Processing https://docs.google.com/document/d/{document_id}/')
        with tempfile.NamedTemporaryFile() as document_file, \
                tempfile.TemporaryDirectory() as drawings_dir:
            download_document(document_id, file_name=document_file.name)
            download_drawings(document_id, drawings_dir)
            document = prepare_docx(document_file.name, drawings_dir)
            # Converted while the temporary document and drawings still exist
            output_path = os.path.join('docs', filename)
            convert(document, output_path)
    print('Done.')
# Run the export only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()