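"""Scrape function signatures from the Win32 API reference on learn.microsoft.com.

The scraper walks the API table of contents under windows/win32/api, collects
every TOC entry whose title ends in ' function', fetches each function's page,
and extracts the first C++ code block as the signature. Results are written to
a JSON file keyed by header name.

Typical invocation, assuming the third-party dependencies from the imports
below (requests, tqdm, beautifulsoup4 and, optionally, lxml) are installed:

    python win32_api_scraper.py
"""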
import json
import os
from typing import Dict, List, Optional

import requests
import tqdm.contrib.concurrent
from bs4 import BeautifulSoup


class Win32APIScraper:
    # API URLs are built with plain '/' joins; os.path.join is avoided for URLs
    # because it would insert backslashes when run on Windows.
    BASE_ROOT_URL = 'https://learn.microsoft.com/en-us'
    BASE_API_URL = BASE_ROOT_URL + '/windows/win32/api'

    def check_and_setup_output_path(self, path: Optional[str]) -> None:
        if path:
            self.output_path = path
        else:
            # Default to an output.json next to this script
            self.output_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "output.json")
        # Let the exception bubble up if we can't open the path to write to
        with open(self.output_path, 'w'):
            return

    def __init__(self, output_path: Optional[str] = None) -> None:
        self.headers_to_collect = 0
        self.session = requests.Session()
        self.check_and_setup_output_path(output_path)

    def get_function_signature(self, function_details: Dict[str, str]) -> None:
        url = f"{self.BASE_ROOT_URL}/{function_details['href'][1:]}"
        resp = self.session.get(url)
        if resp.status_code != 200:
            raise Exception(f'Failed to retrieve data from: {url}')
        soup = BeautifulSoup(resp.text, self.html_handler)
        try:
            code_tag = soup.find_all('code', class_="lang-cpp")[0]
            function_details['signature'] = ' '.join(code_tag.string.split())
            # Drop the trailing ' function' suffix from the TOC title
            function_details['name'] = function_details['toc_title'][:-9]
            del function_details['href']
            del function_details['toc_title']
        except IndexError:
            # Raised when the page has no C++ <code> block to pull a signature from
            print(f'Failed to extract signature from: {url}')
        return

    def scrape_headers_for_functions(self, index: int, name: str, path: str) -> List[Dict[str, str]]:
        resp = self.session.get(f'{self.BASE_API_URL}/{path}/toc.json')
        if resp.status_code != 200:
            raise Exception(f'Unexpected response code {resp.status_code} when attempting to GET {name}')
        toc_json = resp.json()
        functions = []
        for func_entry in toc_json['items'][0]['children']:
            if title := func_entry.get('toc_title'):
                if title.endswith(' function'):
                    functions.append(func_entry)
        desc = f'({index + 1}/{self.headers_to_collect}) {name}'
        unit = ' functions'
        if functions:
            # Fetch the signatures concurrently, with a progress bar per header
            tqdm.contrib.concurrent.thread_map(self.get_function_signature, functions, unit=unit, desc=desc)
        else:
            print(f'{desc}: 0{unit} to collect')
        return functions

    def get_headers_list(self) -> List[Dict[str, str]]:
        resp = self.session.get(f'{self.BASE_API_URL}/toc.json')
        if resp.status_code != 200:
            raise Exception("Invalid response when attempting to access toc.json")
        json_content = resp.json()
        return json_content['items'][0]['children'][1]['children']  # type: ignore

    def scrape(self) -> None:
        headers_list = self.get_headers_list()
        self.headers_to_collect = len(headers_list)
        content = {}
        # Prefer lxml for parsing speed if it is installed, otherwise fall back
        # to the standard library parser
        try:
            import lxml  # noqa: F401
            self.html_handler = 'lxml'
        except ImportError:
            self.html_handler = 'html.parser'
        for index, header in enumerate(headers_list):
            content[header['toc_title']] = self.scrape_headers_for_functions(index, header['toc_title'], header['href'])
        with open(self.output_path, 'w') as fp:
            json.dump(content, fp, sort_keys=True, indent=4)
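

# The scrape can also be driven programmatically; a minimal sketch (the output
# path below is an arbitrary example, not something the script requires):
#
#     scraper = Win32APIScraper(output_path='win32_api.json')
#     scraper.scrape()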


def main() -> None:
    scraper = Win32APIScraper()
    scraper.scrape()


if __name__ == '__main__':
    main()
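

# Sketch of the JSON layout written to the output file (header and function
# names here are illustrative placeholders, not real scraped data); entries
# whose signature was successfully extracted carry 'name' and 'signature' keys:
#
#     {
#         "exampleheader.h": [
#             {
#                 "name": "ExampleFunction",
#                 "signature": "BOOL ExampleFunction( HWND hWnd );"
#             }
#         ]
#     }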