-
Notifications
You must be signed in to change notification settings - Fork 22
/
Copy pathfind_repositories.py
126 lines (103 loc) · 4.62 KB
/
find_repositories.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
from api import get
from db import get_db
import time
from functools import reduce
from datetime import datetime, date
def find_repositories():
    """Fetch GitHub search results for every pending (topic, date-range) job,
    persist the repositories, and log each completed job in ``job_log``.

    Returns:
        list[str]: every search-API URL that was requested.
    """
    conn = get_db()
    cur = conn.cursor()
    try:
        result = []
        for topic, start_year, start_month, end_year, end_month in get_jobs(cur):
            # GitHub's search API exposes at most 1000 results:
            # 10 pages x 100 items per page.
            for page_index in range(1, 11):
                url = gen_url(topic, start_year, start_month,
                              end_year, end_month, page_index)
                result.append(url)
                json = url_to_database(cur, url)
                skip_next_page = should_skip_next_page(
                    json, topic, start_year, start_month, end_year, end_month, page_index)
                if skip_next_page:
                    break
            # FIX: parameterized query — the original interpolated values
            # straight into the SQL text (injection-prone, breaks on quotes).
            cur.execute(
                "INSERT INTO job_log (topic, start_year, start_month, end_year, end_month, fetched_at) "
                "VALUES (%s, %s, %s, %s, %s, now()) "
                "ON DUPLICATE KEY UPDATE fetched_at=now()",
                (topic, start_year, start_month, end_year, end_month))
        return result
    finally:
        # Always release DB resources, whether or not an iteration raised.
        cur.close()
        conn.close()
def get_jobs(cur):
    """Return the (topic, start_year, start_month, end_year, end_month)
    tuples that still need fetching: every candidate window from
    ``iter_page`` minus those logged as fetched within the last 24 hours.
    """
    all_args = set(map(tuple, iter_page()))
    # FIX: the original used date_sub(curdate(), interval 24 hour).
    # curdate() is *today at midnight*, so the cutoff was yesterday's
    # midnight and jobs fetched up to ~48 hours ago counted as "recent".
    # Anchoring at now() gives a true rolling 24-hour window.
    cur.execute(
        "select topic, start_year, start_month, end_year, end_month "
        "from job_log where fetched_at > date_sub(now(), interval 24 hour)")
    recently_fetched_args = set(map(tuple, cur.fetchall()))
    return list(all_args - recently_fetched_args)
def test_get_jobs():
    """Smoke test: get_jobs runs against a live cursor without raising."""
    cursor = get_db().cursor()
    get_jobs(cursor)
def url_to_database(cur, url):
    """GET *url*, store the repositories it returns via ``update_database``,
    and hand the decoded JSON payload back to the caller."""
    payload = get(url).json()
    update_database(cur, payload)
    return payload
def test_url_to_database():
    """Smoke test: one real search-API request round-trips into the DB."""
    cursor = get_db().cursor()
    url = "https://api.github.com/search/repositories?q=topic:portfolio-website+created:2020-02-01..2020-02-29&page=1&per_page=100"
    url_to_database(cursor, url)
def iter_page():
    """Yield every (topic, start_year, start_month, end_year, end_month)
    search window: one coarse 2008..2016 window per topic, then quarterly
    windows from 2016 up to (and including) the current quarter."""
    quarters = ((1, 4), (4, 7), (7, 10), (10, 1))

    def windows(topic):
        # Everything before 2016 fits in a single catch-all window.
        yield topic, 2008, 1, 2016, 1
        for year in range(2016, 9999):
            for first_month, next_month in quarters:
                # Stop this topic once the window would begin in the future.
                if date(year, first_month, 1) > date.today():
                    return
                # The Oct..Jan window closes in the following year.
                closing_year = year + 1 if next_month <= first_month else year
                yield topic, year, first_month, closing_year, next_month

    for topic in ["portfolio-website", 'personal-website']:
        yield from windows(topic)
def test_iter_page():
    """Eyeball check: dump every generated search window to stdout."""
    for window in iter_page():
        print(window)
def gen_url(topic, start_year, start_month, end_year, end_month, page_index):
    """Build a GitHub search-API URL for repositories carrying *topic* that
    were created inside the given date window.

    A falsy start_year produces an open-ended "created before end date"
    query — %3C is the url-encoded '<'.
    """
    if start_year:
        start_range = f"{start_year}-{str(start_month).zfill(2)}-01.."
    else:
        start_range = "%3C"
    end_range = f"{end_year}-{str(end_month).zfill(2)}-01"
    return (f"https://api.github.com/search/repositories"
            f"?q=topic:{topic}+created:{start_range}{end_range}"
            f"&page={page_index}&per_page=100")
def update_database(cur, json):
    """Upsert every repository in a search-API response into ``portfolios``.

    Parameters:
        cur: DB cursor used for the INSERT ... ON DUPLICATE KEY UPDATE.
        json: decoded GitHub search response; only json['items'] is read.
    """
    # column name -> path of keys to walk inside each repository object
    column_paths = {
        'id': ['id'],
        'user': ['owner', 'login'],
        'repository': ['name'],
        'repository_updated_at': ['updated_at'],
        'stars': ['stargazers_count'],
        'forks': ['forks_count'],
        'url': ['homepage'],
    }
    # optional per-column conversion applied after extraction
    converters = {
        'id': int,
        'repository_updated_at': lambda t: datetime.strptime(t, "%Y-%m-%dT%H:%M:%SZ"),
        'stars': int,
        'forks': int,
        'url': lambda u: u if u else None,  # empty homepage string -> NULL
    }
    for repository in json['items']:
        record = {}
        for column, path in column_paths.items():
            value = repository
            for key in path:
                value = value[key]
            if column in converters:
                value = converters[column](value)
            record[column] = value
        cur.execute("INSERT INTO portfolios (id, user, repository, repository_updated_at, stars, forks, url, api_fetched_at) VALUES (%(id)s, %(user)s, %(repository)s, %(repository_updated_at)s, %(stars)s, %(forks)s, %(url)s, NOW()) ON DUPLICATE KEY UPDATE id=%(id)s, user=%(user)s, repository=%(repository)s, repository_updated_at=%(repository_updated_at)s, stars=%(stars)s, forks=%(forks)s, url=%(url)s, api_fetched_at=NOW()", record)
def should_skip_next_page(json, topic, start_year, start_month, end_year, end_month, page_index):
    """Decide whether paging can stop after this response.

    Stop when the current page is not full (fewer than 100 items) or when
    the pages fetched so far exactly account for total_count.
    """
    total_count = json["total_count"]
    items_count = len(json['items'])
    # Progress log: topic, date window, page index, and result counts.
    print(topic, [start_year, start_month, end_year, end_month],
          page_index, [total_count, items_count])
    return items_count < 100 or total_count == page_index * 100
# Script entry point: run the full fetch-and-store pipeline.
if __name__ == '__main__':
    find_repositories()