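# spscraper.py
#
# Scrapes subsplease.org: walks the show index, resolves each title to an
# AniList ID through a local alfetcher module (get_id / get_anime_info),
# collects 1080p torrent links per episode from the SubsPlease API, and keeps
# the results in JSON files next to this script (ani_subs.json, conv_dict.json,
# manual_adjustments.json).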
import copy
import json
import os
import re

import requests
from bs4 import BeautifulSoup

# Titles that could not be matched to an AniList ID during a run; printed at the end.
missing_ids = []

def get_all_anime():
    # URL of the SubsPlease shows index
    url = "https://subsplease.org/shows"
    base = "https://subsplease.org"
    # Send a GET request to the URL
    response = requests.get(url)
    items_dict = {}
    # Check if the request was successful (status code 200)
    if response.status_code == 200:
        # Parse the HTML content
        html_content = response.text
        soup = BeautifulSoup(html_content, 'html.parser')
        # Every show on the index is an element with class 'all-shows-link'
        all_shows_links = soup.find_all(class_='all-shows-link')
        for link in all_shows_links:
            anime_title = link.a['title']
            listing_url = base + link.a['href']
            if anime_title not in skip_list:
                if anime_title == 'Boku no Hero Academia':  # the fandom isn't the only thing that requires special attention ig
                    anime_info = subspleaseinfo_bh(anime_title)
                else:
                    anime_info = get_data(listing_url)
                if not anime_info:
                    continue
                # Set the proper dictionary entry; the page title may differ from the index title
                try:
                    dict_entry = anime_info[anime_title]
                except KeyError:
                    for entry in anime_info:
                        anime_title = entry
                        dict_entry = anime_info[anime_title]
                items_dict[anime_title] = dict_entry
    else:
        print("Failed to retrieve webpage. Status code:", response.status_code)
    return items_dict

def subs_to_ani(subs_entry, reverse=False):
    from alfetcher import get_id, get_anime_info

    def check_manual_adjust(anime_key, manually_adjusted_strings):
        if not manually_adjusted_strings:
            manually_adjusted_strings = {}
        if manually_adjusted_strings and anime_key in manually_adjusted_strings:
            checked_id = manually_adjusted_strings[anime_key]
        else:
            missing_ids.append(anime_key)
            return None
            # The interactive fallback below is currently unreachable because of the early return.
            checked_id = get_input(f'Type in AniList ID of the anime(SubsPlease match: {anime_key}): ', False, str)
            checked_id = checked_id if checked_id else None
            manually_adjusted_strings[anime_key] = checked_id if checked_id else None
            if manually_adjusted_strings:
                save_json(manual_adjustments_path, manually_adjusted_strings, True)
        return checked_id

    manually_adjusted_strings = read_json(manual_adjustments_path)
    new_list = {}
    for key in subs_entry:
        # Expand shorthand like "S2" into "Season 2" before searching AniList
        pattern = re.compile(r's(\d)', re.I)
        title = re.sub(pattern, r'Season \1', key)
        anime_id = get_id(title)
        if not anime_id:
            anime_id = check_manual_adjust(key, manually_adjusted_strings)
            if not anime_id:
                continue
        else:
            anime_id = str(anime_id)
            info = get_anime_info(anime_id)[anime_id]
            anime_state = find_key(info, 'status')
            if anime_state == 'NOT_YET_RELEASED':
                anime_id = check_manual_adjust(key, manually_adjusted_strings)
        # Add the value with the new key
        new_list[anime_id] = key
    if reverse:
        return {subs_value: anilist_key for anilist_key, subs_value in new_list.items()}  # Returns SubsPlease title: AniList ID
    else:
        return new_list  # Returns AniList ID: SubsPlease title

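# Note on the mapping above (values are purely illustrative, not real IDs):
# with reverse=False subs_to_ani() yields {AniList ID: SubsPlease title} pairs such as
# {"12345": "Some Show S2"}; with reverse=True the same data comes back flipped as
# {SubsPlease title: AniList ID}. Titles that cannot be matched end up in missing_ids.
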
def get_ani_id_from_subs_title(subs_entry, title, reverse=False):
    manually_adjusted_strings = read_json(manual_adjustments_path)
    if manually_adjusted_strings and title in manually_adjusted_strings:
        anime_dict = manually_adjusted_strings
    else:
        anime_dict = subs_to_ani(subs_entry, True)
    try:
        anime_id = anime_dict[title]
    except KeyError:
        return None
    if anime_id:
        save_json(conv_dict_path, {anime_id: title}, False)
    return anime_id

def create_season_keys(subs_entry):
    from alfetcher import get_anime_info
    current_cache = load_cache()
    subs_list_new = copy.deepcopy(subs_entry)
    test_int = None
    for key in subs_entry:
        try:
            # Keys that are already AniList IDs are numeric
            test_int = int(key)
            ani_key = key
        except ValueError:
            ani_key = get_ani_id_from_subs_title({key: subs_entry[key]}, key)
            if not ani_key:
                return None
        subs_list_new[ani_key] = subs_entry[key]
        if not test_int:
            # Re-keyed under the AniList ID, so drop the title-based key
            del subs_list_new[key]
        hasSeason = True
        checked_episodes = 0
        skipped_episodes = 0
        cleared_ids = []
        while hasSeason:
            anime_id = ani_key
            anime_info = get_anime_info(anime_id)[anime_id]
            anime_relations = find_key(anime_info, 'related')
            previous_episodes = 0
            hasSeason = False
            if not anime_relations:
                break
            for relation in anime_relations:
                if relation not in cleared_ids:
                    if anime_relations[relation]['status'] != 'NOT_YET_RELEASED' and anime_relations[relation]['type'] == 'SEQUEL':
                        anime_relation = relation
                        hasSeason = True
                        break
                    elif anime_relations[relation]['type'] == 'PREQUEL' and relation in current_cache:
                        previous_episodes = len(current_cache[relation]['nyaasi_links'])
                        break
            if anime_relations:
                try:
                    season_id = str(anime_relation)
                except NameError:
                    season_id = None
            if season_id is not None and season_id in subs_list_new and checked_episodes == 0:
                break
            sub_id = subs_entry[key]['id']
            url = f'https://subsplease.org/api/?f=show&tz=Europe/Prague&sid={sub_id}'
            headers = {'Content-Type': 'application/json'}
            response = requests.get(url, headers=headers)
            if response.status_code == 200:
                json_response = response.json()['episode']
                json_data = {}
                if not json_response:
                    break
                # Iterate over the reversed items so episodes run oldest to newest
                for reverse_key, value in reversed(json_response.items()):
                    json_data[reverse_key] = value
                first_key = next(iter(json_data.keys()))
                episode_string = json_data[first_key]['episode']
                try:
                    starting_episode = int(episode_string)
                except (ValueError, TypeError):
                    if (re.search(r"movie", episode_string, re.I) or
                            re.search(r"v\d", episode_string, re.I) or  # i really dont know what to do about this, just save me
                            re.search(r"movie", json_data[first_key]['show'], re.I)):
                        starting_episode = 1
                    else:
                        break
                links = subs_entry[key]['nyaasi_links']
                link_amount = len(subs_entry[key]['nyaasi_links'])
                episode_amount = find_key(anime_info, 'total_eps')
                if not episode_amount:
                    # Unknown total episode count: keep everything after the prequel's episodes
                    sorted_magnets = []
                    if link_amount - previous_episodes < 1:
                        previous_episodes = 0
                    for x in range(previous_episodes, link_amount):
                        try:
                            sorted_magnets.append(links[x])
                            x += 1
                        except IndexError:
                            break
                    subs_list_new[ani_key]['nyaasi_links'] = sorted_magnets
                    break
                if link_amount > episode_amount:
                    checked_episodes += previous_episodes
                if starting_episode > episode_amount + skipped_episodes:
                    # The listing starts past this season; move on to the sequel entry
                    skipped_episodes += episode_amount
                    if ani_key not in cleared_ids:
                        try:
                            del subs_list_new[ani_key]
                            cleared_ids.append(ani_key)
                        except KeyError:
                            cleared_ids.append(ani_key)
                    ani_key = season_id
                    save_json(conv_dict_path, {ani_key: key}, False)
                    continue
                sorted_magnets = []
                for x in range(0, episode_amount):
                    try:
                        sorted_magnets.append(links[x + checked_episodes])
                        x += 1  # x is reused after the loop as the number of consumed links
                    except IndexError:
                        break
                if starting_episode == 0:
                    x = x + 1
                checked_episodes += x
                leftover_episodes = link_amount - checked_episodes
                if leftover_episodes > 0 and episode_amount > 1:
                    subs_list_new[season_id] = copy.deepcopy(subs_entry[key])
                    leftover_magnets = []
                    for y in range(checked_episodes, link_amount):
                        leftover_magnets.append(links[y])
                elif leftover_episodes > 0:
                    subs_list_new[season_id] = copy.deepcopy(subs_entry[key])
                    leftover_magnets = []
                    leftover_magnets.append(links[checked_episodes])
                try:
                    subs_list_new[ani_key]['nyaasi_links'] = sorted_magnets
                except KeyError:
                    subs_list_new[ani_key] = copy.deepcopy(subs_entry[key])
                    subs_list_new[ani_key]['nyaasi_links'] = sorted_magnets
                cleared_ids.append(ani_key)
                if season_id is not None:
                    try:
                        subs_list_new[season_id]['nyaasi_links'] = leftover_magnets
                    except (KeyError, NameError):
                        hasSeason = False
                else:
                    break
                ani_key = season_id
                save_json(conv_dict_path, {ani_key: key}, False)
    return subs_list_new

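# In outline: a single SubsPlease listing can cover several AniList seasons, so
# create_season_keys() walks the AniList 'related' entries (SEQUEL / PREQUEL),
# uses 'total_eps' to decide how many links belong to the current season, splits
# the leftover links into a new entry keyed by the sequel's AniList ID, and
# records each ID -> title pair in conv_dict.json along the way.
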
def update_list(subs_list):
    # URL of the SubsPlease shows index
    url = "https://subsplease.org/shows"
    base = "https://subsplease.org"
    # Send a GET request to the URL
    response = requests.get(url)
    list_urls = [entry['url'] for entry in subs_list.values()]
    # Check if the request was successful (status code 200)
    if response.status_code == 200:
        # Parse the HTML content
        html_content = response.text
        soup = BeautifulSoup(html_content, 'html.parser')
        # Every show on the index is an element with class 'all-shows-link'
        all_shows_links = soup.find_all(class_='all-shows-link')
        for link in all_shows_links:
            anime_title = link.a['title']
            item_url = base + link.a['href']
            # Only scrape shows that are not skipped and not already in the list
            if anime_title not in skip_list and item_url not in list_urls:
                data = get_data(item_url)
                if data:
                    anilist_data = create_season_keys(data)
                    if anilist_data:
                        subs_list.update(anilist_data)
    else:
        print("Failed to retrieve webpage. Status code:", response.status_code)
    return subs_list

def gen_cache():
    cleaned_list = load_cache()
    final_cache = create_season_keys(cleaned_list)
    save_cache(final_cache)

def subspleaseinfo_bh(search_string):
    # Special-cased lookup for Boku no Hero Academia, whose listing mixes in extra specials
    url = "https://subsplease.org/shows"
    url_request = f'https://subsplease.org/api/?f=search&tz=Europe/Prague&s={search_string}'
    headers = {'Content-Type': 'application/json'}
    response = requests.get(url_request, headers=headers)
    if response.status_code == 200:
        json_data = response.json()
        items_dict = {}
        items_dict[search_string] = {}
        items_dict[search_string]['url'] = url + '/' + json_data[next(iter(json_data.keys()))]['page']
        items_dict[search_string]['id'] = get_subsplease_id(items_dict[search_string]['url'])
        # Fetch once; the second element maps the specials that were split out
        torrent_link, skipped_eps = get_torrent_link_bh(items_dict[search_string]['id'])
        items_dict[search_string]['nyaasi_links'] = torrent_link
        for skip in skipped_eps:
            items_dict[skip] = {}
            items_dict[skip]['url'] = url + '/' + json_data[next(iter(json_data.keys()))]['page']
            items_dict[skip]['id'] = get_subsplease_id(items_dict[search_string]['url'])
            torrent_link = skipped_eps[skip]
            items_dict[skip]['nyaasi_links'] = torrent_link
        return items_dict
        #save_cache(items_dict)

def get_data(url):
    # Send a GET request to the show's page
    response = requests.get(url)
    # Check if the request was successful (status code 200)
    if response.status_code == 200:
        # Parse the HTML content
        html_content = response.text
        soup = BeautifulSoup(html_content, 'html.parser')
        # The release table carries the SubsPlease show id in its 'sid' attribute
        table_element = soup.find('table', id='show-release-table')
        sid_value = table_element.get('sid')
        title = soup.find('h1', class_='entry-title').text
        # Normalise en dashes and curly apostrophes so titles match the index page
        title = re.sub('–', '-', title)
        title = re.sub('’', "'", title)
        item = {}
        item[title] = {}
        item[title]['url'] = url
        item[title]['id'] = sid_value
        torrent_link = get_torrent_link(item[title]['id'])
        item[title]['nyaasi_links'] = torrent_link
        if not torrent_link:
            return None
        else:
            return item
    else:
        print("Failed to retrieve webpage. Status code:", response.status_code)

def get_torrent_link(sub_id):
    url = f'https://subsplease.org/api/?f=show&tz=Europe/Prague&sid={sub_id}'
    headers = {'Content-Type': 'application/json'}
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        json_data = response.json()
        torrent_links = []
        if json_data['episode']:
            for episode in json_data['episode']:
                # Skip x.5 episode entries
                if '.5' not in episode:
                    for download in json_data['episode'][episode]['downloads']:
                        if download['res'] == '1080':
                            raw_url = download['torrent']
                            if raw_url.endswith("/torrent"):
                                raw_url = raw_url[:-len("/torrent")]
                            # insert(0, ...) reverses the API's iteration order
                            torrent_links.insert(0, raw_url)
        elif json_data['batch']:
            for batch in json_data['batch']:
                for download in json_data['batch'][batch]['downloads']:
                    if download['res'] == '1080':
                        raw_url = download['torrent']
                        if raw_url.endswith("/torrent"):
                            raw_url = raw_url[:-len("/torrent")]
                        torrent_links.insert(0, raw_url)
        else:
            torrent_links = None
        return torrent_links

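# The parsers above and below assume a SubsPlease API payload shaped roughly like
# this (field names are the ones the code reads; the example values are made up):
# {
#   "episode": {
#     "Some Show - 01": {
#       "show": "Some Show",
#       "episode": "01",
#       "downloads": [{"res": "1080", "torrent": "https://.../torrent"}]
#     }
#   },
#   "batch": {"Some Show (01-12) (1080p)": {"downloads": [...]}}
# }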
def get_torrent_link_bh(sub_id):
    url = f'https://subsplease.org/api/?f=show&tz=Europe/Prague&sid={sub_id}'
    headers = {'Content-Type': 'application/json'}
    response = requests.get(url, headers=headers)
    skipped_eps = {}
    ona = []
    torrent_links = []
    if response.status_code == 200:
        json_data = response.json()
        try:
            for episode in json_data['episode']:
                for download in json_data['episode'][episode]['downloads']:
                    if download['res'] == '1080':
                        raw_url = download['torrent']
                        if raw_url.endswith("/torrent"):
                            raw_url = raw_url[:-len("/torrent")]
                        if not re.search(r"\d", episode, re.I):
                            # Episode names without a number are specials that get their own entries
                            if episode == 'Boku no Hero Academia - UA Heroes Battle':
                                skipped_eps[episode] = raw_url
                                break
                            else:
                                ona.insert(0, raw_url)
                                if episode == 'Boku no Hero Academia - Hero League Baseball':
                                    skipped_eps[episode] = ona
                                break
                        else:
                            torrent_links.insert(0, raw_url)
                            break
        except KeyError:
            for batch in json_data['batch']:
                for download in json_data['batch'][batch]['downloads']:
                    if download['res'] == '1080':
                        raw_url = download['torrent']
                        if raw_url.endswith("/torrent"):
                            raw_url = raw_url[:-len("/torrent")]
                        torrent_links.insert(0, raw_url)
    return torrent_links, skipped_eps

def get_subsplease_id(url):
    # Send a GET request to the show's page
    response = requests.get(url)
    # Check if the request was successful (status code 200)
    if response.status_code == 200:
        # Parse the HTML content
        html_content = response.text
        soup = BeautifulSoup(html_content, 'html.parser')
        # The release table carries the SubsPlease show id in its 'sid' attribute
        table_element = soup.find('table', id='show-release-table')
        sid_value = table_element.get('sid')
        return sid_value
    else:
        print("Failed to retrieve webpage. Status code:", response.status_code)

def update_entries(subs_entry):
    from alfetcher import get_anime_info

    def filter_404_links(links):
        # If any cached link has gone dead, signal the caller to rebuild the list
        for link in links:
            response = requests.head(link)
            if response.status_code == 404:
                return 'reset'

    copied_entries = copy.deepcopy(subs_entry)
    checked_keys = []
    missing_keys = []
    for key in subs_entry:
        anime_status = find_key(get_anime_info(key), 'status')
        try:
            remote_key = read_json(conv_dict_path)[key]
        except (TypeError, KeyError):
            missing_keys.append(key)
            continue
        if anime_status == 'RELEASING' and remote_key not in checked_keys:
            entry_url = subs_entry[key]['url']
            remote_entry = get_data(entry_url)
            modified_entry = create_season_keys({key: remote_entry[remote_key]})
            try:
                last_url = modified_entry[key]['nyaasi_links'][-1]
            except (TypeError, KeyError, IndexError):
                print("DEBUG: Error occurred while accessing 'nyaasi_links'")
                print(modified_entry)
                continue
            current_urls = copied_entries[key]['nyaasi_links']
            try:
                if last_url not in current_urls:
                    current_urls.append(last_url)
            except TypeError:
                current_urls = [last_url]
            filtered_urls = filter_404_links(current_urls)
            if filtered_urls == 'reset':
                filtered_urls = modified_entry[key]['nyaasi_links']
            else:
                filtered_urls = current_urls
            copied_entries[key]['nyaasi_links'] = filtered_urls
    return copied_entries

# Utils
def read_json(file_path):
    if os.path.exists(file_path):
        with open(file_path, "r", encoding="utf-8") as json_file:
            data = json.load(json_file)
        if data == {}:
            return None
        else:
            return data
    else:
        return None

def save_json(file_path, data, overwrite=True):
    def update_json():
        # Merge `data` into whatever the file already holds
        json_copy = read_json(file_path)
        if json_copy is None:
            json_copy = {}
        json_copy.update(data)
        with open(file_path, "w", encoding="utf-8") as file:
            json.dump(json_copy, file, indent=4, ensure_ascii=False)

    json_file = read_json(file_path)
    if json_file is not None:
        if overwrite:
            with open(file_path, "w", encoding="utf-8") as file:
                json.dump(data, file, indent=4, ensure_ascii=False)
        else:
            update_json()
    else:
        update_json()

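# Illustration of save_json() semantics with a hypothetical path:
#   save_json('demo.json', {'a': 1})                     # file now holds {"a": 1}
#   save_json('demo.json', {'b': 2}, overwrite=False)    # file now holds {"a": 1, "b": 2}
#   save_json('demo.json', {'c': 3})                     # file now holds only {"c": 3}
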
def yank_anime_entry(name):
    ani_list = load_cache()
    return {name: ani_list[name]}

def save_cache(data):
    save_json(cache_path, data, True)

def load_cache():
    cache = read_json(cache_path)
    return cache

def find_key(data, key_type):
    # Depth-first search for a (lowercase) key in nested dicts/lists
    key_type = key_type.lower()
    if isinstance(data, dict):
        if key_type in data:
            return data[key_type]
        else:
            for value in data.values():
                result = find_key(value, key_type)
                if result is not None:
                    return result
    elif isinstance(data, list):
        for item in data:
            result = find_key(item, key_type)
            if result is not None:
                return result
    return None

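# For example, find_key({'data': {'Media': {'status': 'RELEASING'}}}, 'STATUS')
# returns 'RELEASING' (the requested key is lowercased, the data keys are not),
# and find_key([], 'status') returns None.
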
def get_input(prompt, lower=True, data_type=str):
    while True:
        if lower:
            user_input = input(prompt).lower()
        else:
            user_input = input(prompt)
        try:
            converted_input = data_type(user_input)
            return converted_input
        except ValueError:
            print("Invalid input. Please enter a valid", data_type.__name__)

def check_cache():
    from alfetcher import get_anime_info
    attention = []
    cache = load_cache()
    for entry in cache:
        anime_data = get_anime_info(entry)
        anime_amount = find_key(anime_data, 'total_eps')
        anime_status = find_key(anime_data, 'status')
        links_amount = len(cache[entry]['nyaasi_links'])
        # Flag entries whose cached link count doesn't match the episode total, unless the show is still airing
        if links_amount != anime_amount and anime_status != 'RELEASING':
            attention.append([entry, 'https://anilist.co/anime/' + entry, str(links_amount) + '/' + str(anime_amount)])
    for anime_id in attention:
        print(anime_id)

def generate_conv_keys():
    cache = load_cache()

    def get_title(url):
        # Send a GET request to the show's page
        response = requests.get(url)
        # Check if the request was successful (status code 200)
        if response.status_code == 200:
            # Parse the HTML content
            html_content = response.text
            soup = BeautifulSoup(html_content, 'html.parser')
            title = soup.find('h1', class_='entry-title').text
            title = re.sub('–', '-', title)
            title = re.sub('’', "'", title)
            return title
        else:
            print("Failed to retrieve webpage. Status code:", response.status_code)

    # Rebuild conv_dict.json by mapping every cached AniList ID to its SubsPlease title
    for key in cache:
        save_json(conv_dict_path, {key: get_title(cache[key]['url'])}, False)

# JSON files stored next to this script
conv_dict_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'conv_dict.json')
cache_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'ani_subs.json')
manual_adjustments_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'manual_adjustments.json')

# Shows excluded from scraping
skip_list = ["Lee's Detective Agency",
             'Fruits Basket (2019)',
             'Fruits Basket (2019) S2',
             'Rail Romanesque S2',
             'Youjo Senki',
             'Mahouka Koukou no Rettousei',
             'Tsugumomo S2 OVA',
             'Edens Zero',
             'Boruto - Naruto Next Generations',
             'One Piece']

if __name__ == "__main__":
    cache = load_cache()
    updated_list = update_list(cache)
    updated_entries = update_entries(updated_list)
    save_cache(updated_entries)
    for missing_entry in missing_ids:
        print(missing_entry)
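
# Note: gen_cache(), check_cache(), yank_anime_entry() and generate_conv_keys() are not
# called from the entry point above; they look like maintenance helpers intended to be
# run by hand when the cache needs rebuilding or auditing.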