-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathaudible.py
101 lines (86 loc) · 3.79 KB
/
audible.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import time
import csv
from selenium import webdriver
from selenium.common import NoSuchElementException
from selenium.webdriver.common.by import By
# Shared browser session used by every scraping helper in this script.
chrome_options = webdriver.ChromeOptions()
# NOTE(review): "detach" is presumably set so the Chrome window stays open
# after the script finishes — confirm against the ChromeDriver options docs.
chrome_options.add_experimental_option("detach", True)
driver = webdriver.Chrome(options=chrome_options)

# Accumulates one dict per scraped book; filled by scrape_page() and later
# written out to CSV.
list_of_books = []
def total_pages(page):
    """Return the page count shown in the search results' pagination bar.

    Loads search-results page ``page`` in the shared ``driver`` (so the
    browser is left on that page afterwards), then reads the text of the
    second-to-last ``bc-list-item`` inside the ``center-4`` container and
    converts it to ``int``.
    """
    search_url = f"https://www.audible.com/search?keywords=book&node=18573211011&page={page}"
    driver.get(search_url)

    pagination_items = (
        driver.find_element(By.ID, "center-4")
        .find_element(By.TAG_NAME, "ul")
        .find_elements(By.CLASS_NAME, "bc-list-item")
    )
    # The second-to-last entry is used as the highest page number —
    # presumably the last entry is the "next" control; verify against the page.
    last_page_label = pagination_items[-2].text
    return int(last_page_label.strip())
def go_to_next_page(page):
    """Navigate the shared ``driver`` to search-results page number ``page``."""
    target_url = f"https://www.audible.com/search?keywords=book&node=18573211011&page={page}"
    driver.get(target_url)
def scrape_page():
    """Scrape every book on the page currently loaded in ``driver``.

    Finds the product list inside the ``center-3`` container and, for each
    ``.bc-list-item.productListItem`` entry, builds a dict with the keys
    title, images, subtitle, price, author, narrator, runtime, release_date,
    language and ratings. Any field that cannot be located is stored as an
    empty string. Each dict is appended to the module-level ``list_of_books``.

    Returns:
        None. If the product list container itself is missing, a message is
        printed and nothing is appended.
    """
    # Only these two lookups can signal "no book list on this page"; keeping
    # the try narrow avoids masking failures raised while processing items.
    try:
        div_center = driver.find_element(By.ID, "center-3")
        unordered_list = div_center.find_element(By.TAG_NAME, "ul")
    except NoSuchElementException:
        print("Could not find the book list on this page.")
        return

    lists = unordered_list.find_elements(By.CSS_SELECTOR, ".bc-list-item.productListItem")
    for book_no, list_ in enumerate(lists):
        # Field name -> locator, relative to this product item. All values are
        # CSS selectors except "price", which is an element ID that embeds the
        # item's position on the page.
        fields = {
            "title": "li h3",
            "images": "picture img",
            "subtitle": ".bc-list-item.subtitle",
            "price": f"buybox-regular-price-{book_no}",
            "author": ".bc-list-item.authorLabel",
            "narrator": ".bc-list-item.narratorLabel",
            "runtime": ".bc-list-item.runtimeLabel",
            "release_date": ".bc-list-item.releaseDateLabel",
            "language": ".bc-list-item.languageLabel",
            "ratings": ".bc-list-item.ratingsLabel"
        }
        # Start every field (including "price") as empty so the record always
        # carries the full set of keys, in the same order as the selectors.
        book_info = dict.fromkeys(fields, '')

        for field, selector in fields.items():
            try:
                if field == "images":
                    image = list_.find_element(By.CSS_SELECTOR, selector)
                    # get_attribute returns None when "src" is absent; keep
                    # the "missing means empty string" convention.
                    book_info[field] = image.get_attribute('src') or ''
                elif field == "price":
                    para = list_.find_element(By.ID, selector)
                    # The second <span> is read as the price text; with fewer
                    # than two spans this indexing raises IndexError.
                    book_info[field] = para.find_elements(By.TAG_NAME, "span")[1].text.strip()
                else:
                    book_info[field] = list_.find_element(By.CSS_SELECTOR, selector).text.strip()
            except (NoSuchElementException, IndexError):
                # Missing element or missing price span: leave the field empty
                # instead of aborting the whole scrape.
                book_info[field] = ''

        list_of_books.append(book_info)
# total_pages(1) also leaves the browser on page 1, so the first scrape_page()
# call below operates on that page without an extra navigation.
pages = total_pages(1)
for current_page in range(1, pages + 1):
    scrape_page()
    # After the final page there is nothing left to load.
    if current_page < pages:
        go_to_next_page(current_page + 1)
        # Pause between navigations before scraping the newly loaded page.
        time.sleep(3)

print(list_of_books)

# Export everything collected above to a CSV file, one row per book.
csv_file = "books_data.csv"
csv_columns = [
    "title", "images", "subtitle", "price", "author",
    "narrator", "runtime", "release_date", "language", "ratings",
]
try:
    with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=csv_columns)
        writer.writeheader()
        writer.writerows(list_of_books)
    print(f"Data successfully written to {csv_file}")
except OSError:  # IOError is an alias of OSError in Python 3
    print("I/O error occurred while writing to CSV file")