Skip to content

Commit

Permalink
refine handling of edge cases
Browse files Browse the repository at this point in the history
  • Loading branch information
modhurita committed Feb 1, 2024
1 parent 5374fc4 commit 141f294
Showing 1 changed file with 48 additions and 41 deletions.
89 changes: 48 additions & 41 deletions artscraper/find_artworks.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,10 @@ def get_artist_works(self):
# Find the parent element corresponding to the text heading
parent_element = element.find_element('xpath', '../..')

# Initialize total number of artworks
# (set to number of artworks by artist with the most artworks)
total_num_artworks = 200000

# Find total number of artworks
# Find elements with tag name 'h3'
items_elements = parent_element.find_elements('tag name', 'h3')
Expand All @@ -219,15 +223,16 @@ def get_artist_works(self):
total_num_artworks = int(match.group())
break

# Initialize number of artworks
num_artworks = 0
# Initialize count of number of iterations for which the number of artworks remains the same
count = 0
# Find right arrow element
def _find_right_arrow_element(parent_element):

right_arrow_element = parent_element.find_element('xpath', \
'.//*[contains(@data-gaaction,"rightArrow")]')

while True:
return right_arrow_element

# Save current number of artworks
old_num_artworks = num_artworks
# Get list of artwork links
def _get_list_links(parent_element):

# Find right arrow button
right_arrow_element = parent_element.find_element('xpath', \
Expand All @@ -240,46 +245,48 @@ def get_artist_works(self):
# Get the links from the XPath elements
list_links = [element.get_attribute('href') for element in elements]

# Calculate new number of artworks
num_artworks = len(list_links)
return list_links

# Click on right arrow
def _click_on_right_arrow(parent_element):

# Find right arrow button
right_arrow_element = parent_element.find_element('xpath', \
'.//*[contains(@data-gaaction,"rightArrow")]')
# Click on right arrow button
self.driver.execute_script("arguments[0].click();", right_arrow_element)

list_links = _get_list_links(parent_element)

# Initialize count of number of iterations for which the number of artworks remains the same
n_tries = 0

while (len(list_links) < total_num_artworks and
not (total_num_artworks == 0 and n_tries > 3)):

# Save current number of artworks
old_num_artworks = len(list_links)

# Find right arrow element
right_arrow_element = _find_right_arrow_element(parent_element)

# Check if right arrow button can still be clicked
if right_arrow_element.get_attribute('tabindex') is not None:
# Find right arrow button
right_arrow_element = parent_element.find_element('xpath', \
'.//*[contains(@data-gaaction,"rightArrow")]')
# Click on right arrow button
self.driver.execute_script("arguments[0].click();", right_arrow_element)

# List of all elements with links to artworks
elements = right_arrow_element.find_elements('xpath', \
'//*[contains(@href,"/asset/")]')

# Get the links from the XPath elements
list_links = [element.get_attribute('href') for element in elements]

# Calculate new number of artworks
num_artworks = len(list_links)

# Check if total number of artworks is reached
if total_num_artworks:
if num_artworks < total_num_artworks:
# Wait for page to load
time.sleep(random_wait_time(min_wait=self.min_wait_time))
continue
# Break out of the while loop if total_num_artworks is reached
break

if num_artworks > old_num_artworks:
# Count number of iterations for which the number of artworks remains the same
count = 0
else:
count = count+1
# Click on right arrow
_click_on_right_arrow(parent_element)

# Stop once the number of artworks has stayed the same for more than three consecutive iterations
if count > 3:
break
# Wait for page to load
time.sleep(random_wait_time(min_wait=self.min_wait_time))

# Obtain new list of artworks
list_links = _get_list_links(parent_element)

if len(list_links) == old_num_artworks:
# Count number of iterations for which the number of artworks remains the same
n_tries = n_tries + 1
else:
n_tries = 0

return list_links

Expand Down

0 comments on commit 141f294

Please sign in to comment.