Skip to content

Commit

Permalink
fix wikipedia-related issues (#27)
Browse files Browse the repository at this point in the history
  • Loading branch information
modhurita authored Sep 11, 2023
1 parent b6a9f47 commit 05ccad3
Show file tree
Hide file tree
Showing 2 changed files with 422 additions and 35 deletions.
119 changes: 84 additions & 35 deletions artscraper/find_artworks.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,11 @@
'''

# Allow __init__ function to have more than 5 arguments
#pylint: disable-msg=too-many-arguments

from pathlib import Path

import time
import re
from urllib.parse import urlparse
from urllib.parse import unquote
import json
import requests
Expand All @@ -30,7 +28,10 @@ class FindArtworks:
'''
Class for finding artworks and metadata for an artist,
given the link to their Google Arts & Culture webpage
'''
unquote '''

# Allow __init__ function to have more than 5 arguments
# pylint: disable-msg=too-many-arguments

def __init__(self, artist_link,
output_dir='./data', sparql_query= None, min_wait_time=5):
Expand Down Expand Up @@ -77,7 +78,7 @@ def __init__(self, artist_link,
wd:person_id wdt:P569 ?dateTimeOfBirth.
BIND (xsd:date(?dateTimeOfBirth) AS ?dateOfBirth)
}
OPTIONAL {
OPTIONAL {
wd:person_id wdt:P19 ?placeOfBirth.
?placeOfBirth wdt:P625 ?coordinatesBirth.
BIND(geof:latitude(?coordinatesBirth) AS ?latitudeOfPlaceOfBirth)
Expand All @@ -87,7 +88,7 @@ def __init__(self, artist_link,
wd:person_id wdt:P570 ?dateTimeOfDeath.
BIND (xsd:date(?dateTimeOfDeath) AS ?dateOfDeath)
}
OPTIONAL {
OPTIONAL {
wd:person_id wdt:P20 ?placeOfDeath.
?placeOfDeath wdt:P625 ?coordinatesDeath.
BIND(geof:latitude(?coordinatesDeath) AS ?latitudeOfPlaceOfDeath)
Expand Down Expand Up @@ -142,26 +143,53 @@ def save_artist_information(self):
'''

artist_works, artist_description, artist_metadata = self.get_artist_information()
artist_name = self.get_wikipedia_article_title()

artist_name = self.get_artist_name()
# Create directory for artist
pathname_directory = self.output_dir + '/' + artist_name
Path(pathname_directory).mkdir(parents=True, exist_ok=True)

# Filenames for artist's works, description, metadata
artist_works_file = pathname_directory + '/' + 'works.txt'
artist_description_file = pathname_directory + '/' + 'description.txt'
artist_metadata_file = pathname_directory + '/' + 'metadata.json'

# Save artist's works, description, metadata
with open(artist_works_file, 'w', encoding='utf-8') as file:
for link in artist_works:
file.write(f'{link}\n')
with open(artist_description_file, 'w', encoding='utf-8') as file:
file.write(artist_description)
if artist_description is not None:
file.write(artist_description)
with open(artist_metadata_file, 'w', encoding='utf-8') as file:
json.dump(artist_metadata, file, ensure_ascii=False)
if artist_metadata is not None:
json.dump(artist_metadata, file, ensure_ascii=False)


def get_artist_name(self):

'''
Return artist's name, with parts thereof being separated by underscores
'''

# Artist link
artist_link = self.artist_link

# Split the link by forward slashes
parts = artist_link.split('/')
# Extract the artist's name (separated by dashes)
artist_name_string = parts[4]
artist_name_string = unquote(artist_name_string)

# Split the artist's name into component parts
artist_name_parts = artist_name_string.split('-')
# Capitalize each component
artist_name_capitalized_parts = []
for part in artist_name_parts:
part = part.capitalize()
artist_name_capitalized_parts.append(part)

# Artist's name, separated by underscores
artist_name = ('_').join(artist_name_capitalized_parts)

return artist_name

def get_artist_works(self):

Expand Down Expand Up @@ -216,8 +244,20 @@ def get_artist_description(self):

# Get title of artist's Wikipedia article
title = self.get_wikipedia_article_title()
# Choose the English Wikipedia
wiki = wikipediaapi.Wikipedia('en')

# Return None if no Wikipedia article exists
if title is None:
return None

# Get link to Wikipedia article
wikipedia_article_link = self.get_wikipedia_article_link()
# Parse the URL
parsed_url = urlparse(wikipedia_article_link)
# Find the language of the Wikipedia article
language_code = parsed_url.netloc.split('.')[0]

# Choose the Wikipedia corresponding to the language code
wiki = wikipediaapi.Wikipedia(language_code)
# Get the Wikipedia page
page = wiki.page(title)
# Get summary of the page (lead section of the Wikipedia article)
Expand All @@ -238,6 +278,8 @@ def get_artist_metadata(self):

# Get Wikidata ID of artist
artist_id = self.get_artist_wikidata_id()
if artist_id is None:
return None

# Wikidata database to query
url = 'https://query.wikidata.org/sparql'
Expand Down Expand Up @@ -277,9 +319,14 @@ def get_wikipedia_article_link(self):

# Get Google Arts & Culture webpage for the artist
self.driver.get(self.artist_link)
# Allow bare-except
# pylint: disable=W0702
try:
# Locate the element containing the link to the artist's Wikipedia article
element = self.driver.find_element('xpath','//*[contains(@href,"wikipedia")]')
except:
return None

# Locate the element containing the link to the artist's Wikipedia article
element = self.driver.find_element('xpath','//*[contains(@href,"wikipedia")]')
# Extract the link to the Wikipedia article
wikipedia_link = element.get_attribute('href')

Expand All @@ -297,12 +344,13 @@ def get_wikipedia_article_title(self):
# Get the link to the artist's Wikipedia article
wikipedia_link = self.get_wikipedia_article_link()

# Get title of artist's Wikipedia article
title = wikipedia_link.rsplit('/')[-1]

title = unquote(title)
if wikipedia_link is not None:
# Get title of artist's Wikipedia article
title = wikipedia_link.rsplit('/')[-1]
title = unquote(title)
return title

return title
return None

def get_artist_wikidata_id(self):

Expand All @@ -312,24 +360,25 @@ def get_artist_wikidata_id(self):
wikidata_id: Wikidata ID of the artist
'''

# Get Google Arts & Culture webpage for the artist
self.driver.get(self.artist_link)

# Locate the element containing the link to the artist's Wikipedia article
element = self.driver.find_element('xpath','//*[contains(@href,"wikipedia")]')
# Extract the link to the Wikipedia article
wikipedia_link = element.get_attribute('href')
# Get the link to the Wikipedia article
wikipedia_link = self.get_wikipedia_article_link()

# Get Wikipedia page for the artist
self.driver.get(wikipedia_link)
# Find element containing text about Wikidata
element = self.driver.find_element('xpath','//*[contains(text(),"Wikidata item")]')
# Find parent element of this element
parent_element = element.find_element('xpath', '..')
if wikipedia_link is not None:
self.driver.get(wikipedia_link)
else:
return None

# Find element containing the link to the Wikidata page
element = self.driver.find_element('xpath','//*[contains(@href,"www.wikidata.org")]')
# Extract the link to the Wikidata page
wikidata_link = parent_element.get_attribute('href')
wikidata_link = element.get_attribute('href')
# Specify pattern of Wikidata ID
pattern = r'Q\d+'
# Search for pattern in the Wikidata link
match = re.search(pattern, wikidata_link)
# Find the Wikidata ID of the artist
wikidata_id = wikidata_link.rsplit('/')[-1]
wikidata_id = match.group(0)

return wikidata_id

Expand Down
Loading

0 comments on commit 05ccad3

Please sign in to comment.