Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Resolves issue #14 #17

Merged
merged 3 commits into from
May 25, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
204 changes: 91 additions & 113 deletions artscraper/find_artworks.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,17 @@
get_artist_metadata: Get metadata of the artist, from Wikidata

'''

# Allow __init__ function to have more than 5 arguments
#pylint: disable-msg=too-many-arguments

from pathlib import Path

import time
import re
import requests

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException

import wikipediaapi

Expand All @@ -25,7 +29,7 @@ class FindArtworks:
'''

def __init__(self, artist_link, executable_path='geckodriver',
output_dir='./data', min_wait_time=5):
output_dir='./data', sparql_query= None, min_wait_time=5):

# Link to artist's Google Arts & Culture webpage
self.artist_link = artist_link
Expand All @@ -40,52 +44,61 @@ def __init__(self, artist_link, executable_path='geckodriver',
self.min_wait_time = min_wait_time

# SPARQL query to fetch metadata from wikidata
self.sparql_query = '''
SELECT
?familyName ?familyNameLabel
?givenName ?givenNameLabel
?sexOrGender ?sexOrGenderLabel
?dateOfBirth ?dateOfBirthLabel
?placeOfBirth ?placeOfBirthLabel
?coordinatesBirth ?coordinatesBirthLabel
?latitudeBirth ?latitudeBirthLabel
?longitudeBirth ?longitudeBirthLabel
?dateOfDeath ?dateOfDeathLabel
?placeOfDeath ?placeOfDeathLabel
?coordinatesDeath ?coordinatesDeathLabel
?latitudeDeath ?latitudeDeathLabel
?longitudeDeath ?longitudeDeathLabel
?countryOfCitizenship ?countryOfCitizenshipLabel
?residence ?residenceLabel
?workLocation ?workLocationLabel
?genre ?genreLabel
?movement ?movementLabel
WHERE {
OPTIONAL { wd:person_id wdt:P734 ?familyName. }
OPTIONAL { wd:person_id wdt:P735 ?givenName. }
OPTIONAL { wd:person_id wdt:P21 ?sexOrGender. }
OPTIONAL { wd:person_id wdt:P569 ?dateOfBirth. }
OPTIONAL { wd:person_id wdt:P19 ?placeOfBirth. }
OPTIONAL {
?placeOfBirth wdt:P625 ?coordinatesBirth.
BIND(geof:latitude(?coordinatesBirth) AS ?latitudeBirth)
BIND(geof:longitude(?coordinatesBirth) AS ?longitudeBirth)
}
OPTIONAL { wd:person_id wdt:P570 ?dateOfDeath. }
OPTIONAL { wd:person_id wdt:P20 ?placeOfDeath. }
OPTIONAL {
?placeOfDeath wdt:P625 ?coordinatesDeath.
BIND(geof:latitude(?coordinatesDeath) AS ?latitudeDeath)
BIND(geof:longitude(?coordinatesDeath) AS ?longitudeDeath)
}
OPTIONAL { wd:person_id wdt:P27 ?countryOfCitizenship. }
OPTIONAL { wd:person_id wdt:P551 ?residence. }
OPTIONAL { wd:person_id wdt:P937 ?workLocation. }
OPTIONAL { wd:person_id wdt:P136 ?genre. }
OPTIONAL { wd:person_id wdt:P135 ?movement. }
SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
}
'''
if sparql_query is None:
# Default SPARQL query
self.sparql_query = '''
SELECT
?familyName ?familyNameLabel
?givenName ?givenNameLabel
?sexOrGender ?sexOrGenderLabel
?dateOfBirth ?dateOfBirthLabel
?placeOfBirth ?placeOfBirthLabel
?latitudeOfPlaceOfBirth ?latitudeOfPlaceOfBirthLabel
?longitudeOfPlaceOfBirth ?longitudeOfPlaceOfBirthLabel
?dateOfDeath ?dateOfDeathLabel
?placeOfDeath ?placeOfDeathLabel
?latitudeOfPlaceOfDeath ?latitudeOfPlaceOfDeathLabel
?longitudeOfPlaceOfDeath ?longitudeOfPlaceOfDeathLabel
?countryOfCitizenship ?countryOfCitizenshipLabel
?residence ?residenceLabel
?workLocation ?workLocationLabel
?genre ?genreLabel
?movement ?movementLabel
WHERE {
OPTIONAL { wd:person_id wdt:P734 ?familyName. }
OPTIONAL { wd:person_id wdt:P735 ?givenName. }
OPTIONAL { wd:person_id wdt:P21 ?sexOrGender. }
OPTIONAL {
wd:person_id wdt:P569 ?dateTimeOfBirth.
BIND (xsd:date(?dateTimeOfBirth) AS ?dateOfBirth)
}
OPTIONAL { wd:person_id wdt:P19 ?placeOfBirth. }
OPTIONAL {
?placeOfBirth wdt:P625 ?coordinatesBirth.
BIND(geof:latitude(?coordinatesBirth) AS ?latitudeOfPlaceOfBirth)
BIND(geof:longitude(?coordinatesBirth) AS ?longitudeOfPlaceOfBirth)
}
OPTIONAL {
wd:person_id wdt:P569 ?dateTimeOfDeath.
BIND (xsd:date(?dateTimeOfDeath) AS ?dateOfDeath)
}
OPTIONAL { wd:person_id wdt:P20 ?placeOfDeath. }
OPTIONAL {
?placeOfDeath wdt:P625 ?coordinatesDeath.
BIND(geof:latitude(?coordinatesDeath) AS ?latitudeOfPlaceOfDeath)
BIND(geof:longitude(?coordinatesDeath) AS ?longitudeOfPlaceOfDeath)
}
OPTIONAL { wd:person_id wdt:P27 ?countryOfCitizenship. }
OPTIONAL { wd:person_id wdt:P551 ?residence. }
OPTIONAL { wd:person_id wdt:P937 ?workLocation. }
OPTIONAL { wd:person_id wdt:P136 ?genre. }
OPTIONAL { wd:person_id wdt:P135 ?movement. }
SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
}
'''
else:
self.sparql_query = sparql_query

# Open web browser
self.driver = webdriver.Firefox(executable_path=self.executable_path)

Expand All @@ -98,6 +111,7 @@ def __exit__(self, _exc_type, _exc_val, _exc_tb):
# Close web browser
self.driver.close()


def get_artist_information(self):

'''
Expand All @@ -124,13 +138,16 @@ def save_artist_information(self):
artist_works, artist_description, artist_metadata = self.get_artist_information()
artist_name = self.get_wikipedia_article_title()

# Create directory for artist
pathname_directory = self.output_dir + '/' + artist_name
Path(pathname_directory).mkdir(parents=True, exist_ok=True)

# Filenames for artist's works, description, metadata
artist_works_file = pathname_directory + '/' + 'works.txt'
artist_description_file = pathname_directory + '/' + 'description.txt'
artist_metadata_file = pathname_directory + '/' + 'metadata.txt'

# Save artist's works, description, metadata
with open(artist_works_file, 'w', encoding='utf-8') as file:
for link in artist_works:
file.write(f'{link}\n')
Expand Down Expand Up @@ -159,33 +176,22 @@ def get_artist_works(self):
# Find the parent element corresponding to the text heading
parent_element = element.find_element('xpath', '../..')

try:
# Find element which contains number of artworks in the text
element_with_num_items = parent_element.find_element('xpath',
'//h3[contains(text(), "items")]')
# Extract text containing total number of artworks for this artist
num_items_text = element_with_num_items.text
# Total number of artworks for this artist
num_items = int(num_items_text.split(' ')[0])

except NoSuchElementException:
# Less than 1 page of images:
# No right-scrolling needed
num_items =1

#Initialize list of elements with links to artworks
elements = []
# Find right arrow button
right_arrow_element = parent_element.find_element('xpath', \
'.//*[contains(@data-gaaction,"rightArrow")]')

while len(elements) < num_items:
# Check if right arrow button can still be clicked
while right_arrow_element.get_attribute('tabindex') is not None:
# Find right arrow button
right_arrow_element = parent_element.find_element('xpath', \
'.//*[contains(@data-gaaction,"rightArrow")]')
# Click on right arrow button
self.driver.execute_script("arguments[0].click();", right_arrow_element)
# Wait for page to load
time.sleep(random_wait_time(min_wait=self.min_wait_time))
# List of all elements with links to artworks, at this stage
elements = right_arrow_element.find_elements('xpath', \

# List of all elements with links to artworks
elements = right_arrow_element.find_elements('xpath', \
'//*[contains(@href,"/asset/")]')

# Get the links from the XPath elements
Expand Down Expand Up @@ -233,37 +239,25 @@ def get_artist_metadata(self):
query = self.sparql_query.replace('person_id', artist_id)

# Send query request
request = requests.get(url, params= {'format': 'json', 'query': ''.join(query)}, timeout=10)
request = requests.get(url, params= {'format': 'json', 'query': ''.join(query)}, timeout=30)

# Convert response to dictionary
data = request.json()

# Extract properties searched by the SPARQL query
properties_query = re.findall(r"\?[^\s]*Label\b", self.sparql_query)

# Remove redundant prefix and suffix
properties = [property.removeprefix('?').removesuffix('Label') \
for property in properties_query]

# Assemble metadata in a dictionary
metadata = {
'family name': self._get_single_valued_property(data, 'familyName'),
'given name': self._get_single_valued_property(data, 'givenName'),
'sex or gender': self._get_single_valued_property(data, 'sexOrGender'),
'date of birth': self._get_single_valued_property(data, 'dateOfBirth').rsplit('T')[0],
'place of birth': self._get_single_valued_property(data, 'placeOfBirth'),
'latitude of place of birth' :
float(self._get_single_valued_property(data, 'latitudeBirth')),
'longitude of place of birth' :
float(self._get_single_valued_property(data, 'longitudeBirth')),
'date of death': self._get_single_valued_property(data, 'dateOfDeath').rsplit('T')[0],
'place of death': self._get_single_valued_property(data, 'placeOfDeath'),
'latitude of place of death' :
float(self._get_single_valued_property(data, 'latitudeDeath')),
'longitude of place of death' :
float(self._get_single_valued_property(data, 'longitudeDeath')),
'country of citizenship': self._get_multi_valued_property(data, 'countryOfCitizenship'),
'residence': self._get_multi_valued_property(data, 'residence'),
'work location': self._get_multi_valued_property(data, 'workLocation'),
'genre': self._get_multi_valued_property(data, 'genre'),
'movement': self._get_multi_valued_property(data, 'movement')
}
metadata = {re.sub(r'(\B[A-Z])', r' \1', property).lower(): \
self._get_property(data, property) for property in properties}

return metadata


def get_wikipedia_article_link(self):

'''
Expand Down Expand Up @@ -329,37 +323,17 @@ def get_artist_wikidata_id(self):

return wikidata_id

def _get_single_valued_property(self, data, query_property):

'''
Parameters
----------
data : Data, in JSON format, fetched from Wikidata query
query_property : Property to be extracted from data

Returns
-------
output_property : Value of extracted property
'''

if query_property+'Label' in data['results']['bindings'][0].keys():
output_property = data['results']['bindings'][0][query_property+'Label']['value']
else:
output_property = ''

return output_property

def _get_multi_valued_property(self, data, query_property):
def _get_property(self, data, query_property):

'''
Parameters
----------
data : Data, in JSON format, fetched from Wikidata query
query_property : Property to be extracted from data

Returns
-------
output_property_list : List of values of extracted property
output_property_list : Value(s) of extracted property
'''

output_property_list = []
Expand All @@ -369,5 +343,9 @@ def _get_multi_valued_property(self, data, query_property):
# Avoid duplicates
if output_property not in output_property_list:
output_property_list.append(output_property)
if len(output_property_list)==1:
output_property_list = output_property_list[0]
return output_property_list

return output_property_list
# Property doesn't exist
return ''
Loading