Skip to content

Commit

Permalink
fix(SoFIFA): update parsing of player profile page (#794)
Browse files: browse the repository at this point in the history
Fixes #793
Fixes #791
  • Loading branch information
probberechts authored Jan 21, 2025
1 parent fb8d537 commit 65bb5a5
Showing 1 changed file with 13 additions and 6 deletions.
19 changes: 13 additions & 6 deletions soccerdata/sofifa.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -10,7 +10,11 @@
import pandas as pd
from lxml import html

from ._common import BaseRequestsReader, add_standardized_team_name, standardize_colnames
from ._common import (
BaseRequestsReader,
add_standardized_team_name,
standardize_colnames,
)
from ._config import DATA_DIR, NOCACHE, NOSTORE, TEAMNAME_REPLACEMENTS, logger

SO_FIFA_DATADIR = DATA_DIR / "SoFIFA"
Expand Down Expand Up @@ -108,7 +112,7 @@ def read_leagues(self) -> pd.DataFrame:
leagues.append(
{
"league_id": child["id"],
"league": f'[{child["nationName"]}] {child["value"]}',
"league": f"[{child['nationName']}] {child['value']}",
}
)
return (
Expand Down Expand Up @@ -462,15 +466,18 @@ def read_player_ratings(

# extract scores one-by-one
tree = html.parse(reader, parser=html.HTMLParser(encoding="utf8"))
node_player_name = tree.xpath("//div[contains(@class, 'profile')]/h1")[0]
# Extract what is before <br>
before_br = node_player_name.xpath("string(./text()[1])").strip()
# Extract what is after <br>
after_br = node_player_name.xpath("string(./br/following-sibling::text()[1])").strip()
scores = {
"player": tree.xpath("//div[contains(@class, 'profile')]/h1")[0].text.strip(),
"player": before_br if before_br else after_br,
**version.to_dict(),
}
for s in score_labels:
nodes = tree.xpath(
"(//li[not(self::script)] | //div | //p)"
f"[.//text()[contains(.,'{s}')]]"
"/em"
f"(//li[not(self::script)] | //div | //p)[.//text()[contains(.,'{s}')]]//em"
)
# for multiple matches, only accept first match
if len(nodes) >= 1:
Expand Down

0 comments on commit 65bb5a5

Please sign in to comment.