fix(SoFIFA): update parsing of player profile page (#794)

Fixes #793, fixes #791
probberechts · Jan 21, 2025 · 65bb5a5
1 parent fb8d537
commit 65bb5a5
Showing 1 changed file with 13 additions and 6 deletions.
diff --git a/soccerdata/sofifa.py b/soccerdata/sofifa.py
@@ -10,7 +10,11 @@
 import pandas as pd
 from lxml import html
 
-from ._common import BaseRequestsReader, add_standardized_team_name, standardize_colnames
+from ._common import (
+    BaseRequestsReader,
+    add_standardized_team_name,
+    standardize_colnames,
+)
 from ._config import DATA_DIR, NOCACHE, NOSTORE, TEAMNAME_REPLACEMENTS, logger
 
 SO_FIFA_DATADIR = DATA_DIR / "SoFIFA"
@@ -108,7 +112,7 @@ def read_leagues(self) -> pd.DataFrame:
                 leagues.append(
                     {
                         "league_id": child["id"],
-                        "league": f'[{child["nationName"]}] {child["value"]}',
+                        "league": f"[{child['nationName']}] {child['value']}",
                     }
                 )
         return (
@@ -462,15 +466,18 @@ def read_player_ratings(
 
             # extract scores one-by-one
             tree = html.parse(reader, parser=html.HTMLParser(encoding="utf8"))
+            node_player_name = tree.xpath("//div[contains(@class, 'profile')]/h1")[0]
+            # Extract what is before <br>
+            before_br = node_player_name.xpath("string(./text()[1])").strip()
+            # Extract what is after <br>
+            after_br = node_player_name.xpath("string(./br/following-sibling::text()[1])").strip()
             scores = {
-                "player": tree.xpath("//div[contains(@class, 'profile')]/h1")[0].text.strip(),
+                "player": before_br if before_br else after_br,
                 **version.to_dict(),
             }
             for s in score_labels:
                 nodes = tree.xpath(
-                    "(//li[not(self::script)] | //div | //p)"
-                    f"[.//text()[contains(.,'{s}')]]"
-                    "/em"
+                    f"(//li[not(self::script)] | //div | //p)[.//text()[contains(.,'{s}')]]//em"
                 )
                 # for multiple matches, only accept first match
                 if len(nodes) >= 1: