-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathextractor.py
78 lines (52 loc) · 2.53 KB
/
extractor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
from urllib.parse import urlparse
from tqdm import tqdm
import requests
import gzip
import re
import os
import argparse
# HTTP headers passed to the requests.get calls in extract_logs; the value is a
# desktop Firefox User-Agent string.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0",
}

# Base URL that archive file names taken from the listing are appended to.
DOWNLOAD_PREFIX = "https://tenhou.net/sc/raw/dat/"
def extract_logs(is_old: bool, log_dir: str, date: str | None, *, timeout: float = 30.0):
    """Download the game logs referenced by the tenhou.net raw archive listing.

    The listing is fetched from ``list.cgi``. For every listed entry that
    contains ``html`` and whose file name contains an 8-digit date, the
    gzipped archive is downloaded and scanned line by line for ``<a href>``
    links. Each linked log is fetched and written to
    ``<log_dir>/<yyyymmdd>/<log_id>.xml``. Logs whose output file already
    exists are not downloaded again.

    Args:
        is_old: When true, the listing is requested with the ``?old`` query.
        log_dir: Root directory in which the per-date directories are created.
        date: Prefix compared against the ``yyyymmdd`` date found in each
            archive file name; archives that do not match are skipped.
            ``None`` disables the filter.
        timeout: Timeout in seconds handed to every ``requests.get`` call.

    Raises:
        requests.HTTPError: If the listing, an archive, or a log request
            answers with an HTTP error status.
    """
    old_query = "?old" if is_old else ""
    listing = requests.get(
        f"https://tenhou.net/sc/raw/list.cgi{old_query}", headers=headers, timeout=timeout
    )
    listing.raise_for_status()

    atag = re.compile(r"<a\s+href=[\"'](?P<href>.*?)[\"']")
    date_pattern = re.compile(r"\d{8}")

    # Remove the "list([" opener and ");" closer so only the entries remain.
    text = listing.text.replace("list([\r\n", "").replace(");", "")
    for archive_item in text.split(",\r\n"):
        if "html" not in archive_item:
            continue
        file_name = archive_item.split("',")[0].replace("{file:'", "")

        # Extract the date (yyyymmdd) from the file name.
        dtstr = date_pattern.search(file_name)
        if dtstr is None:
            continue
        day = dtstr[0]
        if date is not None and not day.startswith(date):
            print(f"skip {file_name}")
            continue

        print(f"Downloading {file_name}")
        # DOWNLOAD_PREFIX already ends with "/", so concatenation builds the
        # URL; os.path.join is a filesystem helper and is not meant for URLs.
        archive_url = f"{DOWNLOAD_PREFIX}{file_name}"
        page = requests.get(archive_url, headers=headers, timeout=timeout)
        page.raise_for_status()
        data = gzip.decompress(page.content).decode("utf-8")

        day_dir = os.path.join(log_dir, day)
        for line in tqdm(data.split("\r\n")):
            m = atag.search(line)
            if m is None:
                continue

            # The log id is the text after the first "=" of the link's query.
            query_parts = urlparse(m.group("href")).query.split("=")
            if len(query_parts) < 2:
                continue  # link without a "key=value" query: nothing to fetch
            log_id = query_parts[1]
            # The id comes from remote HTML and becomes a file name, so reject
            # empty ids and ids that contain a path separator.
            if not log_id or os.path.basename(log_id) != log_id:
                continue

            # File existence check: keep logs that were already downloaded.
            filepath = os.path.join(day_dir, f"{log_id}.xml")
            if os.path.exists(filepath):
                continue

            log_url = f"https://tenhou.net/0/log/?{log_id}"
            log_file = requests.get(log_url, headers=headers, timeout=timeout)
            # Fail like the other requests do instead of saving an error page.
            log_file.raise_for_status()

            os.makedirs(day_dir, exist_ok=True)
            # Explicit encoding so the output does not depend on the platform default.
            with open(filepath, "w", encoding="utf-8") as f:
                f.write(log_file.text)
parser = argparse.ArgumentParser(description="tenho mahjong log etl tool")
parser.add_argument("--old", help="log file download from old or latest archive", action="store_true")
parser.add_argument("--output-dir", "-O", help="transform log output directory", type=str, required=True)
parser.add_argument("--date", "-d", help="date prefix to download", type=str)
args = parser.parse_args()
extract_logs(args.old, args.output_dir, args.date)