forked from RZachLamberty/mtg_data
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathedhrec.py
executable file
·204 lines (157 loc) · 5.88 KB
/
edhrec.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Module: edhrec.py
Author: zlamberty
Created: 2018-04-27
Description:
for scraping out edh recommendation information from the edhrec.com website
Usage:
> import edhrec
> edhrec.commanders_and_cards().to_csv('edhrec.csv', index=False)
"""
import argparse
import json
import logging
import logging.config
import os
import yaml
import lxml.html
import pandas as pd
import requests
import mtgconstants
# ----------------------------- #
# Module Constants #
# ----------------------------- #
HERE = os.path.dirname(os.path.realpath(__file__))
LOGGER = logging.getLogger(__name__)

# logging is configured at import time from the yaml file that sits next to
# this module
LOGCONF = os.path.join(HERE, 'logging.yaml')
with open(LOGCONF, 'rb') as f:
    # safe_load instead of a bare yaml.load(f): the latter can construct
    # arbitrary python objects and is rejected outright (missing Loader
    # argument) by PyYAML >= 6
    logging.config.dictConfig(yaml.safe_load(f))
# only let WARNING and above through from urllib3
logging.getLogger('urllib3').setLevel(logging.WARNING)

EDH_REC_URL = 'https://edhrec.com/commanders'
# local csv cache written / read by commanders_and_cards
F_EDHREC_CACHE = os.path.join(HERE, 'edhrec.csv')
# ----------------------------- #
# Main routine #
# ----------------------------- #
def commanders(baseurl=EDH_REC_URL):
    """Collect the commander entries listed on every color-combination page.

    One page is fetched per element of
    ``mtgconstants.ALL_COLOR_COMBOS_W_COLORLESS`` (the items of each element
    are joined into a single url suffix) and the parsed card lists are stacked
    into one frame.

    args:
        baseurl: url prefix that the color-combination suffix is appended to

    returns:
        pandas.DataFrame holding only the rows whose ``cardlist_tag`` contains
        ``'commander'``, plus an integer ``num_decks`` column parsed out of
        ``label``. The row index is the one from before the commander filter,
        so it is generally not contiguous.
    """
    LOGGER.debug('getting commander summary info')
    pages = [
        _parse_edhrec_cardlist(
            url='{}/{}'.format(baseurl, ''.join(color_combo)),
            # for partner commanders:
            include_multicards=True
        )
        for color_combo in mtgconstants.ALL_COLOR_COMBOS_W_COLORLESS
    ]
    # ignore_index already produces a fresh 0..n-1 index, no reset needed
    df = pd.concat(objs=pages, ignore_index=True)

    # this will pull in commanders as well as staples. subset to commanders
    # only. the copy makes the column assignment below write to an independent
    # frame instead of a slice of the concatenated one
    df = df[df.cardlist_tag.str.contains('commander')].copy()

    # raw string so that \d is a regex escape and not a (invalid) string
    # escape. NOTE: astype(int) will fail on labels without "<n> deck(s)"
    df['num_decks'] = df.label \
        .str.extract(r'(\d+) decks?', expand=False) \
        .astype(int)

    return df
def commander_summary(commander, baseurl=EDH_REC_URL):
    """Parse the card lists found on a single commander's page.

    args:
        commander: url name of the commander, appended to `baseurl`
        baseurl: url prefix of the commander pages

    returns:
        the result of `_parse_edhrec_cardlist` for `<baseurl>/<commander>`
    """
    LOGGER.debug('getting info for commander {}'.format(commander))
    return _parse_edhrec_cardlist('{}/{}'.format(baseurl, commander))
def commanders_and_cards(baseurl=EDH_REC_URL, forcerefresh=False):
    """Build (or load from cache) the table of cards listed per commander.

    If the local cache file ``F_EDHREC_CACHE`` exists and `forcerefresh` is
    False, the cached csv is returned as is. Otherwise every commander found
    by `commanders` is looked up with `commander_summary`, the results are
    stacked, written to the cache file and returned.

    args:
        baseurl: url prefix used both for the commander listing and for the
            individual commander pages
        forcerefresh: when True, ignore any cache file and download again

    returns:
        pandas.DataFrame with columns ``name`` (card), ``commander`` and
        ``num_decks``; an empty frame if no commander page yielded any rows
    """
    # the cached version is good enough unless a refresh was requested
    if not forcerefresh and os.path.isfile(F_EDHREC_CACHE):
        return pd.read_csv(F_EDHREC_CACHE)

    # copy so that the column edits below act on an independent frame rather
    # than on a slice of what `commanders` returned
    df_commanders = commanders(baseurl)[['name', 'url', 'num_decks']].copy()
    df_commanders['commander_name'] = df_commanders.url.str.extract(
        '/commanders/(.*)', expand=False
    )
    df_commanders = df_commanders.drop(['url'], axis=1).drop_duplicates()

    # collect the per-commander frames and concatenate once at the end;
    # growing a frame with pd.concat inside the loop re-copies it every time
    frames = []
    # column order at this point: name, num_decks, commander_name
    rows = df_commanders.itertuples(index=False, name=None)
    for (fullname, num_decks, urlname) in rows:
        # forward baseurl so a non-default site is used for these pages too
        dfnow = commander_summary(urlname, baseurl=baseurl)
        if dfnow.empty:
            continue

        dfnow = dfnow[['name']].copy()
        dfnow['commander'] = fullname
        dfnow['num_decks'] = num_decks
        frames.append(dfnow)

    # pd.concat refuses an empty list, so fall back to an empty frame
    df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    df.to_csv(F_EDHREC_CACHE, index=False)
    return df
def _parse_edhrec_cardlist(url, include_multicards=False):
    """Download an edhrec page and flatten its embedded card lists.

    The page is expected to carry a ``<script>`` inside
    ``div.container`` whose text starts with ``const json_dict = ``; the json
    that follows is parsed and its ``cardlists`` entry is turned into rows.

    args:
        url: page to download
        include_multicards: when True, emit one row per entry of a cardview's
            ``cards`` list (cardviews without cards produce no rows); when
            False, emit exactly one row per cardview, taking the card-level
            fields from the first entry of ``cards`` if there is one

    returns:
        pandas.DataFrame with columns ``cardlist_tag``, ``url``, ``label``,
        ``name``, ``price``, ``cardkingdom_url``, ``variation``,
        ``is_commander``, ``is_banned``, ``is_unofficial`` and ``image``; a
        column-less empty frame when ``cardlists`` is null or yields no rows

    raises:
        IndexError: no script containing ``json_dict`` was found
        AssertionError: the script does not start with the expected prefix
        KeyError: the json has no ``cardlists`` entry, or a cardlist/cardview
            lacks one of the mandatory keys
    """
    resp = requests.get(url)
    root = lxml.html.fromstring(resp.text)

    # awesome dirty hack: json already built and embedded
    js = [
        script.text
        for script in root.xpath('.//div[@class="container"]/script')
        if script.text and 'json_dict' in script.text
    ][0]

    startstr = 'const json_dict = '
    # explicit raise (same exception type as the former assert) so the check
    # is not stripped when python runs with -O
    if not js.startswith(startstr):
        raise AssertionError(
            'embedded script does not start with {!r}'.format(startstr)
        )
    # drop the prefix and the last character (presumably a trailing ';' --
    # TODO confirm against a live page)
    js = js[len(startstr):-1]

    j = json.loads(js)['cardlists']
    if j is None:
        # no info, empty df is okay
        return pd.DataFrame()

    rows = []
    for cardlist in j:
        for cardview in cardlist['cardviews']:
            # a missing or null 'cards' entry is treated as "no cards"
            cards = cardview.get('cards') or []
            if include_multicards:
                rows.extend(
                    _cardview_row(cardlist, cardview, card) for card in cards
                )
            else:
                first_card = cards[0] if cards else {}
                rows.append(_cardview_row(cardlist, cardview, first_card))

    return pd.DataFrame(rows)


def _cardview_row(cardlist, cardview, card):
    """Flatten one (cardlist, cardview, card) combination into a row dict."""
    # 'cardkingdom' may be absent or null; both mean "no listing"
    cardkingdom = cardview.get('cardkingdom') or {}
    return {
        'cardlist_tag': cardlist['tag'],
        'url': cardview['url'],
        'label': cardview['label'],
        'name': cardview['name'],
        'price': cardkingdom.get('price'),
        # kept apart from 'url' above: reusing the 'url' key here would
        # silently overwrite the cardview url
        'cardkingdom_url': cardkingdom.get('url'),
        'variation': cardkingdom.get('variation'),
        'is_commander': card.get('is_commander'),
        'is_banned': card.get('is_banned'),
        'is_unofficial': card.get('is_unofficial'),
        'image': card.get('image'),
    }
def main():
    """Placeholder entry point: takes no arguments and does nothing.

    returns:
        None
    """
    return None
# ----------------------------- #
# Command line #
# ----------------------------- #
def parse_args():
    """Parse the command line options of this script.

    The only option defined is the boolean example flag ``-x`` / ``--xample``.

    returns:
        argparse.Namespace holding the parsed options
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-x", "--xample", help="An Example", action='store_true'
    )
    args = parser.parse_args()

    # the module logger is named LOGGER; the lowercase `logger` used here
    # before was never defined and raised NameError on every call
    LOGGER.debug("arguments set to %s", vars(args))
    return args
# script entry point: parse the command line, then hand over to main()
if __name__ == "__main__":
    args = parse_args()
    main()