From e1ec4793aee6a4aec51916ede391adadbf1e2cba Mon Sep 17 00:00:00 2001 From: Jaidev Deshpande Date: Tue, 23 Feb 2021 14:21:28 +0530 Subject: [PATCH 01/28] WIP --- gramex/handlers/__init__.py | 3 +- gramex/handlers/mlhandler.py | 254 +++++++++++++++++++++++++++-------- 2 files changed, 201 insertions(+), 56 deletions(-) diff --git a/gramex/handlers/__init__.py b/gramex/handlers/__init__.py index 9632acfd9..530ee9ab3 100644 --- a/gramex/handlers/__init__.py +++ b/gramex/handlers/__init__.py @@ -16,7 +16,7 @@ from .pptxhandler import PPTXHandler from .proxyhandler import ProxyHandler from .modelhandler import ModelHandler -from .mlhandler import MLHandler +from .mlhandler import MLHandler, TransformersHandler from .filterhandler import FilterHandler from .drivehandler import DriveHandler @@ -59,6 +59,7 @@ 'ModelHandler', 'ML', 'MLHandler', + 'TransformersHandler', 'PPTXHandler', 'ProcessHandler', 'Proxy', diff --git a/gramex/handlers/mlhandler.py b/gramex/handlers/mlhandler.py index a0b162271..83f35f4c8 100644 --- a/gramex/handlers/mlhandler.py +++ b/gramex/handlers/mlhandler.py @@ -16,12 +16,21 @@ import pydoc from sklearn.base import BaseEstimator from sklearn.compose import ColumnTransformer -from sklearn.metrics import accuracy_score +from sklearn.metrics import accuracy_score, roc_auc_score from sklearn.pipeline import Pipeline -from sklearn.preprocessing import OneHotEncoder, StandardScaler +from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder from slugify import slugify from tornado.gen import coroutine from tornado.web import HTTPError +try: + from transformers import pipeline, TextClassificationPipeline + from transformers import AutoModelForSequenceClassification + from transformers import Trainer, TrainingArguments + import torch + TRANSFORMERS_INSTALLED = True +except ImportError: + TRANSFORMERS_INSTALLED = False + op = os.path DATA_CACHE = defaultdict(dict) @@ -38,7 +47,7 @@ 'statsmodels.tsa.api', 'tensorflow.keras.applications' ] -TRAINING_DEFAULTS = { +SKLEARN_DEFAULTS = { 'include': [], 'exclude': [], 'dropna': True, @@ -48,6 +57,14 @@ 'cats': [], 'target_col': None, } +TRANSFORMERS_DEFAULTS = dict( + num_train_epochs=1, + per_device_train_batch_size=16, + per_device_eval_batch_size=32, + weight_decay=0.01, + warmup_steps=100, +) +SENTIMENT_LENC = LabelEncoder().fit(['NEGATIVE', 'POSITIVE']) DEFAULT_TEMPLATE = op.join(op.dirname(__file__), '..', 'apps', 'mlhandler', 'template.html') _prediction_col = '_prediction' @@ -100,7 +117,48 @@ def is_categorical(s, num_treshold=0.1): return True -class MLHandler(FormHandler): +def move_to_cpu(model): + if isinstance(model, TextClassificationPipeline): + model.model.to('cpu') + else: + model.to('cpu') + + +class SentimentDataset(torch.utils.data.Dataset): + def __init__(self, encodings, labels): + self.encodings = encodings + self.labels = labels + + def __getitem__(self, idx): + item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()} + item['labels'] = torch.tensor(self.labels[idx]) + return item + + def __len__(self): + return len(self.labels) + + +class BaseMLHandler(FormHandler): + + @classmethod + def setup(cls, data=None, model=None, config_dir='', **kwargs): + cls.slug = slugify(cls.name) + # Create the config store directory + if not op.isdir(config_dir): + config_dir = op.join(gramex.config.variables['GRAMEXDATA'], 'apps', 'mlhandler', + cls.slug) + _mkdir(config_dir) + cls.config_dir = config_dir + cls.config_store = cache.JSONStore(op.join(cls.config_dir, 'config.json'), flush=None) + cls.data_store = op.join(cls.config_dir, 'data.h5') + + # Create the uploads directory + cls.uploads_dir = op.join(config_dir, 'uploads') + _mkdir(cls.uploads_dir) + + cls.template = kwargs.pop('template', True) + + super(BaseMLHandler, cls).setup(**kwargs) @classmethod def store_data(cls, df, append=False): @@ -117,14 +175,14 @@ def load_data(cls): @classmethod def get_opt(cls, key, default=None): - if key in TRAINING_DEFAULTS: - return cls.config_store.load('transform', {}).get(key, TRAINING_DEFAULTS[key]) + if key in SKLEARN_DEFAULTS: + return cls.config_store.load('transform', {}).get(key, SKLEARN_DEFAULTS[key]) if key in ('class', 'params'): return cls.config_store.load('model', {}).get(key, default) @classmethod def set_opt(cls, key, value): - if key in TRAINING_DEFAULTS: + if key in SKLEARN_DEFAULTS: transform = cls.config_store.load('transform', {}) transform[key] = value cls.config_store.dump('transform', transform) @@ -140,20 +198,58 @@ def set_opt(cls, key, value): cls.config_store.changed = True cls.config_store.flush() + def _transform(self, data, **kwargs): + raise NotImplementedError + + def _parse_data(self, _cache=True): + # First look in self.request.files + if len(self.request.files) > 0: + dfs = [] + for _, files in self.request.files.items(): + for f in files: + outpath = op.join(self.uploads_dir, f['filename']) + with open(outpath, 'wb') as fout: + fout.write(f['body']) + if outpath.endswith('.json'): + xdf = cache.open(outpath, pd.read_json) + else: + xdf = cache.open(outpath) + dfs.append(xdf) + os.remove(outpath) + data = pd.concat(dfs, axis=0) + # Otherwise look in request.body + else: + if self.request.headers.get('Content-Type', '') == 'application/json': + try: + data = pd.read_json(self.request.body.decode('utf8')) + except ValueError: + data = self.load_data() + _cache = False + else: + data = pd.DataFrame.from_dict(parse_qs(self.request.body.decode('utf8'))) + if _cache: + self.store_data(data) + if len(data) == 0: + data = self.load_data() + return data + + +class MLHandler(BaseMLHandler): + @classmethod - def setup(cls, data=None, model=None, config_dir='', **kwargs): - cls.slug = slugify(cls.name) - if not op.isdir(config_dir): - config_dir = op.join(gramex.config.variables['GRAMEXDATA'], 'apps', 'mlhandler', - cls.slug) - _mkdir(config_dir) - cls.config_dir = config_dir - cls.uploads_dir = op.join(config_dir, 'uploads') - _mkdir(cls.uploads_dir) - cls.config_store = cache.JSONStore(op.join(cls.config_dir, 'config.json'), flush=None) - cls.data_store = op.join(cls.config_dir, 'data.h5') - cls.template = kwargs.pop('template', True) - super(MLHandler, cls).setup(**kwargs) + def setup(cls, data=None, model=None, backend='sklearn', config_dir='', **kwargs): + + # From filehanlder: do the following + # cls.post = cls.put = cls.delete = cls.patch = cls.options = cls.get + # for clnmame in CLASSES: + # setattr(cls, method) = getattr(clname, method) + super(MLHandler, cls).setup(data, model, config_dir, **kwargs) + # if backend == 'sklearn': + # SklearnHandler.fit(**kwargs) + # elif backend == 'transformers': + # NLPHAndler.fit(**kwargs) + + # Handle data if provided in the YAML config. if isinstance(data, str): data = cache.open(data) elif isinstance(data, dict): @@ -173,7 +269,7 @@ def setup(cls, data=None, model=None, config_dir='', **kwargs): model_path = model.pop('path', default_model_path) # store the model kwargs from gramex.yaml into the store - for key in TRAINING_DEFAULTS: + for key in SKLEARN_DEFAULTS: kwarg = model.get(key, False) if not cls.get_opt(key, False) and kwarg: cls.set_opt(key, kwarg) @@ -342,38 +438,6 @@ def _predict(self, data, score_col=False, transform=True): data[self.get_opt('target_col', _prediction_col)] = self.model.predict(data) return data - def _parse_data(self, _cache=True): - # First look in self.request.files - if len(self.request.files) > 0: - dfs = [] - for _, files in self.request.files.items(): - for f in files: - outpath = op.join(self.uploads_dir, f['filename']) - with open(outpath, 'wb') as fout: - fout.write(f['body']) - if outpath.endswith('.json'): - xdf = cache.open(outpath, pd.read_json) - else: - xdf = cache.open(outpath) - dfs.append(xdf) - os.remove(outpath) - data = pd.concat(dfs, axis=0) - # Otherwise look in request.body - else: - if self.request.headers.get('Content-Type', '') == 'application/json': - try: - data = pd.read_json(self.request.body.decode('utf8')) - except ValueError: - data = self.load_data() - _cache = False - else: - data = pd.DataFrame.from_dict(parse_qs(self.request.body.decode('utf8'))) - if _cache: - self.store_data(data) - if len(data) == 0: - data = self.load_data() - return data - def _coerce_model_params(self, mclass=None, params=None): # If you need params for self.model, use mclass, don't rely on self.model attribute # if self.model: @@ -582,7 +646,7 @@ def put(self, *path_args, **path_kwargs): os.remove(self.model_path) self.set_opt('params', params) - for opt, default in TRAINING_DEFAULTS.items(): + for opt, default in SKLEARN_DEFAULTS.items(): if opt in self.args: val = self.args.pop(opt) if not isinstance(default, list): @@ -598,7 +662,7 @@ def put(self, *path_args, **path_kwargs): def delete(self, *path_args, **path_kwargs): if '_model' in self.args: if '_opts' in self.args: - for k, default in TRAINING_DEFAULTS.items(): + for k, default in SKLEARN_DEFAULTS.items(): if k in self.args: self.set_opt(k, default) elif op.exists(self.model_path): @@ -606,3 +670,83 @@ def delete(self, *path_args, **path_kwargs): self.config_store.purge() if '_cache' in self.args: self.store_data(pd.DataFrame()) + + +class TransformersHandler(BaseMLHandler): + + def _merge_train_opts(self): + kwargs = {k: self.get_arg(k, TRANSFORMERS_DEFAULTS.get(k)) for k in TRANSFORMERS_DEFAULTS} + kwargs = {k: type(TRANSFORMERS_DEFAULTS.get(k))(v) for k, v in kwargs.items()} + return kwargs + + @classmethod + def load_model(cls, task, model): + if model is None: + model = {} + path = model.get('path', False) + if path: + if op.isdir(path): + model = AutoModelForSequenceClassification.from_pretrained(path, device=-1) + else: + model = cache.open(task, pipeline, device=-1) + else: + path = op.join( + gramex.config.variables['GRAMEXDATA'], 'apps', 'mlhandler', + slugify(cls.name), 'model') + model = cache.open(task, pipeline, device=-1) + _mkdir(path) + cls.model_path = path + cls.model = model + + @classmethod + def setup(cls, task, model=None, **kwargs): + if not TRANSFORMERS_INSTALLED: + raise ImportError('pip install transformers') + super(TransformersHandler, cls).setup(**kwargs) + cls.load_model(task, model) + + def _train(self, data): + if isinstance(self.model, TextClassificationPipeline): + model = self.model.model + else: + model = self.model + enc = self.model.tokenizer(data['text'].tolist(), truncation=True, padding=True) + labels = SENTIMENT_LENC.transform(data['label']) + train_dataset = SentimentDataset(enc, labels) + model_output_dir = op.join(op.dirname(self.model_path), 'results') + model_log_dir = op.join(op.dirname(self.model_path), 'logs') + trargs = TrainingArguments( + output_dir=model_output_dir, logging_dir=model_log_dir, **self._merge_train_opts()) + Trainer(model=model, args=trargs, train_dataset=train_dataset).train() + self.model.save_pretrained(self.model_path) + move_to_cpu(self.model) + pred = self._predict(data) + res = { + 'roc_auc': roc_auc_score( + labels, SENTIMENT_LENC.transform([c['label'] for c in pred])) + } + return res + + def _predict(self, data): + return self.model(data['text'].tolist()) + + def _score(self, data): + pred = self._predict(data) + score = roc_auc_score( + *map(SENTIMENT_LENC.transform, (data['label'], [c['label'] for c in pred]))) + return {'roc_auc': score} + + @coroutine + def get(self, *path_args, **path_kwargs): + text = self.get_argument('text') + result = self.model(text) + self.write(json.dumps(result, indent=2)) + + @coroutine + def post(self, *path_args, **path_kwargs): + # Data should always be present as [{'text': ..., 'label': ...}, {'text': ...}] arrays + data = self._parse_data(_cache=False) + action = self.args.get('_action', ['predict'])[0] + move_to_cpu(self.model) + res = yield gramex.service.threadpool.submit(getattr(self, f'_{action}'), data=data) + self.write(json.dumps(res, indent=2, cls=CustomJSONEncoder)) From 4f5b50d0e2198dfa80572a9230283ffeae81e3b2 Mon Sep 17 00:00:00 2001 From: Jaidev Deshpande Date: Wed, 24 Feb 2021 09:29:12 +0530 Subject: [PATCH 02/28] WIP: MLHandler support for Huggingface transformers --- gramex/dl_utils.py | 16 ++ gramex/handlers/__init__.py | 3 +- gramex/handlers/mlhandler.py | 326 ++++++++++++++++++----------------- 3 files changed, 184 insertions(+), 161 deletions(-) create mode 100644 gramex/dl_utils.py diff --git a/gramex/dl_utils.py b/gramex/dl_utils.py new file mode 100644 index 000000000..1028626f0 --- /dev/null +++ b/gramex/dl_utils.py @@ -0,0 +1,16 @@ +from torch.utils.data import Dataset +from torch import tensor + + +class SentimentDataset(Dataset): + def __init__(self, encodings, labels): + self.encodings = encodings + self.labels = labels + + def __getitem__(self, idx): + item = {key: tensor(val[idx]) for key, val in self.encodings.items()} + item['labels'] = tensor(self.labels[idx]) + return item + + def __len__(self): + return len(self.labels) diff --git a/gramex/handlers/__init__.py b/gramex/handlers/__init__.py index 530ee9ab3..9632acfd9 100644 --- a/gramex/handlers/__init__.py +++ b/gramex/handlers/__init__.py @@ -16,7 +16,7 @@ from .pptxhandler import PPTXHandler from .proxyhandler import ProxyHandler from .modelhandler import ModelHandler -from .mlhandler import MLHandler, TransformersHandler +from .mlhandler import MLHandler from .filterhandler import FilterHandler from .drivehandler import DriveHandler @@ -59,7 +59,6 @@ 'ModelHandler', 'ML', 'MLHandler', - 'TransformersHandler', 'PPTXHandler', 'ProcessHandler', 'Proxy', diff --git a/gramex/handlers/mlhandler.py b/gramex/handlers/mlhandler.py index 83f35f4c8..82bb16126 100644 --- a/gramex/handlers/mlhandler.py +++ b/gramex/handlers/mlhandler.py @@ -24,9 +24,9 @@ from tornado.web import HTTPError try: from transformers import pipeline, TextClassificationPipeline - from transformers import AutoModelForSequenceClassification + from transformers import AutoModelForSequenceClassification, AutoTokenizer # NOQA: F401 from transformers import Trainer, TrainingArguments - import torch + from gramex.dl_utils import SentimentDataset TRANSFORMERS_INSTALLED = True except ImportError: TRANSFORMERS_INSTALLED = False @@ -84,6 +84,40 @@ def _fit(model, x, y, path=None, name=None): return model +def _train_transformer(model, data, model_path, **kwargs): + # if isinstance(model, TextClassificationPipeline): + # model = model.model + # else: + # model = model + enc = model.tokenizer(data['text'].tolist(), truncation=True, padding=True) + labels = SENTIMENT_LENC.transform(data['label']) + train_dataset = SentimentDataset(enc, labels) + model_output_dir = op.join(op.dirname(model_path), 'results') + model_log_dir = op.join(op.dirname(model_path), 'logs') + trargs = TrainingArguments( + output_dir=model_output_dir, logging_dir=model_log_dir, **kwargs) + Trainer(model=model.model, args=trargs, train_dataset=train_dataset).train() + model.save_pretrained(model_path) + move_to_cpu(model) + pred = _predict_transformer(model, data) + res = { + 'roc_auc': roc_auc_score( + labels, SENTIMENT_LENC.transform([c['label'] for c in pred])) + } + return res + + +def _predict_transformer(model, data): + return model(data['text'].tolist()) + + +def _score_transformer(model, data): + pred = _predict_transformer(model, data) + score = roc_auc_score( + *map(SENTIMENT_LENC.transform, (data['label'], [c['label'] for c in pred]))) + return {'roc_auc': score} + + def search_modelclass(mclass): for module in MLCLASS_MODULES: cls = pydoc.locate(f'{module}.{mclass}') @@ -124,20 +158,6 @@ def move_to_cpu(model): model.to('cpu') -class SentimentDataset(torch.utils.data.Dataset): - def __init__(self, encodings, labels): - self.encodings = encodings - self.labels = labels - - def __getitem__(self, idx): - item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()} - item['labels'] = torch.tensor(self.labels[idx]) - return item - - def __len__(self): - return len(self.labels) - - class BaseMLHandler(FormHandler): @classmethod @@ -233,6 +253,33 @@ def _parse_data(self, _cache=True): data = self.load_data() return data + def _coerce_transformers_opts(self): + kwargs = {k: self.get_arg(k, TRANSFORMERS_DEFAULTS.get(k)) for k in TRANSFORMERS_DEFAULTS} + kwargs = {k: type(TRANSFORMERS_DEFAULTS.get(k))(v) for k, v in kwargs.items()} + return kwargs + + @classmethod + def load_transformer(cls, task, model): + if model is None: + model = {} + path = model.get('path', False) + # if path: + # if op.isdir(path): + # model = AutoModelForSequenceClassification.from_pretrained(path, device=-1) + # tokenizer = AutoTokenizer.from_pretrained(path, device=-1) + # model = pipeline(task, model=model, tokenizer=tokenizer, device=-1) + # else: + # model = cache.open(task, pipeline, device=-1) + # else: + if not path: + path = op.join( + gramex.config.variables['GRAMEXDATA'], 'apps', 'mlhandler', + slugify(cls.name), 'model') + model = cache.open(task, pipeline, device=-1) + _mkdir(path) + cls.model_path = path + cls.model = model + class MLHandler(BaseMLHandler): @@ -243,95 +290,103 @@ def setup(cls, data=None, model=None, backend='sklearn', config_dir='', **kwargs # cls.post = cls.put = cls.delete = cls.patch = cls.options = cls.get # for clnmame in CLASSES: # setattr(cls, method) = getattr(clname, method) - super(MLHandler, cls).setup(data, model, config_dir, **kwargs) + task = kwargs.pop('task', False) # if backend == 'sklearn': # SklearnHandler.fit(**kwargs) # elif backend == 'transformers': - # NLPHAndler.fit(**kwargs) - - # Handle data if provided in the YAML config. - if isinstance(data, str): - data = cache.open(data) - elif isinstance(data, dict): - data = gdata.filter(**data) + # NLPHandler.fit(**kwargs) + if backend != 'sklearn': + if not TRANSFORMERS_INSTALLED: + raise ImportError('pip install transformers') + super(MLHandler, cls).setup(**kwargs) + cls.load_transformer(task, model) + cls.get = NLPHandler.get + cls.post = NLPHandler.post else: - data = None - if data is not None: - cls.store_data(data) - - # parse model kwargs - if model is None: - model = {} - - default_model_path = op.join( - gramex.config.variables['GRAMEXDATA'], 'apps', 'mlhandler', - slugify(cls.name) + '.pkl') - model_path = model.pop('path', default_model_path) - - # store the model kwargs from gramex.yaml into the store - for key in SKLEARN_DEFAULTS: - kwarg = model.get(key, False) - if not cls.get_opt(key, False) and kwarg: - cls.set_opt(key, kwarg) - if op.exists(model_path): # If the pkl exists, load it - cls.model = joblib.load(model_path) - cls.model_path = model_path - target_col = model.get('target_col', False) - if target_col: - cls.set_opt('target_col', target_col) + super(MLHandler, cls).setup(data, model, config_dir, **kwargs) + # Handle data if provided in the YAML config. + if isinstance(data, str): + data = cache.open(data) + elif isinstance(data, dict): + data = gdata.filter(**data) else: - target_col = cls.get_opt('target_col') - else: # build the model - mclass = cls.get_opt('class', model.get('class', False)) - params = cls.get_opt('params', {}) - if not params: - params = model.get('params', {}) - if mclass: - cls.model = search_modelclass(mclass)(**params) - cls.set_opt('class', mclass) - else: - cls.model = None - # Params MUST come after class, or they will be ignored. - cls.set_opt('params', params) + data = None + if data is not None: + cls.store_data(data) - if model_path: # if a path is specified, use to to store the model - cls.model_path = model_path - else: # or create our own path - cls.model_path = default_model_path - _mkdir(op.dirname(cls.model_path)) + # parse model kwargs + if model is None: + model = {} - # train the model - target_col = model.get('target_col', False) - if target_col: - cls.set_opt('target_col', target_col) - else: - target_col = cls.get_opt('target_col', False) - if cls.model is not None and not target_col: - app_log.warning('Target column not defined. Nothing to do.') - else: - if cls.model is not None: - if data is not None: - # filter columns - data = cls._filtercols(data) - - # filter rows - data = cls._filterrows(data) - - # assemble the pipeline - if model.get('pipeline', True): - cls.model = cls._get_pipeline(data) - else: - cls.model = search_modelclass(mclass)(**params) - - # train the model - target = data[target_col] - train = data[[c for c in data if c != target_col]] - if model.get('async', True): - gramex.service.threadpool.submit( - _fit, cls.model, train, target, cls.model_path, cls.name) - else: - _fit(cls.model, train, target, cls.model_path, cls.name) - cls.config_store.flush() + default_model_path = op.join( + gramex.config.variables['GRAMEXDATA'], 'apps', 'mlhandler', + slugify(cls.name) + '.pkl') + model_path = model.pop('path', default_model_path) + + # store the model kwargs from gramex.yaml into the store + for key in SKLEARN_DEFAULTS: + kwarg = model.get(key, False) + if not cls.get_opt(key, False) and kwarg: + cls.set_opt(key, kwarg) + if op.exists(model_path): # If the pkl exists, load it + cls.model = joblib.load(model_path) + cls.model_path = model_path + target_col = model.get('target_col', False) + if target_col: + cls.set_opt('target_col', target_col) + else: + target_col = cls.get_opt('target_col') + else: # build the model + mclass = cls.get_opt('class', model.get('class', False)) + params = cls.get_opt('params', {}) + if not params: + params = model.get('params', {}) + if mclass: + cls.model = search_modelclass(mclass)(**params) + cls.set_opt('class', mclass) + else: + cls.model = None + # Params MUST come after class, or they will be ignored. + cls.set_opt('params', params) + + if model_path: # if a path is specified, use to to store the model + cls.model_path = model_path + else: # or create our own path + cls.model_path = default_model_path + _mkdir(op.dirname(cls.model_path)) + + # train the model + target_col = model.get('target_col', False) + if target_col: + cls.set_opt('target_col', target_col) + else: + target_col = cls.get_opt('target_col', False) + if cls.model is not None and not target_col: + app_log.warning('Target column not defined. Nothing to do.') + else: + if cls.model is not None: + if data is not None: + # filter columns + data = cls._filtercols(data) + + # filter rows + data = cls._filterrows(data) + + # assemble the pipeline + if model.get('pipeline', True): + cls.model = cls._get_pipeline(data) + else: + cls.model = search_modelclass(mclass)(**params) + + # train the model + target = data[target_col] + train = data[[c for c in data if c != target_col]] + if model.get('async', True): + gramex.service.threadpool.submit( + _fit, cls.model, train, target, cls.model_path, cls.name) + else: + _fit(cls.model, train, target, cls.model_path, cls.name) + cls.config_store.flush() @classmethod def _filtercols(cls, data): @@ -672,74 +727,19 @@ def delete(self, *path_args, **path_kwargs): self.store_data(pd.DataFrame()) -class TransformersHandler(BaseMLHandler): - - def _merge_train_opts(self): - kwargs = {k: self.get_arg(k, TRANSFORMERS_DEFAULTS.get(k)) for k in TRANSFORMERS_DEFAULTS} - kwargs = {k: type(TRANSFORMERS_DEFAULTS.get(k))(v) for k, v in kwargs.items()} - return kwargs +class NLPHandler(BaseMLHandler): @classmethod - def load_model(cls, task, model): - if model is None: - model = {} - path = model.get('path', False) - if path: - if op.isdir(path): - model = AutoModelForSequenceClassification.from_pretrained(path, device=-1) - else: - model = cache.open(task, pipeline, device=-1) - else: - path = op.join( - gramex.config.variables['GRAMEXDATA'], 'apps', 'mlhandler', - slugify(cls.name), 'model') - model = cache.open(task, pipeline, device=-1) - _mkdir(path) - cls.model_path = path - cls.model = model - - @classmethod - def setup(cls, task, model=None, **kwargs): + def setup(cls, task, model=None, config_dir='', **kwargs): if not TRANSFORMERS_INSTALLED: raise ImportError('pip install transformers') - super(TransformersHandler, cls).setup(**kwargs) - cls.load_model(task, model) - - def _train(self, data): - if isinstance(self.model, TextClassificationPipeline): - model = self.model.model - else: - model = self.model - enc = self.model.tokenizer(data['text'].tolist(), truncation=True, padding=True) - labels = SENTIMENT_LENC.transform(data['label']) - train_dataset = SentimentDataset(enc, labels) - model_output_dir = op.join(op.dirname(self.model_path), 'results') - model_log_dir = op.join(op.dirname(self.model_path), 'logs') - trargs = TrainingArguments( - output_dir=model_output_dir, logging_dir=model_log_dir, **self._merge_train_opts()) - Trainer(model=model, args=trargs, train_dataset=train_dataset).train() - self.model.save_pretrained(self.model_path) - move_to_cpu(self.model) - pred = self._predict(data) - res = { - 'roc_auc': roc_auc_score( - labels, SENTIMENT_LENC.transform([c['label'] for c in pred])) - } - return res - - def _predict(self, data): - return self.model(data['text'].tolist()) - - def _score(self, data): - pred = self._predict(data) - score = roc_auc_score( - *map(SENTIMENT_LENC.transform, (data['label'], [c['label'] for c in pred]))) - return {'roc_auc': score} + super(NLPHandler, cls).setup(**kwargs) + cls.load_transformer(task, model) @coroutine def get(self, *path_args, **path_kwargs): text = self.get_argument('text') - result = self.model(text) + result = yield gramex.service.threadpool.submit(_predict_transformer, self.model, text) self.write(json.dumps(result, indent=2)) @coroutine @@ -748,5 +748,13 @@ def post(self, *path_args, **path_kwargs): data = self._parse_data(_cache=False) action = self.args.get('_action', ['predict'])[0] move_to_cpu(self.model) - res = yield gramex.service.threadpool.submit(getattr(self, f'_{action}'), data=data) + if action == 'train': + kwargs = self._coerce_transformers_opts() + kwargs['model_path'] = self.model_path + else: + kwargs = {} + res = yield gramex.service.threadpool.submit( + globals()[f'_{action}_transformer'], self.model, data=data, + **kwargs + ) self.write(json.dumps(res, indent=2, cls=CustomJSONEncoder)) From 52e72cb41c4a7ff5af4f440cd81dbc9af42cf8fb Mon Sep 17 00:00:00 2001 From: Jaidev Deshpande Date: Wed, 24 Feb 2021 11:34:16 +0530 Subject: [PATCH 03/28] WIP --- gramex/handlers/mlhandler.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/gramex/handlers/mlhandler.py b/gramex/handlers/mlhandler.py index 82bb16126..ab7ce79ea 100644 --- a/gramex/handlers/mlhandler.py +++ b/gramex/handlers/mlhandler.py @@ -108,7 +108,9 @@ def _train_transformer(model, data, model_path, **kwargs): def _predict_transformer(model, data): - return model(data['text'].tolist()) + if isinstance(data, (str, list)): + return model(data) + return model(data['text'].tolist()) def _score_transformer(model, data): @@ -738,7 +740,7 @@ def setup(cls, task, model=None, config_dir='', **kwargs): @coroutine def get(self, *path_args, **path_kwargs): - text = self.get_argument('text') + text = self.get_arguments('text') result = yield gramex.service.threadpool.submit(_predict_transformer, self.model, text) self.write(json.dumps(result, indent=2)) From 4b016341d9279808155b1bc5899eb8ba6cf3df4d Mon Sep 17 00:00:00 2001 From: Jaidev Deshpande Date: Thu, 25 Feb 2021 10:28:56 +0530 Subject: [PATCH 04/28] WIP --- gramex/handlers/mlhandler.py | 66 ++++++++++++------------------------ tests/test_mlhandler.py | 2 +- 2 files changed, 23 insertions(+), 45 deletions(-) diff --git a/gramex/handlers/mlhandler.py b/gramex/handlers/mlhandler.py index ab7ce79ea..a2af5596d 100644 --- a/gramex/handlers/mlhandler.py +++ b/gramex/handlers/mlhandler.py @@ -43,9 +43,6 @@ 'sklearn.neighbors', 'sklearn.neural_network', 'sklearn.naive_bayes', - 'statsmodels.api', - 'statsmodels.tsa.api', - 'tensorflow.keras.applications' ] SKLEARN_DEFAULTS = { 'include': [], @@ -69,18 +66,12 @@ _prediction_col = '_prediction' -def df2url(df): - s = ['&'.join([f'{k}={v}' for k, v in r.items()]) for r in df.to_dict(orient='records')] - return '&'.join(s) - - def _fit(model, x, y, path=None, name=None): app_log.info('Starting training...') getattr(model, 'partial_fit', model.fit)(x, y) app_log.info('Done training...') - if path: - joblib.dump(model, path) - app_log.info(f'{name}: Model saved at {path}.') + joblib.dump(model, path) + app_log.info(f'{name}: Model saved at {path}.') return model @@ -99,7 +90,7 @@ def _train_transformer(model, data, model_path, **kwargs): Trainer(model=model.model, args=trargs, train_dataset=train_dataset).train() model.save_pretrained(model_path) move_to_cpu(model) - pred = _predict_transformer(model, data) + pred = model(data['text'].tolist()) res = { 'roc_auc': roc_auc_score( labels, SENTIMENT_LENC.transform([c['label'] for c in pred])) @@ -107,14 +98,8 @@ def _train_transformer(model, data, model_path, **kwargs): return res -def _predict_transformer(model, data): - if isinstance(data, (str, list)): - return model(data) - return model(data['text'].tolist()) - - def _score_transformer(model, data): - pred = _predict_transformer(model, data) + pred = model(data['text'].tolist()) score = roc_auc_score( *map(SENTIMENT_LENC.transform, (data['label'], [c['label'] for c in pred]))) return {'roc_auc': score} @@ -147,9 +132,7 @@ def is_categorical(s, num_treshold=0.1): uniques / count <= num_treshold / log(count) """ if pd.api.types.is_numeric_dtype(s): - if s.nunique() / s.shape[0] <= num_treshold: - return True - return False + return s.nunique() / s.shape[0] <= num_treshold return True @@ -383,11 +366,8 @@ def setup(cls, data=None, model=None, backend='sklearn', config_dir='', **kwargs # train the model target = data[target_col] train = data[[c for c in data if c != target_col]] - if model.get('async', True): - gramex.service.threadpool.submit( - _fit, cls.model, train, target, cls.model_path, cls.name) - else: - _fit(cls.model, train, target, cls.model_path, cls.name) + gramex.service.threadpool.submit( + _fit, cls.model, train, target, cls.model_path, cls.name) cls.config_store.flush() @classmethod @@ -671,9 +651,8 @@ def post(self, *path_args, **path_kwargs): # train the model target = data[target_col] train = data[[c for c in data if c != target_col]] - yield gramex.service.threadpool.submit(_fit, self.model, train, target) - # _fit(self.model, train, target) - joblib.dump(self.model, self.model_path) + yield gramex.service.threadpool.submit( + _fit, self.model, train, target, self.model_path) app_log.info(f'{self.name}: Model saved at {self.model_path}') self.write(json.dumps({'score': self.model.score(train, target)})) super(MLHandler, self).post(*path_args, **path_kwargs) @@ -717,14 +696,12 @@ def put(self, *path_args, **path_kwargs): @coroutine def delete(self, *path_args, **path_kwargs): - if '_model' in self.args: - if '_opts' in self.args: - for k, default in SKLEARN_DEFAULTS.items(): - if k in self.args: - self.set_opt(k, default) - elif op.exists(self.model_path): - os.remove(self.model_path) - self.config_store.purge() + if '_model' in self.args and op.exists(self.model_path): + os.remove(self.model_path) + self.config_store.purge() + for opt in self.get_arguments('_opts'): + if opt in SKLEARN_DEFAULTS: + self.set_opt(opt, SKLEARN_DEFAULTS[opt]) if '_cache' in self.args: self.store_data(pd.DataFrame()) @@ -741,7 +718,7 @@ def setup(cls, task, model=None, config_dir='', **kwargs): @coroutine def get(self, *path_args, **path_kwargs): text = self.get_arguments('text') - result = yield gramex.service.threadpool.submit(_predict_transformer, self.model, text) + result = yield gramex.service.threadpool.submit(self.model, text) self.write(json.dumps(result, indent=2)) @coroutine @@ -750,13 +727,14 @@ def post(self, *path_args, **path_kwargs): data = self._parse_data(_cache=False) action = self.args.get('_action', ['predict'])[0] move_to_cpu(self.model) + kwargs = {} if action == 'train': kwargs = self._coerce_transformers_opts() kwargs['model_path'] = self.model_path + args = _train_transformer, self.model, data + elif action == 'score': + args = _score_transformer, self.model, data else: - kwargs = {} - res = yield gramex.service.threadpool.submit( - globals()[f'_{action}_transformer'], self.model, data=data, - **kwargs - ) + args = self.model, data['text'].tolist() + res = yield gramex.service.threadpool.submit(*args, **kwargs) self.write(json.dumps(res, indent=2, cls=CustomJSONEncoder)) diff --git a/tests/test_mlhandler.py b/tests/test_mlhandler.py index 78f26c340..2d17a8df1 100644 --- a/tests/test_mlhandler.py +++ b/tests/test_mlhandler.py @@ -231,7 +231,7 @@ def test_filtercols(self): pipe = joblib.load(op.join(folder, 'model.pkl')) self.assertEqual(pipe.named_steps['LogisticRegression'].coef_.shape, (3, 1)) finally: - self.get('/mlhandler?_model&_opts&include&exclude', method='delete') + self.get('/mlhandler?&_model&_opts=include&_opts=exclude', method='delete') joblib.dump(clf, op.join(folder, 'model.pkl')) def test_get_bulk_predictions(self, target_col='species'): From 1f85677bee68a6c3b07ed4e289f07e059ea73eb7 Mon Sep 17 00:00:00 2001 From: Jaidev Deshpande Date: Fri, 26 Feb 2021 09:15:39 +0530 Subject: [PATCH 05/28] Ensure label IDs are long ints --- gramex/dl_utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/gramex/dl_utils.py b/gramex/dl_utils.py index 1028626f0..064b7d157 100644 --- a/gramex/dl_utils.py +++ b/gramex/dl_utils.py @@ -1,5 +1,5 @@ from torch.utils.data import Dataset -from torch import tensor +import torch class SentimentDataset(Dataset): @@ -8,8 +8,8 @@ def __init__(self, encodings, labels): self.labels = labels def __getitem__(self, idx): - item = {key: tensor(val[idx]) for key, val in self.encodings.items()} - item['labels'] = tensor(self.labels[idx]) + item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()} + item['labels'] = torch.tensor(self.labels[idx]).to(torch.int64) return item def __len__(self): From 6af50ea3ff0f73137afec4c51f34b7ee3456d7b5 Mon Sep 17 00:00:00 2001 From: Jaidev Deshpande Date: Mon, 1 Mar 2021 13:53:24 +0530 Subject: [PATCH 06/28] FIX: Enforce correct order of features before prediction in MLHandler Required for fixing #377 Since v0.24, sklearn's column transformers need the same order of feature names between .fit and .predict. We can still send URL parameters in any order, but they need to be ordered correctly by the MLHandler. See sklearn's release notes for more: https://scikit-learn.org/stable/whats_new/v0.24.html#sklearn-compose --- gramex/handlers/mlhandler.py | 2 ++ tests/test_mlhandler.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/gramex/handlers/mlhandler.py b/gramex/handlers/mlhandler.py index a0b162271..2c5cc7f60 100644 --- a/gramex/handlers/mlhandler.py +++ b/gramex/handlers/mlhandler.py @@ -339,6 +339,8 @@ def _predict(self, data, score_col=False, transform=True): target = data[score_col] data = data.drop([score_col], axis=1) return self.model.score(data, target) + # Set data in the same order as the transformer requests + data = data[self.model.named_steps['transform']._feature_names_in] data[self.get_opt('target_col', _prediction_col)] = self.model.predict(data) return data diff --git a/tests/test_mlhandler.py b/tests/test_mlhandler.py index 78f26c340..2fc067b63 100644 --- a/tests/test_mlhandler.py +++ b/tests/test_mlhandler.py @@ -265,6 +265,8 @@ def test_get_predictions(self, target_col='species'): 'petal_length': 5.1, 'petal_width': 1.8, target_col: 'virginica'} ]) + resp = self.get( + '/mlhandler?sepal_width=3&petal_length=5.1&sepal_length=5.9&petal_width=1.8') req = '/mlhandler?' samples = [] target = [] From a3fa51f03e134ea4639a9121d64ef452df1f1e84 Mon Sep 17 00:00:00 2001 From: Jaidev Deshpande Date: Mon, 1 Mar 2021 14:06:08 +0530 Subject: [PATCH 07/28] WIP --- gramex/handlers/mlhandler.py | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/gramex/handlers/mlhandler.py b/gramex/handlers/mlhandler.py index a2af5596d..e81fc877f 100644 --- a/gramex/handlers/mlhandler.py +++ b/gramex/handlers/mlhandler.py @@ -2,6 +2,7 @@ from inspect import signature import json import os +from shutil import rmtree from urllib.parse import parse_qs import gramex @@ -248,20 +249,8 @@ def load_transformer(cls, task, model): if model is None: model = {} path = model.get('path', False) - # if path: - # if op.isdir(path): - # model = AutoModelForSequenceClassification.from_pretrained(path, device=-1) - # tokenizer = AutoTokenizer.from_pretrained(path, device=-1) - # model = pipeline(task, model=model, tokenizer=tokenizer, device=-1) - # else: - # model = cache.open(task, pipeline, device=-1) - # else: if not path: - path = op.join( - gramex.config.variables['GRAMEXDATA'], 'apps', 'mlhandler', - slugify(cls.name), 'model') - model = cache.open(task, pipeline, device=-1) - _mkdir(path) + path = None cls.model_path = path cls.model = model @@ -287,6 +276,7 @@ def setup(cls, data=None, model=None, backend='sklearn', config_dir='', **kwargs cls.load_transformer(task, model) cls.get = NLPHandler.get cls.post = NLPHandler.post + cls.delete = NLPHandler.delete else: super(MLHandler, cls).setup(data, model, config_dir, **kwargs) # Handle data if provided in the YAML config. @@ -738,3 +728,7 @@ def post(self, *path_args, **path_kwargs): args = self.model, data['text'].tolist() res = yield gramex.service.threadpool.submit(*args, **kwargs) self.write(json.dumps(res, indent=2, cls=CustomJSONEncoder)) + + @coroutine + def delete(self, *path_args, **path_kwargs): + rmtree(self.model_path) From e6be0d5d53a7d68bbaf219397a885713b63a63a9 Mon Sep 17 00:00:00 2001 From: Jaidev Deshpande Date: Tue, 2 Mar 2021 10:44:54 +0530 Subject: [PATCH 08/28] WIP: MLHandler Refactoring --- gramex/handlers/mlhandler.py | 291 ++++++++++++----------------------- tests/test_mlhandler.py | 54 ++++--- 2 files changed, 130 insertions(+), 215 deletions(-) diff --git a/gramex/handlers/mlhandler.py b/gramex/handlers/mlhandler.py index 9cb957ee0..01cda68bb 100644 --- a/gramex/handlers/mlhandler.py +++ b/gramex/handlers/mlhandler.py @@ -1,7 +1,9 @@ from collections import defaultdict from inspect import signature +from io import BytesIO import json import os +import re from shutil import rmtree from urllib.parse import parse_qs @@ -15,16 +17,15 @@ import joblib import pandas as pd import pydoc -from sklearn.base import BaseEstimator from sklearn.compose import ColumnTransformer -from sklearn.metrics import accuracy_score, roc_auc_score +from sklearn.metrics import roc_auc_score from sklearn.pipeline import Pipeline from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder from slugify import slugify from tornado.gen import coroutine from tornado.web import HTTPError try: - from transformers import pipeline, TextClassificationPipeline + from transformers import pipeline, TextClassificationPipeline # NOQA: F401 from transformers import AutoModelForSequenceClassification, AutoTokenizer # NOQA: F401 from transformers import Trainer, TrainingArguments from gramex.dl_utils import SentimentDataset @@ -150,34 +151,37 @@ class BaseMLHandler(FormHandler): def setup(cls, data=None, model=None, config_dir='', **kwargs): cls.slug = slugify(cls.name) # Create the config store directory - if not op.isdir(config_dir): + if not config_dir: config_dir = op.join(gramex.config.variables['GRAMEXDATA'], 'apps', 'mlhandler', cls.slug) - _mkdir(config_dir) + _mkdir(config_dir) cls.config_dir = config_dir cls.config_store = cache.JSONStore(op.join(cls.config_dir, 'config.json'), flush=None) cls.data_store = op.join(cls.config_dir, 'data.h5') - # Create the uploads directory - cls.uploads_dir = op.join(config_dir, 'uploads') - _mkdir(cls.uploads_dir) - - cls.template = kwargs.pop('template', True) + template = kwargs.pop('template', False) + if not op.isfile(template): + template = DEFAULT_TEMPLATE + cls.template = template super(BaseMLHandler, cls).setup(**kwargs) @classmethod def store_data(cls, df, append=False): - if op.exists(cls.data_store) and append: - df = pd.concat((pd.read_hdf(cls.data_store, 'data'), df), axis=0, ignore_index=True) - df.to_hdf(cls.data_store, 'data') - return df + df.to_hdf(cls.data_store, format="table", key="data", append=append) + try: + rdf = gramex.cache.open(cls.data_store, key="data") + except KeyError: + rdf = df + return rdf @classmethod def load_data(cls): - if op.exists(cls.data_store): - return gramex.cache.open(cls.data_store) - return pd.DataFrame() + try: + df = gramex.cache.open(cls.data_store, key="data") + except (KeyError, FileNotFoundError): + df = pd.DataFrame() + return df @classmethod def get_opt(cls, key, default=None): @@ -213,15 +217,16 @@ def _parse_data(self, _cache=True): dfs = [] for _, files in self.request.files.items(): for f in files: - outpath = op.join(self.uploads_dir, f['filename']) - with open(outpath, 'wb') as fout: - fout.write(f['body']) - if outpath.endswith('.json'): - xdf = cache.open(outpath, pd.read_json) - else: - xdf = cache.open(outpath) + buff = BytesIO(f['body']) + try: + ext = re.sub('^\.', '', op.splitext(f['filename'])[-1]) + if ext == 'json': + xdf = pd.read_json(buff) + else: + xdf = cache.open_callback[ext](buff) + except KeyError: + raise HTTPError(BAD_REQUEST, reason=f"File extension {ext} not supported.") dfs.append(xdf) - os.remove(outpath) data = pd.concat(dfs, axis=0) # Otherwise look in request.body else: @@ -390,17 +395,14 @@ def _get_pipeline(cls, data, force=False): # If the model exists, return it if op.exists(cls.model_path) and not force: return joblib.load(cls.model_path) - # If there's no data, return None - if data is None or not len(data): - return None + # Else assemble the model nums = set(cls.get_opt('nums', [])) cats = set(cls.get_opt('cats', [])) both = nums.intersection(cats) if len(both) > 0: - raise HTTPError( - BAD_REQUEST, - reason=f"Columns {both} cannot be both numerical and categorical.") + raise HTTPError(BAD_REQUEST, + reason=f"Columns {both} cannot be both numerical and categorical.") to_guess = set(data.columns.tolist()) - nums.union(cats) target_col = cls.get_opt('target_col', False) if target_col: @@ -412,6 +414,7 @@ def _get_pipeline(cls, data, force=False): categoricals += list(cats) numericals += list(nums) assert len(set(categoricals) & set(numericals)) == 0 + steps = [] if categoricals: steps.append(('ohe', OneHotEncoder(sparse=False), categoricals)) @@ -422,6 +425,7 @@ def _get_pipeline(cls, data, force=False): mclass = model_kwargs.get('class', False) if mclass: model = search_modelclass(mclass)(**model_kwargs.get('params', {})) + cls.set_opt('params', model.get_params()) return Pipeline([('transform', ct), (model.__class__.__name__, model)]) return cls.model @@ -429,29 +433,8 @@ def _transform(self, data, **kwargs): orgdata = self.load_data() for col in data: data[col] = data[col].astype(orgdata[col].dtype) - # transform columns - include = self.get_opt('include', kwargs.get('include', [])) - if include: - data = data[include] - exclude = self.get_opt('exclude', kwargs.get('exclude', [])) - to_exclude = [c for c in exclude if c in data] - if to_exclude: - data = data.drop(to_exclude, axis=1) - # transform rows - dropna = self.get_opt('dropna', kwargs.get('dropna', True)) - if dropna: - if isinstance(dropna, list) and len(dropna) > 0: - subset = dropna - else: - subset = None - data.dropna(subset=subset, inplace=True) - dedup = self.get_opt('deduplicate', kwargs.get('deduplicate', True)) - if dedup: - if isinstance(dedup, list): - subset = dedup - else: - subset = None - data.drop_duplicates(subset=subset, inplace=True) + data = self._filtercols(data) + data = self._filterrows(data) return data def _predict(self, data, score_col=False, transform=True): @@ -467,123 +450,60 @@ def _predict(self, data, score_col=False, transform=True): data[self.get_opt('target_col', _prediction_col)] = self.model.predict(data) return data - def _coerce_model_params(self, mclass=None, params=None): - # If you need params for self.model, use mclass, don't rely on self.model attribute - # if self.model: - # model_params = self.model.get_params() - # else: - spec = signature(mclass) - m_args = spec.parameters.keys() - if 'self' in m_args: - m_args.remove('self') - m_defaults = {k: v.default for k, v in spec.parameters.items()} - model_params = {k: v for k, v in zip(m_args, m_defaults)} - if not params: - new_params = {k: v[0] for k, v in self.args.items() if k in model_params} - else: - new_params = params - param_types = {} - for k, v in model_params.items(): - if v is None: - param_types[k] = str - else: - param_types[k] = type(v) - return {k: param_types[k](v) for k, v in new_params.items()} - - def _check_model_path(self, error='raise'): + def _check_model_path(self): if not op.exists(self.model_path): msg = f'No model found at {self.model_path}' - if error == 'raise': - raise HTTPError(NOT_FOUND, log_message=msg) - else: - import warnings - warnings.warn(msg) + raise HTTPError(NOT_FOUND, log_message=msg) if self.model is None: self.model = cache.open(self.model_path, joblib.load) @coroutine def get(self, *path_args, **path_kwargs): - if '_download' in self.args: - self.set_header('Content-Type', 'application/octet-strem') - self.set_header('Content-Disposition', - f'attachment; filename={op.basename(self.model_path)}') - self.write(open(self.model_path, 'rb').read()) - elif '_model' in self.args: - self._check_model_path() - if isinstance(self.model, Pipeline): - for k, v in self.model.named_steps.items(): - if k != 'transform': - break - params = v.get_params() - elif isinstance(self.model, BaseEstimator): - params = self.model.get_params() - elif self.model is None: - params = self.get_opt('params') - self.write(json.dumps(params, indent=4)) + if '_params' in self.args: + params = { + 'opts': self.config_store.load('transform'), + 'params': self.config_store.load('model') + } + self.write(json.dumps(params, indent=2)) elif '_cache' in self.args: - if '_opts' in self.args: - self.write(json.dumps(self.config_store.load('transform'))) - self.finish() - elif '_params' in self.args: - self.write(json.dumps(self.config_store.load('model'))) - self.finish() - else: - data = self.load_data() - if len(data): - self.write(data.to_json(orient='records')) - else: - self.write(json.dumps([])) + self.write(self.load_data().to_json(orient='records')) else: self._check_model_path() - self.set_header('Content-Type', 'application/json') - action = self.args.pop('_action', [''])[0] - try: - data = pd.DataFrame.from_dict( - {k: v for k, v in self.args.items() if not k.startswith('_')}) - if len(data) > 0 and not action: - action = 'predict' - except Exception as err: - app_log.debug(err.msg) - data = self.load_data() - if len(data) == 0: - data = self.load_data() - target_col = self.get_opt('target_col') - if target_col in data: - target = data[target_col] - to_predict = data.drop([target_col], axis=1) - else: - target = None - to_predict = data - if action in ('predict', 'score'): - prediction = yield gramex.service.threadpool.submit( - self._predict, to_predict) - if action == 'predict': - self.write(json.dumps(prediction, indent=4, cls=CustomJSONEncoder)) - elif action == 'score': - prediction = prediction[target_col if target_col else _prediction_col] - score = accuracy_score(target.astype(prediction.dtype), - prediction) - self.write(json.dumps({'score': score}, indent=4)) + if '_download' in self.args: + self.set_header('Content-Type', 'application/octet-strem') + self.set_header('Content-Disposition', + f'attachment; filename={op.basename(self.model_path)}') + with open(self.model_path, 'rb') as fout: + self.write(fout.read()) + elif '_model' in self.args: + self.write(json.dumps(self.get_opt('params'), indent=2)) + else: - if isinstance(self.template, str) and op.isfile(self.template): - self.set_header('Content-Type', 'text/html') - # return Template(self.template) - self.render( - self.template, handler=self, - data=self.load_data()) - elif self.template: - self.set_header('Content-Type', 'text/html') - self.render(DEFAULT_TEMPLATE, handler=self, data=self.load_data()) - else: + try: + data = pd.DataFrame.from_dict( + {k: v for k, v in self.args.items() if not k.startswith('_')}) + except Exception as err: + app_log.debug(err.msg) + data = [] + if len(data) > 0: self.set_header('Content-Type', 'application/json') - self.write(json.dumps([])) + target_col = self.get_opt('target_col') + if target_col in data: + data = data.drop([target_col], axis=1) + # if action in ('predict', 'score'): + prediction = yield gramex.service.threadpool.submit( + self._predict, data) + self.write(json.dumps(prediction, indent=2, cls=CustomJSONEncoder)) + else: + self.set_header('Content-Type', 'text/html') + self.render(self.template, handler=self, data=self.load_data()) super(MLHandler, self).get(*path_args, **path_kwargs) @coroutine def post(self, *path_args, **path_kwargs): action = self.args.get('_action', ['predict']) if not set(action).issubset({'predict', 'score', 'append', 'train', 'retrain'}): - raise ValueError(f'Action {action} not supported.') + raise ValueError(f'Action(s) {action} not supported.') if len(action) == 1: action = action[0] @@ -601,7 +521,7 @@ def post(self, *path_args, **path_kwargs): if action == 'predict': prediction = yield gramex.service.threadpool.submit( self._predict, data) - self.write(json.dumps(prediction, indent=4, cls=CustomJSONEncoder)) + self.write(json.dumps(prediction, indent=2, cls=CustomJSONEncoder)) elif action == 'score': target_col = self.get_opt('target_col') if target_col is None: @@ -609,7 +529,7 @@ def post(self, *path_args, **path_kwargs): self.set_opt('target_col', target_col) score = yield gramex.service.threadpool.submit( self._predict, data, target_col, transform=False) - self.write(json.dumps({'score': score}, indent=4)) + self.write(json.dumps({'score': score}, indent=2)) elif (action == 'append') or ('append' in action): try: data = self.store_data(data, append=True) @@ -631,10 +551,7 @@ def post(self, *path_args, **path_kwargs): else: self.set_opt('target_col', target_col) - # filter columns data = self._filtercols(data) - - # filter rows data = self._filterrows(data) # assemble the pipeline @@ -651,40 +568,30 @@ def post(self, *path_args, **path_kwargs): @coroutine def put(self, *path_args, **path_kwargs): - if '_model' in self.args: - self.args.pop('_model') - mclass = self.args.pop('class', [False])[0] - if mclass: - self.set_opt('class', mclass) - else: - mclass = self.get_opt('class') - params = self.get_opt('params', {}) - if mclass is not None: - # parse the params as the signature dictates - for param in signature(search_modelclass(mclass)).parameters: - if param in self.args: - value = self.args.pop(param) - if len(value) == 1: - value = value[0] - params[param] = value - - # Since model params are changing, remove the model on disk - self.model = None - if op.exists(self.model_path): - os.remove(self.model_path) - self.set_opt('params', params) - - for opt, default in SKLEARN_DEFAULTS.items(): - if opt in self.args: - val = self.args.pop(opt) - if not isinstance(default, list): - if isinstance(val, list) and len(val) == 1: - val = val[0] - self.set_opt(opt, val) - - self.config_store.flush() - else: - self._check_model_path() + mclass = self.args.pop('class', [self.get_opt('class')])[0] + self.set_opt('class', mclass) + params = self.get_opt('params', {}) + if mclass: + # parse the params as the signature dictates + for param in signature(search_modelclass(mclass)).parameters: + if param in self.args: + value = self.args.pop(param) + if len(value) == 1: + value = value[0] + params[param] = value + # Since model params are changing, remove the model on disk + self.model = None + if op.exists(self.model_path): + os.remove(self.model_path) + self.set_opt('params', params) + for opt, default in SKLEARN_DEFAULTS.items(): + if opt in self.args: + val = self.args.pop(opt) + if not isinstance(default, list): + if isinstance(val, list) and len(val) == 1: + val = val[0] + self.set_opt(opt, val) + self.config_store.flush() @coroutine def delete(self, *path_args, **path_kwargs): diff --git a/tests/test_mlhandler.py b/tests/test_mlhandler.py index d6394d131..152bc1486 100644 --- a/tests/test_mlhandler.py +++ b/tests/test_mlhandler.py @@ -49,7 +49,7 @@ def test_append(self): data=self.df.to_json(orient='records'), headers={'Content-Type': 'application/json'}) self.assertEqual(r.status_code, OK) - df = pd.DataFrame.from_records(self.get('/mlhandler?_cache=true').json()) + df = pd.DataFrame.from_records(self.get('/mlhandler?_cache').json()) self.assertEqual(df.shape[0], 2 * self.df.shape[0]) finally: self.get('/mlhandler?_cache', method='delete') @@ -61,7 +61,7 @@ def test_append_train(self): df_train = self.df[self.df['species'] != 'virginica'] df_append = self.df[self.df['species'] == 'virginica'] - resp = self.get('/mlincr?_model&class=LogisticRegression&target_col=species', method='put') + resp = self.get('/mlincr?class=LogisticRegression&target_col=species', method='put') self.assertEqual(resp.status_code, OK) resp = self.get( @@ -96,20 +96,21 @@ def test_blank_slate(self): r = self.get('/mlblank?sepal_length=5.9&sepal_width=3&petal_length=5.1&petal_width=1.8') self.assertEqual(r.status_code, NOT_FOUND) # Post options in any order, randomly - r = self.get('/mlblank?_model&target_col=species', method='put') + r = self.get('/mlblank?target_col=species', method='put') self.assertEqual(r.status_code, OK) - r = self.get('/mlblank?_model&exclude=petal_width', method='put') + r = self.get('/mlblank?exclude=petal_width', method='put') self.assertEqual(r.status_code, OK) - r = self.get('/mlblank?_model&nums=sepal_length&nums=sepal_width&nums=petal_length', + r = self.get('/mlblank?nums=sepal_length&nums=sepal_width&nums=petal_length', method='put') self.assertEqual(r.status_code, OK) - r = self.get('/mlblank?_model&class=LogisticRegression', method='put') + r = self.get('/mlblank?class=LogisticRegression', method='put') self.assertEqual(r.status_code, OK) # check the training opts + params = self.get('/mlblank?_params').json() self.assertDictEqual( - self.get('/mlblank?_cache&_opts').json(), + params['opts'], { 'target_col': 'species', 'exclude': ['petal_width'], @@ -117,7 +118,7 @@ def test_blank_slate(self): } ) self.assertDictEqual( - self.get('/mlblank?_cache&_params').json(), + params['params'], { 'class': 'LogisticRegression', 'params': {} @@ -130,11 +131,11 @@ def test_change_model(self): try: # put a new model r = self.get( - '/mlhandler?_model&class=DecisionTreeClassifier&criterion=entropy&splitter=random', + '/mlhandler?class=DecisionTreeClassifier&criterion=entropy&splitter=random', method='put') self.assertEqual(r.status_code, OK) - r = self.get('/mlhandler?_cache&_params') - self.assertDictEqual(r.json(), { + r = self.get('/mlhandler?_params') + self.assertDictEqual(r.json()['params'], { 'class': 'DecisionTreeClassifier', 'params': { 'criterion': 'entropy', @@ -204,7 +205,7 @@ def test_filtercols(self): buff.seek(0) clf = joblib.load(op.join(folder, 'model.pkl')) try: - resp = self.get('/mlhandler?_model&class=LogisticRegression&target_col=species' + resp = self.get('/mlhandler?class=LogisticRegression&target_col=species' '&exclude=sepal_width&exclude=petal_length', method='put') resp = self.get('/mlhandler?_action=retrain', @@ -223,7 +224,7 @@ def test_filtercols(self): # Train including one column: buff.seek(0) - self.get('/mlhandler?_model&include=sepal_width', method='put') + self.get('/mlhandler?include=sepal_width', method='put') resp = self.get('/mlhandler?_action=retrain', method='post', files={'file': ('iris.csv', buff.read())}) self.assertGreaterEqual(resp.json()['score'], 0.5) @@ -267,6 +268,11 @@ def test_get_predictions(self, target_col='species'): ]) resp = self.get( '/mlhandler?sepal_width=3&petal_length=5.1&sepal_length=5.9&petal_width=1.8') + self.assertEqual(resp.json(), [ + {'sepal_length': 5.9, 'sepal_width': 3.0, + 'petal_length': 5.1, 'petal_width': 1.8, + target_col: 'virginica'} + ]) req = '/mlhandler?' samples = [] target = [] @@ -289,14 +295,16 @@ def test_get_predictions_post_file(self): pred = pd.DataFrame.from_records(resp.json())['species'] self.assertGreaterEqual(accuracy_score(target, pred), self.ACC_TOL) - def test_get_score(self): - req = '/mlhandler?_action=score&' - samples = [] - for row in self.df.sample(n=5).to_dict(orient='records'): - samples.extend([(col, value) for col, value in row.items()]) - params = '&'.join([f'{k}={v}' for k, v in samples]) - resp = self.get(req + params) - self.assertGreaterEqual(resp.json()['score'], 0.6) # NOQA: E912 + def test_get_predictions_post_json_file(self): + df = self.df.drop_duplicates() + target = df.pop('species') + buff = StringIO() + df.to_json(buff, orient='records') + buff.seek(0) + resp = self.get('/mlhandler?_action=predict', + method='post', files={'file': ('iris.json', buff)}) + pred = pd.DataFrame.from_records(resp.json())['species'] + self.assertGreaterEqual(accuracy_score(target, pred), self.ACC_TOL) def test_model_default_path(self): clf = joblib.load(op.join( @@ -320,7 +328,7 @@ def test_post_after_delete_custom_model(self): xtrain, xtest, ytrain, ytest = train_test_split(X, y, stratify=y, test_size=0.25) df = pd.DataFrame(xtrain) df['target'] = ytrain - r = self.get('/mlhandler?_model&class=GaussianNB', method='put') + r = self.get('/mlhandler?class=GaussianNB', method='put') self.assertEqual(r.status_code, OK) r = self.get('/mlhandler?_action=train&target_col=target', method='post', data=df.to_json(orient='records'), @@ -388,7 +396,7 @@ def test_retrain(self): def test_single_line_train_fetch_model(self): clf = joblib.load(op.join(folder, 'model.pkl')) try: - resp = self.get('/mlblank?_model&class=DecisionTreeClassifier&target_col=species', + resp = self.get('/mlblank?class=DecisionTreeClassifier&target_col=species', method='put') self.assertEqual(resp.status_code, OK) # train From 8317fe11376b7a430ec75cf1ee5deadf21f706c4 Mon Sep 17 00:00:00 2001 From: Jaidev Deshpande Date: Tue, 2 Mar 2021 15:24:28 +0530 Subject: [PATCH 09/28] ENH: Transfomers - model persistence --- gramex/handlers/mlhandler.py | 85 ++++++++++++++++++++++++------------ 1 file changed, 56 insertions(+), 29 deletions(-) diff --git a/gramex/handlers/mlhandler.py b/gramex/handlers/mlhandler.py index 01cda68bb..a790e45a7 100644 --- a/gramex/handlers/mlhandler.py +++ b/gramex/handlers/mlhandler.py @@ -220,10 +220,7 @@ def _parse_data(self, _cache=True): buff = BytesIO(f['body']) try: ext = re.sub('^\.', '', op.splitext(f['filename'])[-1]) - if ext == 'json': - xdf = pd.read_json(buff) - else: - xdf = cache.open_callback[ext](buff) + xdf = cache.open_callback['jsondata' if ext == 'json' else ext](buff) except KeyError: raise HTTPError(BAD_REQUEST, reason=f"File extension {ext} not supported.") dfs.append(xdf) @@ -253,10 +250,20 @@ def _coerce_transformers_opts(self): def load_transformer(cls, task, model): if model is None: model = {} - path = model.get('path', False) - if not path: - path = None + default_model_path = op.join( + gramex.config.variables['GRAMEXDATA'], 'apps', 'mlhandler', + slugify(cls.name)) + path = model.get('path', default_model_path) cls.model_path = path + # try loading from model_path + try: + _model = AutoModelForSequenceClassification.from_pretrained(cls.model_path) + _tokenizer = AutoTokenizer.from_pretrained(cls.model_path) + model = pipeline(task=task, model=_model, tokenizer=_tokenizer) + except Exception as err: + app_log.warning(f'Could not load model from {cls.model_path}.') + app_log.warning(f'{err}') + model = pipeline(task) cls.model = model @@ -348,7 +355,6 @@ def setup(cls, data=None, model=None, backend='sklearn', config_dir='', **kwargs if data is not None: # filter columns data = cls._filtercols(data) - # filter rows data = cls._filterrows(data) @@ -523,10 +529,11 @@ def post(self, *path_args, **path_kwargs): self._predict, data) self.write(json.dumps(prediction, indent=2, cls=CustomJSONEncoder)) elif action == 'score': - target_col = self.get_opt('target_col') - if target_col is None: - target_col = self.get_arg('target_col') - self.set_opt('target_col', target_col) + # target_col = self.get_opt('target_col') + # if target_col is None: + # target_col = self.get_arg('target_col') + # self.set_opt('target_col', target_col) + target_col = self.get_cached_arg('target_col') score = yield gramex.service.threadpool.submit( self._predict, data, target_col, transform=False) self.write(json.dumps({'score': score}, indent=2)) @@ -541,15 +548,16 @@ def post(self, *path_args, **path_kwargs): action = action[0] if action in ('train', 'retrain'): - target_col = self.args.get('target_col', [False])[0] - if not target_col: - older_target_col = self.get_opt('target_col', False) - if not older_target_col: - raise ValueError('target_col not specified') - else: - target_col = older_target_col - else: - self.set_opt('target_col', target_col) + # target_col = self.args.get('target_col', [False])[0] + # if not target_col: + # older_target_col = self.get_opt('target_col', False) + # if not older_target_col: + # raise ValueError('target_col not specified') + # else: + # target_col = older_target_col + # else: + # self.set_opt('target_col', target_col) + target_col = self.get_cached_arg('target_col') data = self._filtercols(data) data = self._filterrows(data) @@ -566,6 +574,13 @@ def post(self, *path_args, **path_kwargs): self.write(json.dumps({'score': self.model.score(train, target)})) super(MLHandler, self).post(*path_args, **path_kwargs) + def get_cached_arg(self, argname): + val = self.get_arg(argname, False) + if not val: + return self.get_opt(val) + self.set_opt(argname, val) + return val + @coroutine def put(self, *path_args, **path_kwargs): mclass = self.args.pop('class', [self.get_opt('class')])[0] @@ -593,16 +608,28 @@ def put(self, *path_args, **path_kwargs): self.set_opt(opt, val) self.config_store.flush() + def _delete_model(self): + pass + + def _delete_opts(self): + pass + + def _delete_cache(self): + pass + @coroutine def delete(self, *path_args, **path_kwargs): - if '_model' in self.args and op.exists(self.model_path): - os.remove(self.model_path) - self.config_store.purge() - for opt in self.get_arguments('_opts'): - if opt in SKLEARN_DEFAULTS: - self.set_opt(opt, SKLEARN_DEFAULTS[opt]) - if '_cache' in self.args: - self.store_data(pd.DataFrame()) + for item in self.get_arguments('delete', []): + getattr(self, f'_delete_{item}')(self) + + # if '_model' in self.args and op.exists(self.model_path): + # os.remove(self.model_path) + # self.config_store.purge() + # for opt in self.get_arguments('_opts'): + # if opt in SKLEARN_DEFAULTS: + # self.set_opt(opt, SKLEARN_DEFAULTS[opt]) + # if '_cache' in self.args: + # self.store_data(pd.DataFrame()) class NLPHandler(BaseMLHandler): From 2d5b2130acae9afef522b9cb5f106cb33c798398 Mon Sep 17 00:00:00 2001 From: Jaidev Deshpande Date: Wed, 3 Mar 2021 17:47:28 +0530 Subject: [PATCH 10/28] WIP --- gramex/handlers/mlhandler.py | 208 +++++++++++++++-------------------- tests/test_mlhandler.py | 24 ++-- 2 files changed, 102 insertions(+), 130 deletions(-) diff --git a/gramex/handlers/mlhandler.py b/gramex/handlers/mlhandler.py index a790e45a7..744fe96d2 100644 --- a/gramex/handlers/mlhandler.py +++ b/gramex/handlers/mlhandler.py @@ -56,6 +56,7 @@ 'cats': [], 'target_col': None, } +ACTIONS = ['predict', 'score', 'append', 'train', 'retrain'] TRANSFORMERS_DEFAULTS = dict( num_train_epochs=1, per_device_train_batch_size=16, @@ -68,6 +69,14 @@ _prediction_col = '_prediction' +def _remove(path): + if op.exists(path): + if op.isfile(path): + os.remove(path) + elif op.isdir(path): + rmtree(path) + + def _fit(model, x, y, path=None, name=None): app_log.info('Starting training...') getattr(model, 'partial_fit', model.fit)(x, y) @@ -78,10 +87,6 @@ def _fit(model, x, y, path=None, name=None): def _train_transformer(model, data, model_path, **kwargs): - # if isinstance(model, TextClassificationPipeline): - # model = model.model - # else: - # model = model enc = model.tokenizer(data['text'].tolist(), truncation=True, padding=True) labels = SENTIMENT_LENC.transform(data['label']) train_dataset = SentimentDataset(enc, labels) @@ -139,10 +144,7 @@ def is_categorical(s, num_treshold=0.1): def move_to_cpu(model): - if isinstance(model, TextClassificationPipeline): - model.model.to('cpu') - else: - model.to('cpu') + getattr(model, 'model', model).to('cpu') class BaseMLHandler(FormHandler): @@ -159,11 +161,7 @@ def setup(cls, data=None, model=None, config_dir='', **kwargs): cls.config_store = cache.JSONStore(op.join(cls.config_dir, 'config.json'), flush=None) cls.data_store = op.join(cls.config_dir, 'data.h5') - template = kwargs.pop('template', False) - if not op.isfile(template): - template = DEFAULT_TEMPLATE - cls.template = template - + cls.template = kwargs.pop('template', DEFAULT_TEMPLATE) super(BaseMLHandler, cls).setup(**kwargs) @classmethod @@ -211,7 +209,7 @@ def set_opt(cls, key, value): def _transform(self, data, **kwargs): raise NotImplementedError - def _parse_data(self, _cache=True): + def _parse_data(self, _cache=True, append=False): # First look in self.request.files if len(self.request.files) > 0: dfs = [] @@ -236,7 +234,7 @@ def _parse_data(self, _cache=True): else: data = pd.DataFrame.from_dict(parse_qs(self.request.body.decode('utf8'))) if _cache: - self.store_data(data) + self.store_data(data, append) if len(data) == 0: data = self.load_data() return data @@ -247,30 +245,30 @@ def _coerce_transformers_opts(self): return kwargs @classmethod - def load_transformer(cls, task, model): - if model is None: - model = {} + def load_transformer(cls, task, _model={}): default_model_path = op.join( gramex.config.variables['GRAMEXDATA'], 'apps', 'mlhandler', slugify(cls.name)) - path = model.get('path', default_model_path) + path = _model.get('path', default_model_path) cls.model_path = path # try loading from model_path + kwargs = {} + if task == "ner": + kwargs['grouped_entities'] = True try: - _model = AutoModelForSequenceClassification.from_pretrained(cls.model_path) - _tokenizer = AutoTokenizer.from_pretrained(cls.model_path) - model = pipeline(task=task, model=_model, tokenizer=_tokenizer) + kwargs['model'] = AutoModelForSequenceClassification.from_pretrained(cls.model_path) + kwargs['tokenizer'] = AutoTokenizer.from_pretrained(cls.model_path) except Exception as err: app_log.warning(f'Could not load model from {cls.model_path}.') app_log.warning(f'{err}') - model = pipeline(task) + model = pipeline(task, **kwargs) cls.model = model class MLHandler(BaseMLHandler): @classmethod - def setup(cls, data=None, model=None, backend='sklearn', config_dir='', **kwargs): + def setup(cls, data=None, model={}, backend='sklearn', config_dir='', **kwargs): # From filehanlder: do the following # cls.post = cls.put = cls.delete = cls.patch = cls.options = cls.get @@ -301,10 +299,6 @@ def setup(cls, data=None, model=None, backend='sklearn', config_dir='', **kwargs if data is not None: cls.store_data(data) - # parse model kwargs - if model is None: - model = {} - default_model_path = op.join( gramex.config.variables['GRAMEXDATA'], 'apps', 'mlhandler', slugify(cls.name) + '.pkl') @@ -412,7 +406,10 @@ def _get_pipeline(cls, data, force=False): to_guess = set(data.columns.tolist()) - nums.union(cats) target_col = cls.get_opt('target_col', False) if target_col: - to_guess = to_guess - {target_col} + try: + to_guess = to_guess - {target_col} + except TypeError: + app_log.critical(target_col) categoricals = [c for c in to_guess if is_categorical(data[c])] for c in categoricals: to_guess.remove(c) @@ -443,7 +440,9 @@ def _transform(self, data, **kwargs): data = self._filterrows(data) return data - def _predict(self, data, score_col=False, transform=True): + def _predict(self, data=None, score_col=False, transform=True): + if data is None: + data = self._parse_data(False) if transform: data = self._transform(data, deduplicate=False) self.model = cache.open(self.model_path, joblib.load) @@ -505,79 +504,42 @@ def get(self, *path_args, **path_kwargs): self.render(self.template, handler=self, data=self.load_data()) super(MLHandler, self).get(*path_args, **path_kwargs) - @coroutine - def post(self, *path_args, **path_kwargs): - action = self.args.get('_action', ['predict']) - if not set(action).issubset({'predict', 'score', 'append', 'train', 'retrain'}): - raise ValueError(f'Action(s) {action} not supported.') - if len(action) == 1: - action = action[0] + def _append(self): + self._parse_data(_cache=True, append=True) - if action in ('score', 'predict'): - self._check_model_path() - if action == 'retrain': - # Don't parse data from request, just train on the cached data - data = self.load_data() - else: - data = self._parse_data(False) - - if (action == 'score') & (len(data) == 0): - data = self.load_data() + def _train(self, data=None): + target_col = self.get_argument('target_col', self.get_opt('target_col')) + self.set_opt('target_col', target_col) + data = self._parse_data(False) if data is None else data + data = self._filtercols(data) + data = self._filterrows(data) + target = data[target_col] + train = data[[c for c in data if c != target_col]] + self.model = self._get_pipeline(data, force=True) + _fit(self.model, train, target, self.model_path) + return {'score': self.model.score(train, target)} + + def _retrain(self): + return self._train(self.load_data()) + + def _score(self): + self._check_model_path() + data = self._parse_data(False) + target_col = self.get_argument('target_col', self.get_opt('target_col')) + self.set_opt('target_col', target_col) + return {'score': self._predict(data, target_col, transform=False)} - if action == 'predict': - prediction = yield gramex.service.threadpool.submit( - self._predict, data) - self.write(json.dumps(prediction, indent=2, cls=CustomJSONEncoder)) - elif action == 'score': - # target_col = self.get_opt('target_col') - # if target_col is None: - # target_col = self.get_arg('target_col') - # self.set_opt('target_col', target_col) - target_col = self.get_cached_arg('target_col') - score = yield gramex.service.threadpool.submit( - self._predict, data, target_col, transform=False) - self.write(json.dumps({'score': score}, indent=2)) - elif (action == 'append') or ('append' in action): - try: - data = self.store_data(data, append=True) - except Exception as err: - raise HTTPError(BAD_REQUEST, reason=f'{err}') - if isinstance(action, list) and ('append' in action): - action.remove('append') - if len(action) == 1: - action = action[0] - - if action in ('train', 'retrain'): - # target_col = self.args.get('target_col', [False])[0] - # if not target_col: - # older_target_col = self.get_opt('target_col', False) - # if not older_target_col: - # raise ValueError('target_col not specified') - # else: - # target_col = older_target_col - # else: - # self.set_opt('target_col', target_col) - target_col = self.get_cached_arg('target_col') - - data = self._filtercols(data) - data = self._filterrows(data) - - # assemble the pipeline - if self.get_opt('pipeline', True): - self.model = self._get_pipeline(data, force=True) - # train the model - target = data[target_col] - train = data[[c for c in data if c != target_col]] - yield gramex.service.threadpool.submit( - _fit, self.model, train, target, self.model_path) - app_log.info(f'{self.name}: Model saved at {self.model_path}') - self.write(json.dumps({'score': self.model.score(train, target)})) + @coroutine + def post(self, *path_args, **path_kwargs): + action = self.args.pop('_action', ['predict'])[0] + if action not in ACTIONS: + raise ValueError(f'Action {action} not supported.') + res = yield gramex.service.threadpool.submit(getattr(self, f"_{action}")) + self.write(json.dumps(res, indent=2, cls=CustomJSONEncoder)) super(MLHandler, self).post(*path_args, **path_kwargs) def get_cached_arg(self, argname): - val = self.get_arg(argname, False) - if not val: - return self.get_opt(val) + val = self.get_arg(argname, self.get_opt(argname)) self.set_opt(argname, val) return val @@ -596,8 +558,7 @@ def put(self, *path_args, **path_kwargs): params[param] = value # Since model params are changing, remove the model on disk self.model = None - if op.exists(self.model_path): - os.remove(self.model_path) + _remove(self.model_path) self.set_opt('params', params) for opt, default in SKLEARN_DEFAULTS.items(): if opt in self.args: @@ -609,33 +570,31 @@ def put(self, *path_args, **path_kwargs): self.config_store.flush() def _delete_model(self): - pass - - def _delete_opts(self): - pass + _remove(self.model_path) + self.config_store.purge() def _delete_cache(self): - pass + self.store_data(pd.DataFrame()) + + def _delete_opts(self): + for opt in self.get_arguments('_opts'): + if opt in SKLEARN_DEFAULTS: + self.set_opt(opt, SKLEARN_DEFAULTS[opt]) @coroutine def delete(self, *path_args, **path_kwargs): - for item in self.get_arguments('delete', []): - getattr(self, f'_delete_{item}')(self) - - # if '_model' in self.args and op.exists(self.model_path): - # os.remove(self.model_path) - # self.config_store.purge() - # for opt in self.get_arguments('_opts'): - # if opt in SKLEARN_DEFAULTS: - # self.set_opt(opt, SKLEARN_DEFAULTS[opt]) - # if '_cache' in self.args: - # self.store_data(pd.DataFrame()) + for item in self.get_arguments('delete'): + try: + getattr(self, f'_delete_{item}')() + except AttributeError: + raise HTTPError(BAD_REQUEST, f'Cannot delete {item}.') class NLPHandler(BaseMLHandler): @classmethod - def setup(cls, task, model=None, config_dir='', **kwargs): + def setup(cls, task, model={}, config_dir='', **kwargs): + cls.task = task if not TRANSFORMERS_INSTALLED: raise ImportError('pip install transformers') super(NLPHandler, cls).setup(**kwargs) @@ -655,16 +614,25 @@ def post(self, *path_args, **path_kwargs): move_to_cpu(self.model) kwargs = {} if action == 'train': + if self.task == "ner": + raise HTTPError(BAD_REQUEST, + reason="Action not yet supported for task {self.task}") kwargs = self._coerce_transformers_opts() kwargs['model_path'] = self.model_path args = _train_transformer, self.model, data elif action == 'score': + if self.task == "ner": + raise HTTPError(BAD_REQUEST, + reason="Action not yet supported for task {self.task}") args = _score_transformer, self.model, data - else: + elif self.task == "sentiment-analysis": args = self.model, data['text'].tolist() - res = yield gramex.service.threadpool.submit(*args, **kwargs) + res = yield gramex.service.threadpool.submit(*args, **kwargs) + elif self.task == "ner": + res = yield gramex.service.threadpool.submit(lambda x: [self.model(k) for k in x], + data['text'].tolist()) self.write(json.dumps(res, indent=2, cls=CustomJSONEncoder)) @coroutine def delete(self, *path_args, **path_kwargs): - rmtree(self.model_path) + _remove(self.model_path) diff --git a/tests/test_mlhandler.py b/tests/test_mlhandler.py index 152bc1486..cf6f77f7e 100644 --- a/tests/test_mlhandler.py +++ b/tests/test_mlhandler.py @@ -52,7 +52,7 @@ def test_append(self): df = pd.DataFrame.from_records(self.get('/mlhandler?_cache').json()) self.assertEqual(df.shape[0], 2 * self.df.shape[0]) finally: - self.get('/mlhandler?_cache', method='delete') + self.get('/mlhandler?delete=cache', method='delete') self.get('/mlhandler?_action=append', method='post', data=self.df.to_json(orient='records'), headers={'Content-Type': 'application/json'}) @@ -65,10 +65,12 @@ def test_append_train(self): self.assertEqual(resp.status_code, OK) resp = self.get( - '/mlincr?_action=append&_action=train', method='post', + '/mlincr?_action=append', method='post', data=df_train.to_json(orient='records'), headers={'Content-Type': 'application/json'}) self.assertEqual(resp.status_code, OK) + resp = self.get('/mlincr?_action=train', method='post') + self.assertEqual(resp.status_code, OK) resp = self.get( '/mlincr?_action=score', method='post', @@ -77,10 +79,12 @@ def test_append_train(self): org_score = resp.json()['score'] resp = self.get( - '/mlincr?_action=append&_action=train', method='post', + '/mlincr?_action=append', method='post', data=df_append.to_json(orient='records'), headers={'Content-Type': 'application/json'}) self.assertEqual(resp.status_code, OK) + resp = self.get('/mlincr?_action=train', method='post') + self.assertEqual(resp.status_code, OK) resp = self.get( '/mlincr?_action=score', method='post', data=self.df.to_json(orient='records'), @@ -159,7 +163,7 @@ def test_change_model(self): def test_clear_cache(self): try: - r = self.get('/mlhandler?_cache', method='delete') + r = self.get('/mlhandler?delete=cache', method='delete') self.assertEqual(r.status_code, OK) self.assertListEqual(self.get('/mlhandler?_cache').json(), []) finally: @@ -179,7 +183,7 @@ def test_default(self): def test_delete(self): clf = joblib.load(op.join(folder, 'model.pkl')) try: - r = self.get('/mlhandler?_model', method='delete') + r = self.get('/mlhandler?delete=model', method='delete') self.assertEqual(r.status_code, OK) self.assertFalse(op.exists(op.join(folder, 'model.pkl'))) # check if the correct error message is shown @@ -232,7 +236,7 @@ def test_filtercols(self): pipe = joblib.load(op.join(folder, 'model.pkl')) self.assertEqual(pipe.named_steps['LogisticRegression'].coef_.shape, (3, 1)) finally: - self.get('/mlhandler?&_model&_opts=include&_opts=exclude', method='delete') + self.get('/mlhandler?delete=opts&_opts=include&_opts=exclude', method='delete') joblib.dump(clf, op.join(folder, 'model.pkl')) def test_get_bulk_predictions(self, target_col='species'): @@ -320,7 +324,7 @@ def test_model_default_path(self): def test_post_after_delete_custom_model(self): org_clf = joblib.load(op.join(folder, 'model.pkl')) try: - r = self.get('/mlhandler?_model', method='delete') + r = self.get('/mlhandler?delete=model', method='delete') self.assertEqual(r.status_code, OK) self.assertFalse(op.exists(op.join(folder, 'model.pkl'))) # recreate the model @@ -343,7 +347,7 @@ def test_post_after_delete_custom_model(self): def test_post_after_delete_default_model(self): clf = joblib.load(op.join(folder, 'model.pkl')) try: - r = self.get('/mlhandler?_model', method='delete') + r = self.get('/mlhandler?delete=model', method='delete') self.assertEqual(r.status_code, OK) self.assertFalse(op.exists(op.join(folder, 'model.pkl'))) # recreate the model @@ -370,7 +374,7 @@ def test_retrain(self): clf = joblib.load(op.join(folder, 'model.pkl')) try: # clear the cache - resp = self.get('/mlhandler?_cache', method='delete') + resp = self.get('/mlhandler?delete=cache', method='delete') self.assertEqual(resp.status_code, OK) resp = self.get('/mlhandler?_cache') self.assertListEqual(resp.json(), []) @@ -387,7 +391,7 @@ def test_retrain(self): self.assertGreaterEqual(resp.json()['score'], 0.6) # NOQA: E912 finally: # revert to the original cache - self.get('/mlhandler?_cache', method='delete') + self.get('/mlhandler?delete=cache', method='delete') self.get('/mlhandler?_action=append', method='post', data=self.df.to_json(orient='records'), headers={'Content-Type': 'application/json'}) From 37759399ca0efcb9c65305db78a66b1cf5de3fc7 Mon Sep 17 00:00:00 2001 From: Jaidev Deshpande Date: Fri, 5 Mar 2021 09:28:12 +0530 Subject: [PATCH 11/28] WIP --- gramex/handlers/mlhandler.py | 134 +++++++++++++++-------------------- tests/test_mlhandler.py | 2 +- 2 files changed, 60 insertions(+), 76 deletions(-) diff --git a/gramex/handlers/mlhandler.py b/gramex/handlers/mlhandler.py index 744fe96d2..206c356e4 100644 --- a/gramex/handlers/mlhandler.py +++ b/gramex/handlers/mlhandler.py @@ -1,4 +1,3 @@ -from collections import defaultdict from inspect import signature from io import BytesIO import json @@ -25,18 +24,15 @@ from tornado.gen import coroutine from tornado.web import HTTPError try: - from transformers import pipeline, TextClassificationPipeline # NOQA: F401 - from transformers import AutoModelForSequenceClassification, AutoTokenizer # NOQA: F401 + from transformers import pipeline + from transformers import AutoModelForSequenceClassification, AutoTokenizer from transformers import Trainer, TrainingArguments from gramex.dl_utils import SentimentDataset - TRANSFORMERS_INSTALLED = True except ImportError: TRANSFORMERS_INSTALLED = False op = os.path -DATA_CACHE = defaultdict(dict) -SCORES = defaultdict(list) MLCLASS_MODULES = [ 'sklearn.linear_model', 'sklearn.tree', @@ -46,7 +42,7 @@ 'sklearn.neural_network', 'sklearn.naive_bayes', ] -SKLEARN_DEFAULTS = { +TRANSFORMS = { 'include': [], 'exclude': [], 'dropna': True, @@ -66,7 +62,6 @@ ) SENTIMENT_LENC = LabelEncoder().fit(['NEGATIVE', 'POSITIVE']) DEFAULT_TEMPLATE = op.join(op.dirname(__file__), '..', 'apps', 'mlhandler', 'template.html') -_prediction_col = '_prediction' def _remove(path): @@ -113,14 +108,10 @@ def _score_transformer(model, data): def search_modelclass(mclass): - for module in MLCLASS_MODULES: + for module in MLCLASS_MODULES + [mclass]: cls = pydoc.locate(f'{module}.{mclass}') if cls: return cls - # Search with the literal path - cls = pydoc.locate(mclass) - if cls: - return cls msg = f'Model {mclass} not found. Please provide a full Python path.' raise HTTPError(NOT_FOUND, reason=msg) @@ -183,14 +174,15 @@ def load_data(cls): @classmethod def get_opt(cls, key, default=None): - if key in SKLEARN_DEFAULTS: - return cls.config_store.load('transform', {}).get(key, SKLEARN_DEFAULTS[key]) - if key in ('class', 'params'): - return cls.config_store.load('model', {}).get(key, default) + return cls.config_store.load('transform', {}).get( + key, TRANSFORMS.get( + key, cls.config_store.load('model', {}).get(key, default) + ) + ) @classmethod def set_opt(cls, key, value): - if key in SKLEARN_DEFAULTS: + if key in TRANSFORMS: transform = cls.config_store.load('transform', {}) transform[key] = value cls.config_store.dump('transform', transform) @@ -206,9 +198,6 @@ def set_opt(cls, key, value): cls.config_store.changed = True cls.config_store.flush() - def _transform(self, data, **kwargs): - raise NotImplementedError - def _parse_data(self, _cache=True, append=False): # First look in self.request.files if len(self.request.files) > 0: @@ -233,10 +222,10 @@ def _parse_data(self, _cache=True, append=False): _cache = False else: data = pd.DataFrame.from_dict(parse_qs(self.request.body.decode('utf8'))) - if _cache: - self.store_data(data, append) if len(data) == 0: data = self.load_data() + elif _cache: + self.store_data(data, append) return data def _coerce_transformers_opts(self): @@ -305,7 +294,7 @@ def setup(cls, data=None, model={}, backend='sklearn', config_dir='', **kwargs): model_path = model.pop('path', default_model_path) # store the model kwargs from gramex.yaml into the store - for key in SKLEARN_DEFAULTS: + for key in TRANSFORMS: kwarg = model.get(key, False) if not cls.get_opt(key, False) and kwarg: cls.set_opt(key, kwarg) @@ -383,10 +372,7 @@ def _filterrows(cls, data): for method in 'dropna drop_duplicates'.split(): action = cls.get_opt(method, True) if action: - if isinstance(action, list): - subset = action - else: - subset = None + subset = action if isinstance(action, list) else None data = getattr(data, method)(subset=subset) return data @@ -403,13 +389,7 @@ def _get_pipeline(cls, data, force=False): if len(both) > 0: raise HTTPError(BAD_REQUEST, reason=f"Columns {both} cannot be both numerical and categorical.") - to_guess = set(data.columns.tolist()) - nums.union(cats) - target_col = cls.get_opt('target_col', False) - if target_col: - try: - to_guess = to_guess - {target_col} - except TypeError: - app_log.critical(target_col) + to_guess = set(data.columns.tolist()) - nums.union(cats) - {cls.get_opt('target_col')} categoricals = [c for c in to_guess if is_categorical(data[c])] for c in categoricals: to_guess.remove(c) @@ -418,12 +398,10 @@ def _get_pipeline(cls, data, force=False): numericals += list(nums) assert len(set(categoricals) & set(numericals)) == 0 - steps = [] - if categoricals: - steps.append(('ohe', OneHotEncoder(sparse=False), categoricals)) - if numericals: - steps.append(('scaler', StandardScaler(), numericals)) - ct = ColumnTransformer(steps) + ct = ColumnTransformer( + [('ohe', OneHotEncoder(sparse=False), categoricals), + ('scaler', StandardScaler(), numericals)] + ) model_kwargs = cls.config_store.load('model', {}) mclass = model_kwargs.get('class', False) if mclass: @@ -440,27 +418,34 @@ def _transform(self, data, **kwargs): data = self._filterrows(data) return data - def _predict(self, data=None, score_col=False, transform=True): + def _predict(self, data=None, score_col=''): if data is None: data = self._parse_data(False) - if transform: - data = self._transform(data, deduplicate=False) + data = self._transform(data, deduplicate=False) self.model = cache.open(self.model_path, joblib.load) - if score_col and score_col in data: - target = data[score_col] - data = data.drop([score_col], axis=1) + try: + target = data.pop(score_col) return self.model.score(data, target) - # Set data in the same order as the transformer requests - data = data[self.model.named_steps['transform']._feature_names_in] - data[self.get_opt('target_col', _prediction_col)] = self.model.predict(data) - return data + except KeyError: + # Set data in the same order as the transformer requests + data = data[self.model.named_steps['transform']._feature_names_in] + data[self.get_opt('target_col', '_prediction')] = self.model.predict(data) + return data def _check_model_path(self): - if not op.exists(self.model_path): - msg = f'No model found at {self.model_path}' - raise HTTPError(NOT_FOUND, log_message=msg) - if self.model is None: + try: self.model = cache.open(self.model_path, joblib.load) + except FileNotFoundError: + raise HTTPError(NOT_FOUND, f'No model found at {self.model_path}') + + @coroutine + def prepare(self): + flattened = {} + for k, v in self.args.items(): + if not isinstance(TRANSFORMS.get(k), list) and isinstance(v, list) and len(v) == 1: + v = v[0] + flattened[k] = v + self.args = flattened @coroutine def get(self, *path_args, **path_kwargs): @@ -485,16 +470,17 @@ def get(self, *path_args, **path_kwargs): else: try: - data = pd.DataFrame.from_dict( - {k: v for k, v in self.args.items() if not k.startswith('_')}) + data_args = {k: v for k, v in self.args.items() if not k.startswith('_')} + data_args = { + k: [v] if not isinstance(v, list) else v for k, v in data_args.items() + } + data = pd.DataFrame.from_dict(data_args) except Exception as err: app_log.debug(err.msg) data = [] if len(data) > 0: self.set_header('Content-Type', 'application/json') - target_col = self.get_opt('target_col') - if target_col in data: - data = data.drop([target_col], axis=1) + data = data.drop([self.get_opt('target_col')], axis=1, errors='ignore') # if action in ('predict', 'score'): prediction = yield gramex.service.threadpool.submit( self._predict, data) @@ -527,11 +513,11 @@ def _score(self): data = self._parse_data(False) target_col = self.get_argument('target_col', self.get_opt('target_col')) self.set_opt('target_col', target_col) - return {'score': self._predict(data, target_col, transform=False)} + return {'score': self._predict(data, target_col)} @coroutine def post(self, *path_args, **path_kwargs): - action = self.args.pop('_action', ['predict'])[0] + action = self.args.pop('_action', 'predict') if action not in ACTIONS: raise ValueError(f'Action {action} not supported.') res = yield gramex.service.threadpool.submit(getattr(self, f"_{action}")) @@ -545,7 +531,7 @@ def get_cached_arg(self, argname): @coroutine def put(self, *path_args, **path_kwargs): - mclass = self.args.pop('class', [self.get_opt('class')])[0] + mclass = self.args.pop('class', self.get_opt('class')) self.set_opt('class', mclass) params = self.get_opt('params', {}) if mclass: @@ -553,20 +539,19 @@ def put(self, *path_args, **path_kwargs): for param in signature(search_modelclass(mclass)).parameters: if param in self.args: value = self.args.pop(param) - if len(value) == 1: - value = value[0] + # if len(value) == 1: + # value = value[0] params[param] = value # Since model params are changing, remove the model on disk self.model = None _remove(self.model_path) self.set_opt('params', params) - for opt, default in SKLEARN_DEFAULTS.items(): - if opt in self.args: - val = self.args.pop(opt) - if not isinstance(default, list): - if isinstance(val, list) and len(val) == 1: - val = val[0] - self.set_opt(opt, val) + for opt in TRANSFORMS.keys() & self.args.keys(): + val = self.args.pop(opt) + # if not isinstance(TRANSFORMS[opt], list): + # if isinstance(val, list) and len(val) == 1: + # val = val[0] + self.set_opt(opt, val) self.config_store.flush() def _delete_model(self): @@ -577,9 +562,8 @@ def _delete_cache(self): self.store_data(pd.DataFrame()) def _delete_opts(self): - for opt in self.get_arguments('_opts'): - if opt in SKLEARN_DEFAULTS: - self.set_opt(opt, SKLEARN_DEFAULTS[opt]) + for opt in set(self.get_arguments('_opts')) & TRANSFORMS.keys(): + self.set_opt(opt, TRANSFORMS[opt]) @coroutine def delete(self, *path_args, **path_kwargs): diff --git a/tests/test_mlhandler.py b/tests/test_mlhandler.py index cf6f77f7e..fc48580c1 100644 --- a/tests/test_mlhandler.py +++ b/tests/test_mlhandler.py @@ -91,7 +91,7 @@ def test_append_train(self): headers={'Content-Type': 'application/json'}) new_score = resp.json()['score'] # Score should improve by at least 30% - self.assertGreaterEqual(new_score - org_score, 0.3) # NOQA: E912 + self.assertGreaterEqual(new_score - org_score, 0.29) # NOQA: E912 def test_blank_slate(self): # Assert that a model doesn't have to exist From d446803e0e4cc7d486102cc70b393f776f3d039d Mon Sep 17 00:00:00 2001 From: Jaidev Deshpande Date: Tue, 9 Mar 2021 08:59:59 +0530 Subject: [PATCH 12/28] ENH: Modify gramex.install.safe_rmtree to remove files outside $GRAMEXDATA --- gramex/handlers/mlhandler.py | 19 +++++--------- gramex/install.py | 48 ++++++++++++++++++++++-------------- 2 files changed, 35 insertions(+), 32 deletions(-) diff --git a/gramex/handlers/mlhandler.py b/gramex/handlers/mlhandler.py index 206c356e4..7b434929f 100644 --- a/gramex/handlers/mlhandler.py +++ b/gramex/handlers/mlhandler.py @@ -3,7 +3,6 @@ import json import os import re -from shutil import rmtree from urllib.parse import parse_qs import gramex @@ -11,7 +10,7 @@ from gramex import data as gdata from gramex.handlers import FormHandler from gramex.http import NOT_FOUND, BAD_REQUEST -from gramex.install import _mkdir +from gramex.install import _mkdir, safe_rmtree from gramex import cache import joblib import pandas as pd @@ -64,14 +63,6 @@ DEFAULT_TEMPLATE = op.join(op.dirname(__file__), '..', 'apps', 'mlhandler', 'template.html') -def _remove(path): - if op.exists(path): - if op.isfile(path): - os.remove(path) - elif op.isdir(path): - rmtree(path) - - def _fit(model, x, y, path=None, name=None): app_log.info('Starting training...') getattr(model, 'partial_fit', model.fit)(x, y) @@ -107,6 +98,7 @@ def _score_transformer(model, data): return {'roc_auc': score} +# ToDo: Use gramex.config.locate def search_modelclass(mclass): for module in MLCLASS_MODULES + [mclass]: cls = pydoc.locate(f'{module}.{mclass}') @@ -144,6 +136,7 @@ class BaseMLHandler(FormHandler): def setup(cls, data=None, model=None, config_dir='', **kwargs): cls.slug = slugify(cls.name) # Create the config store directory + # use config_dir or DEFAULT_VALUE if not config_dir: config_dir = op.join(gramex.config.variables['GRAMEXDATA'], 'apps', 'mlhandler', cls.slug) @@ -544,7 +537,7 @@ def put(self, *path_args, **path_kwargs): params[param] = value # Since model params are changing, remove the model on disk self.model = None - _remove(self.model_path) + safe_rmtree(self.model_path, gramexdata=False) self.set_opt('params', params) for opt in TRANSFORMS.keys() & self.args.keys(): val = self.args.pop(opt) @@ -555,7 +548,7 @@ def put(self, *path_args, **path_kwargs): self.config_store.flush() def _delete_model(self): - _remove(self.model_path) + safe_rmtree(self.model_path, gramexdata=False) self.config_store.purge() def _delete_cache(self): @@ -619,4 +612,4 @@ def post(self, *path_args, **path_kwargs): @coroutine def delete(self, *path_args, **path_kwargs): - _remove(self.model_path) + safe_rmtree(self.model_path, gramexdata=False) diff --git a/gramex/install.py b/gramex/install.py index 9eb31673b..3177ceab2 100644 --- a/gramex/install.py +++ b/gramex/install.py @@ -197,32 +197,42 @@ def _ensure_remove(function, path, exc_info): raise exc_info[1] -def safe_rmtree(target, retries=100, delay=0.05): +def _try_remove(target, retries=100, delay=0.05, func=shutil.rmtree, **kwargs): + for count in range(retries): + try: + func(target, **kwargs) + except TryAgainError: + pass + # If permission is denied, e.g. antivirus, file is open, etc, keep trying with delay + except OSError: + app_log.warning(' Trying again to delete', target) + time.sleep(delay) + else: + break + + +def safe_rmtree(target, retries=100, delay=0.05, gramexdata=True): ''' - A replacement for shutil.rmtree that removes directories within $GRAMEXDATA. + A replacement for shutil.rmtree and os.remove that removes directories, + optionally within $GRAMEXDATA. It tries to remove the target multiple times, recovering from errors. ''' if not os.path.exists(target): return True # TODO: check case insensitive in Windows, but case sensitive on other OS - elif target.lower().startswith(variables['GRAMEXDATA'].lower()): - # Try multiple times to recover from errors, since we have no way of - # auto-resuming rmtree: https://bugs.python.org/issue8523 - for count in range(retries): - try: - shutil.rmtree(target, onerror=_ensure_remove) - except TryAgainError: - pass - # If permission is denied, e.g. antivirus, file is open, etc, keep trying with delay - except OSError: - app_log.warning(' Trying again to delete', target) - time.sleep(delay) - else: - break - return True + func, kwargs = (shutil.rmtree, {'onerror': _ensure_remove}) if \ + os.path.isdir(target) else (os.remove, {}) + if gramexdata: + if target.lower().startswith(variables['GRAMEXDATA'].lower()): + # Try multiple times to recover from errors, since we have no way of + # auto-resuming rmtree: https://bugs.python.org/issue8523 + _try_remove(target, retries, delay, func, **kwargs) + return True + else: + app_log.warning('Not removing directory %s (outside $GRAMEXDATA)', target) + return False else: - app_log.warning('Not removing directory %s (outside $GRAMEXDATA)', target) - return False + _try_remove(target, retries, delay, func, **kwargs) def zip_prefix_filter(members, prefix): From 80b6c27b477560c88ec391771aa6be996ed314ff Mon Sep 17 00:00:00 2001 From: Sanket Verma Date: Mon, 21 Jun 2021 17:52:04 +0530 Subject: [PATCH 13/28] Add code for sentiment analysis and remove print statements --- gramex/handlers/mlhandler.py | 78 ++++++++++++------------------------ 1 file changed, 25 insertions(+), 53 deletions(-) diff --git a/gramex/handlers/mlhandler.py b/gramex/handlers/mlhandler.py index 7b434929f..724356e42 100644 --- a/gramex/handlers/mlhandler.py +++ b/gramex/handlers/mlhandler.py @@ -27,6 +27,7 @@ from transformers import AutoModelForSequenceClassification, AutoTokenizer from transformers import Trainer, TrainingArguments from gramex.dl_utils import SentimentDataset + TRANSFORMERS_INSTALLED = True except ImportError: TRANSFORMERS_INSTALLED = False @@ -248,6 +249,7 @@ def load_transformer(cls, task, _model={}): class MLHandler(BaseMLHandler): + model_backend = "" @classmethod def setup(cls, data=None, model={}, backend='sklearn', config_dir='', **kwargs): @@ -262,13 +264,11 @@ def setup(cls, data=None, model={}, backend='sklearn', config_dir='', **kwargs): # elif backend == 'transformers': # NLPHandler.fit(**kwargs) if backend != 'sklearn': + cls.model_backend = backend if not TRANSFORMERS_INSTALLED: raise ImportError('pip install transformers') super(MLHandler, cls).setup(**kwargs) cls.load_transformer(task, model) - cls.get = NLPHandler.get - cls.post = NLPHandler.post - cls.delete = NLPHandler.delete else: super(MLHandler, cls).setup(data, model, config_dir, **kwargs) # Handle data if provided in the YAML config. @@ -450,6 +450,10 @@ def get(self, *path_args, **path_kwargs): self.write(json.dumps(params, indent=2)) elif '_cache' in self.args: self.write(self.load_data().to_json(orient='records')) + elif 'text' in self.args: + text = self.get_arguments('text') + result = yield gramex.service.threadpool.submit(self.model, text) + self.write(json.dumps(result, indent=2)) else: self._check_model_path() if '_download' in self.args: @@ -513,8 +517,24 @@ def post(self, *path_args, **path_kwargs): action = self.args.pop('_action', 'predict') if action not in ACTIONS: raise ValueError(f'Action {action} not supported.') - res = yield gramex.service.threadpool.submit(getattr(self, f"_{action}")) - self.write(json.dumps(res, indent=2, cls=CustomJSONEncoder)) + if self.model_backend == "transformers": + data = self._parse_data(_cache=False) + move_to_cpu(self.model) + kwargs = {} + if action == 'train': + kwargs = self._coerce_transformers_opts() + kwargs['model_path'] = self.model_path + print(self.model_path) + args = _train_transformer, self.model, data + elif action == 'score': + args = _score_transformer, self.model, data + elif action == 'predict': + args = self.model, data['text'].tolist() + res = yield gramex.service.threadpool.submit(*args, **kwargs) + self.write(json.dumps(res, indent=2, cls=CustomJSONEncoder)) + else: + res = yield gramex.service.threadpool.submit(getattr(self, f"_{action}")) + self.write(json.dumps(res, indent=2, cls=CustomJSONEncoder)) super(MLHandler, self).post(*path_args, **path_kwargs) def get_cached_arg(self, argname): @@ -565,51 +585,3 @@ def delete(self, *path_args, **path_kwargs): getattr(self, f'_delete_{item}')() except AttributeError: raise HTTPError(BAD_REQUEST, f'Cannot delete {item}.') - - -class NLPHandler(BaseMLHandler): - - @classmethod - def setup(cls, task, model={}, config_dir='', **kwargs): - cls.task = task - if not TRANSFORMERS_INSTALLED: - raise ImportError('pip install transformers') - super(NLPHandler, cls).setup(**kwargs) - cls.load_transformer(task, model) - - @coroutine - def get(self, *path_args, **path_kwargs): - text = self.get_arguments('text') - result = yield gramex.service.threadpool.submit(self.model, text) - self.write(json.dumps(result, indent=2)) - - @coroutine - def post(self, *path_args, **path_kwargs): - # Data should always be present as [{'text': ..., 'label': ...}, {'text': ...}] arrays - data = self._parse_data(_cache=False) - action = self.args.get('_action', ['predict'])[0] - move_to_cpu(self.model) - kwargs = {} - if action == 'train': - if self.task == "ner": - raise HTTPError(BAD_REQUEST, - reason="Action not yet supported for task {self.task}") - kwargs = self._coerce_transformers_opts() - kwargs['model_path'] = self.model_path - args = _train_transformer, self.model, data - elif action == 'score': - if self.task == "ner": - raise HTTPError(BAD_REQUEST, - reason="Action not yet supported for task {self.task}") - args = _score_transformer, self.model, data - elif self.task == "sentiment-analysis": - args = self.model, data['text'].tolist() - res = yield gramex.service.threadpool.submit(*args, **kwargs) - elif self.task == "ner": - res = yield gramex.service.threadpool.submit(lambda x: [self.model(k) for k in x], - data['text'].tolist()) - self.write(json.dumps(res, indent=2, cls=CustomJSONEncoder)) - - @coroutine - def delete(self, *path_args, **path_kwargs): - safe_rmtree(self.model_path, gramexdata=False) From 13116a0f41cb4e6a766fc69a5e93656a95469c18 Mon Sep 17 00:00:00 2001 From: Sanket Verma Date: Mon, 21 Jun 2021 19:16:52 +0530 Subject: [PATCH 14/28] Remove print statement --- gramex/handlers/mlhandler.py | 1 - 1 file changed, 1 deletion(-) diff --git a/gramex/handlers/mlhandler.py b/gramex/handlers/mlhandler.py index 724356e42..e34e767c8 100644 --- a/gramex/handlers/mlhandler.py +++ b/gramex/handlers/mlhandler.py @@ -524,7 +524,6 @@ def post(self, *path_args, **path_kwargs): if action == 'train': kwargs = self._coerce_transformers_opts() kwargs['model_path'] = self.model_path - print(self.model_path) args = _train_transformer, self.model, data elif action == 'score': args = _score_transformer, self.model, data From 7f4025843b68dfc14d26725c400a79d3abcae8fd Mon Sep 17 00:00:00 2001 From: Sanket Verma Date: Wed, 7 Jul 2021 02:41:29 +0530 Subject: [PATCH 15/28] Solve merge conflicts --- gramex/handlers/formhandler.py | 1 + gramex/handlers/mlhandler.py | 400 ++++++++++++++------------------- 2 files changed, 166 insertions(+), 235 deletions(-) diff --git a/gramex/handlers/formhandler.py b/gramex/handlers/formhandler.py index 9bf9f4af5..771693fd2 100644 --- a/gramex/handlers/formhandler.py +++ b/gramex/handlers/formhandler.py @@ -53,6 +53,7 @@ def setup(cls, **kwargs): cls.headers = conf_kwargs.pop('headers', {}) # Top level formats: key is special. Don't treat it as data cls.formats = conf_kwargs.pop('formats', {}) + cls.task = conf_kwargs.pop('task', {}) default_config = conf_kwargs.pop('default', None) # Remove other known special keys from dataset configuration cls.clear_special_keys(conf_kwargs) diff --git a/gramex/handlers/mlhandler.py b/gramex/handlers/mlhandler.py index e34e767c8..7dd68d6f4 100644 --- a/gramex/handlers/mlhandler.py +++ b/gramex/handlers/mlhandler.py @@ -3,10 +3,10 @@ import json import os import re -from urllib.parse import parse_qs import gramex -from gramex.config import app_log, CustomJSONEncoder +from gramex.transforms import build_transform +from gramex.config import app_log, CustomJSONEncoder, locate from gramex import data as gdata from gramex.handlers import FormHandler from gramex.http import NOT_FOUND, BAD_REQUEST @@ -14,14 +14,14 @@ from gramex import cache import joblib import pandas as pd -import pydoc from sklearn.compose import ColumnTransformer -from sklearn.metrics import roc_auc_score from sklearn.pipeline import Pipeline +from sklearn.metrics import roc_auc_score from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder from slugify import slugify from tornado.gen import coroutine from tornado.web import HTTPError +from sklearn.metrics import get_scorer try: from transformers import pipeline from transformers import AutoModelForSequenceClassification, AutoTokenizer @@ -31,7 +31,6 @@ except ImportError: TRANSFORMERS_INSTALLED = False - op = os.path MLCLASS_MODULES = [ 'sklearn.linear_model', @@ -46,13 +45,12 @@ 'include': [], 'exclude': [], 'dropna': True, - 'deduplicate': True, + 'drop_duplicates': True, 'pipeline': True, 'nums': [], 'cats': [], - 'target_col': None, + 'target_col': None } -ACTIONS = ['predict', 'score', 'append', 'train', 'retrain'] TRANSFORMERS_DEFAULTS = dict( num_train_epochs=1, per_device_train_batch_size=16, @@ -60,16 +58,21 @@ weight_decay=0.01, warmup_steps=100, ) -SENTIMENT_LENC = LabelEncoder().fit(['NEGATIVE', 'POSITIVE']) +ACTIONS = ['predict', 'score', 'append', 'train', 'retrain'] DEFAULT_TEMPLATE = op.join(op.dirname(__file__), '..', 'apps', 'mlhandler', 'template.html') +SENTIMENT_LENC = LabelEncoder().fit(['NEGATIVE', 'POSITIVE']) +search_modelclass = lambda x: locate(x, MLCLASS_MODULES) # NOQA: E731 def _fit(model, x, y, path=None, name=None): app_log.info('Starting training...') - getattr(model, 'partial_fit', model.fit)(x, y) - app_log.info('Done training...') - joblib.dump(model, path) - app_log.info(f'{name}: Model saved at {path}.') + try: + getattr(model, 'partial_fit', model.fit)(x, y) + app_log.info('Done training...') + joblib.dump(model, path) + app_log.info(f'{name}: Model saved at {path}.') + except Exception as exc: + app_log.exception(exc) return model @@ -99,45 +102,17 @@ def _score_transformer(model, data): return {'roc_auc': score} -# ToDo: Use gramex.config.locate -def search_modelclass(mclass): - for module in MLCLASS_MODULES + [mclass]: - cls = pydoc.locate(f'{module}.{mclass}') - if cls: - return cls - msg = f'Model {mclass} not found. Please provide a full Python path.' - raise HTTPError(NOT_FOUND, reason=msg) - - -def is_categorical(s, num_treshold=0.1): - """Check if a series contains a categorical variable. - - Parameters - ---------- - s : pd.Series - - Returns - ------- - bool: - Whether the series is categorical. - uniques / count <= num_treshold / log(count) - """ - if pd.api.types.is_numeric_dtype(s): - return s.nunique() / s.shape[0] <= num_treshold - return True - - def move_to_cpu(model): getattr(model, 'model', model).to('cpu') -class BaseMLHandler(FormHandler): +class MLHandler(FormHandler): @classmethod - def setup(cls, data=None, model=None, config_dir='', **kwargs): + def setup(cls, data=None, model={}, backend="", config_dir='', **kwargs): cls.slug = slugify(cls.name) + cls.backend = backend # Create the config store directory - # use config_dir or DEFAULT_VALUE if not config_dir: config_dir = op.join(gramex.config.variables['GRAMEXDATA'], 'apps', 'mlhandler', cls.slug) @@ -147,25 +122,91 @@ def setup(cls, data=None, model=None, config_dir='', **kwargs): cls.data_store = op.join(cls.config_dir, 'data.h5') cls.template = kwargs.pop('template', DEFAULT_TEMPLATE) - super(BaseMLHandler, cls).setup(**kwargs) + super(MLHandler, cls).setup(**kwargs) - @classmethod - def store_data(cls, df, append=False): - df.to_hdf(cls.data_store, format="table", key="data", append=append) try: - rdf = gramex.cache.open(cls.data_store, key="data") - except KeyError: - rdf = df - return rdf + if 'transform' in data: + data['transform'] = build_transform( + {'function': data['transform']}, + vars={'data': None, 'handler': None}, + filename='MLHandler:data', iter=False) + cls._built_transform = staticmethod(data['transform']) + else: + cls._built_transform = staticmethod(lambda x: x) + data = gdata.filter(**data) + cls.store_data(data) + except TypeError: + app_log.warning('MLHandler could not find training data.') + data = None + cls._built_transform = staticmethod(lambda x: x) + + if cls.backend == "transformers": + task = kwargs['task'] + if not TRANSFORMERS_INSTALLED: + raise ImportError('pip install transformers') + cls.load_transformer(task, model) + else: + default_model_path = op.join(cls.config_dir, slugify(cls.name) + '.pkl') + cls.model_path = model.pop('path', default_model_path) + + + # store the model kwargs from gramex.yaml into the store + for key in TRANSFORMS: + cls.set_opt(key, model.get(key, cls.get_opt(key))) + # Remove target_col if it appears anywhere in cats or nums + target_col = cls.get_opt('target_col') + cls.set_opt('cats', list(set(cls.get_opt('cats')) - {target_col})) + cls.set_opt('nums', list(set(cls.get_opt('nums')) - {target_col})) + + cls.set_opt('class', model.get('class')) + cls.set_opt('params', model.get('params', {})) + if op.exists(cls.model_path): # If the pkl exists, load it + cls.model = joblib.load(cls.model_path) + elif data is not None: + mclass = cls.get_opt('class', model.get('class', False)) + params = cls.get_opt('params', {}) + data = cls._filtercols(data) + data = cls._filterrows(data) + cls.model = cls._assemble_pipeline(data, mclass=mclass, params=params) + + # train the model + target = data[target_col] + train = data[[c for c in data if c != target_col]] + gramex.service.threadpool.submit( + _fit, cls.model, train, target, cls.model_path, cls.name) + cls.config_store.flush() @classmethod - def load_data(cls): + def load_data(cls, default=pd.DataFrame()): try: df = gramex.cache.open(cls.data_store, key="data") except (KeyError, FileNotFoundError): - df = pd.DataFrame() + df = default return df + @classmethod + def load_transformer(cls, task, _model={}): + default_model_path = op.join( + gramex.config.variables['GRAMEXDATA'], 'apps', 'mlhandler', + slugify(cls.name)) + path = _model.get('path', default_model_path) + cls.model_path = path + # try loading from model_path + kwargs = {} + try: + kwargs['model'] = AutoModelForSequenceClassification.from_pretrained(cls.model_path) + kwargs['tokenizer'] = AutoTokenizer.from_pretrained(cls.model_path) + except Exception as err: + app_log.warning(f'Could not load model from {cls.model_path}.') + app_log.warning(f'{err}') + model = pipeline(task, **kwargs) + cls.model = model + + @classmethod + def store_data(cls, df, append=False): + df.to_hdf(cls.data_store, format="table", key="data", append=append) + return cls.load_data(df) + @classmethod def get_opt(cls, key, default=None): return cls.config_store.load('transform', {}).get( @@ -192,33 +233,37 @@ def set_opt(cls, key, value): cls.config_store.changed = True cls.config_store.flush() - def _parse_data(self, _cache=True, append=False): - # First look in self.request.files - if len(self.request.files) > 0: - dfs = [] - for _, files in self.request.files.items(): - for f in files: - buff = BytesIO(f['body']) - try: - ext = re.sub('^\.', '', op.splitext(f['filename'])[-1]) - xdf = cache.open_callback['jsondata' if ext == 'json' else ext](buff) - except KeyError: - raise HTTPError(BAD_REQUEST, reason=f"File extension {ext} not supported.") - dfs.append(xdf) - data = pd.concat(dfs, axis=0) - # Otherwise look in request.body - else: - if self.request.headers.get('Content-Type', '') == 'application/json': + def _parse_multipart_form_data(self): + dfs = [] + for _, files in self.request.files.items(): + for f in files: + buff = BytesIO(f['body']) try: - data = pd.read_json(self.request.body.decode('utf8')) - except ValueError: - data = self.load_data() - _cache = False - else: - data = pd.DataFrame.from_dict(parse_qs(self.request.body.decode('utf8'))) - if len(data) == 0: + ext = re.sub(r'^.', '', op.splitext(f['filename'])[-1]) + xdf = cache.open_callback['jsondata' if ext == 'json' else ext](buff) + dfs.append(xdf) + except KeyError: + app_log.warning(f"File extension {ext} not supported.") + continue + return pd.concat(dfs, axis=0) + + def _parse_application_json(self): + return pd.read_json(self.request.body.decode('utf8')) + + def _parse_data(self, _cache=True, append=False): + header = self.request.headers.get('Content-Type', '').split(';')[0] + header = slugify(header).replace('-', '_') + try: + data = getattr(self, f'_parse_{header}')() + except AttributeError: + app_log.warning(f"Content-Type {header} not supported, reading cached data.") + data = self.load_data() + except ValueError: + app_log.warning('Could not read data from request, reading cached data.') data = self.load_data() - elif _cache: + data = self._built_transform(data) + + if _cache: self.store_data(data, append) return data @@ -228,168 +273,52 @@ def _coerce_transformers_opts(self): return kwargs @classmethod - def load_transformer(cls, task, _model={}): - default_model_path = op.join( - gramex.config.variables['GRAMEXDATA'], 'apps', 'mlhandler', - slugify(cls.name)) - path = _model.get('path', default_model_path) - cls.model_path = path - # try loading from model_path - kwargs = {} - if task == "ner": - kwargs['grouped_entities'] = True - try: - kwargs['model'] = AutoModelForSequenceClassification.from_pretrained(cls.model_path) - kwargs['tokenizer'] = AutoTokenizer.from_pretrained(cls.model_path) - except Exception as err: - app_log.warning(f'Could not load model from {cls.model_path}.') - app_log.warning(f'{err}') - model = pipeline(task, **kwargs) - cls.model = model - - -class MLHandler(BaseMLHandler): - model_backend = "" - - @classmethod - def setup(cls, data=None, model={}, backend='sklearn', config_dir='', **kwargs): - - # From filehanlder: do the following - # cls.post = cls.put = cls.delete = cls.patch = cls.options = cls.get - # for clnmame in CLASSES: - # setattr(cls, method) = getattr(clname, method) - task = kwargs.pop('task', False) - # if backend == 'sklearn': - # SklearnHandler.fit(**kwargs) - # elif backend == 'transformers': - # NLPHandler.fit(**kwargs) - if backend != 'sklearn': - cls.model_backend = backend - if not TRANSFORMERS_INSTALLED: - raise ImportError('pip install transformers') - super(MLHandler, cls).setup(**kwargs) - cls.load_transformer(task, model) - else: - super(MLHandler, cls).setup(data, model, config_dir, **kwargs) - # Handle data if provided in the YAML config. - if isinstance(data, str): - data = cache.open(data) - elif isinstance(data, dict): - data = gdata.filter(**data) - else: - data = None - if data is not None: - cls.store_data(data) - - default_model_path = op.join( - gramex.config.variables['GRAMEXDATA'], 'apps', 'mlhandler', - slugify(cls.name) + '.pkl') - model_path = model.pop('path', default_model_path) - - # store the model kwargs from gramex.yaml into the store - for key in TRANSFORMS: - kwarg = model.get(key, False) - if not cls.get_opt(key, False) and kwarg: - cls.set_opt(key, kwarg) - if op.exists(model_path): # If the pkl exists, load it - cls.model = joblib.load(model_path) - cls.model_path = model_path - target_col = model.get('target_col', False) - if target_col: - cls.set_opt('target_col', target_col) - else: - target_col = cls.get_opt('target_col') - else: # build the model - mclass = cls.get_opt('class', model.get('class', False)) - params = cls.get_opt('params', {}) - if not params: - params = model.get('params', {}) - if mclass: - cls.model = search_modelclass(mclass)(**params) - cls.set_opt('class', mclass) - else: - cls.model = None - # Params MUST come after class, or they will be ignored. - cls.set_opt('params', params) - - if model_path: # if a path is specified, use to to store the model - cls.model_path = model_path - else: # or create our own path - cls.model_path = default_model_path - _mkdir(op.dirname(cls.model_path)) - - # train the model - target_col = model.get('target_col', False) - if target_col: - cls.set_opt('target_col', target_col) - else: - target_col = cls.get_opt('target_col', False) - if cls.model is not None and not target_col: - app_log.warning('Target column not defined. Nothing to do.') - else: - if cls.model is not None: - if data is not None: - # filter columns - data = cls._filtercols(data) - # filter rows - data = cls._filterrows(data) - - # assemble the pipeline - if model.get('pipeline', True): - cls.model = cls._get_pipeline(data) - else: - cls.model = search_modelclass(mclass)(**params) - - # train the model - target = data[target_col] - train = data[[c for c in data if c != target_col]] - gramex.service.threadpool.submit( - _fit, cls.model, train, target, cls.model_path, cls.name) - cls.config_store.flush() - - @classmethod - def _filtercols(cls, data): - include = cls.get_opt('include', []) + def _filtercols(cls, data, **kwargs): + include = kwargs.get('include', cls.get_opt('include', [])) if include: include += [cls.get_opt('target_col')] data = data[include] else: - exclude = cls.get_opt('exclude', []) + exclude = kwargs.get('exclude', cls.get_opt('exclude', [])) to_exclude = [c for c in exclude if c in data] if to_exclude: data = data.drop(to_exclude, axis=1) return data @classmethod - def _filterrows(cls, data): + def _filterrows(cls, data, **kwargs): for method in 'dropna drop_duplicates'.split(): - action = cls.get_opt(method, True) + action = kwargs.get(method, cls.get_opt(method, True)) if action: subset = action if isinstance(action, list) else None data = getattr(data, method)(subset=subset) return data @classmethod - def _get_pipeline(cls, data, force=False): + def _assemble_pipeline(cls, data, force=False, mclass='', params=None): # If the model exists, return it if op.exists(cls.model_path) and not force: return joblib.load(cls.model_path) - # Else assemble the model - nums = set(cls.get_opt('nums', [])) - cats = set(cls.get_opt('cats', [])) + # If preprocessing is not enabled, return the root model + if not cls.get_opt('pipeline', True): + return search_modelclass(mclass)(**params) + + # Else assemble the preprocessing pipeline + nums = set(cls.get_opt('nums', [])) - {cls.get_opt('target_col')} + cats = set(cls.get_opt('cats', [])) - {cls.get_opt('target_col')} both = nums.intersection(cats) if len(both) > 0: raise HTTPError(BAD_REQUEST, reason=f"Columns {both} cannot be both numerical and categorical.") to_guess = set(data.columns.tolist()) - nums.union(cats) - {cls.get_opt('target_col')} - categoricals = [c for c in to_guess if is_categorical(data[c])] - for c in categoricals: - to_guess.remove(c) - numericals = [c for c in to_guess if pd.api.types.is_numeric_dtype(data[c])] - categoricals += list(cats) - numericals += list(nums) - assert len(set(categoricals) & set(numericals)) == 0 + numericals = list(nums) + categoricals = list(cats) + for c in to_guess: + if pd.api.types.is_numeric_dtype(data[c]): + numericals.append(c) + else: + categoricals.append(c) ct = ColumnTransformer( [('ohe', OneHotEncoder(sparse=False), categoricals), @@ -407,22 +336,29 @@ def _transform(self, data, **kwargs): orgdata = self.load_data() for col in data: data[col] = data[col].astype(orgdata[col].dtype) - data = self._filtercols(data) - data = self._filterrows(data) + data = self._filtercols(data, **kwargs) + data = self._filterrows(data, **kwargs) return data def _predict(self, data=None, score_col=''): if data is None: data = self._parse_data(False) - data = self._transform(data, deduplicate=False) + data = self._transform(data, drop_duplicates=False) self.model = cache.open(self.model_path, joblib.load) try: target = data.pop(score_col) + metric = self.get_argument('_metric', False) + if metric: + scorer = get_scorer(metric) + return scorer(self.model, data, target) return self.model.score(data, target) except KeyError: # Set data in the same order as the transformer requests - data = data[self.model.named_steps['transform']._feature_names_in] - data[self.get_opt('target_col', '_prediction')] = self.model.predict(data) + try: + data = data[self.model.named_steps['transform']._feature_names_in] + data[self.get_opt('target_col', '_prediction')] = self.model.predict(data) + except Exception as exc: + app_log.exception(exc) return data def _check_model_path(self): @@ -450,10 +386,10 @@ def get(self, *path_args, **path_kwargs): self.write(json.dumps(params, indent=2)) elif '_cache' in self.args: self.write(self.load_data().to_json(orient='records')) - elif 'text' in self.args: + elif self.backend == "transformers" and 'text' in self.args: text = self.get_arguments('text') result = yield gramex.service.threadpool.submit(self.model, text) - self.write(json.dumps(result, indent=2)) + self.write(json.dumps(result, indent=2)) else: self._check_model_path() if '_download' in self.args: @@ -478,7 +414,6 @@ def get(self, *path_args, **path_kwargs): if len(data) > 0: self.set_header('Content-Type', 'application/json') data = data.drop([self.get_opt('target_col')], axis=1, errors='ignore') - # if action in ('predict', 'score'): prediction = yield gramex.service.threadpool.submit( self._predict, data) self.write(json.dumps(prediction, indent=2, cls=CustomJSONEncoder)) @@ -498,7 +433,7 @@ def _train(self, data=None): data = self._filterrows(data) target = data[target_col] train = data[[c for c in data if c != target_col]] - self.model = self._get_pipeline(data, force=True) + self.model = self._assemble_pipeline(data, force=True) _fit(self.model, train, target, self.model_path) return {'score': self.model.score(train, target)} @@ -516,8 +451,8 @@ def _score(self): def post(self, *path_args, **path_kwargs): action = self.args.pop('_action', 'predict') if action not in ACTIONS: - raise ValueError(f'Action {action} not supported.') - if self.model_backend == "transformers": + raise HTTPError(BAD_REQUEST, f'Action {action} not supported.') + if self.backend == "transformers": data = self._parse_data(_cache=False) move_to_cpu(self.model) kwargs = {} @@ -535,6 +470,7 @@ def post(self, *path_args, **path_kwargs): res = yield gramex.service.threadpool.submit(getattr(self, f"_{action}")) self.write(json.dumps(res, indent=2, cls=CustomJSONEncoder)) super(MLHandler, self).post(*path_args, **path_kwargs) + def get_cached_arg(self, argname): val = self.get_arg(argname, self.get_opt(argname)) @@ -548,21 +484,15 @@ def put(self, *path_args, **path_kwargs): params = self.get_opt('params', {}) if mclass: # parse the params as the signature dictates - for param in signature(search_modelclass(mclass)).parameters: - if param in self.args: - value = self.args.pop(param) - # if len(value) == 1: - # value = value[0] - params[param] = value + for param in signature(search_modelclass(mclass)).parameters & self.args.keys(): + value = self.args.pop(param) + params[param] = value # Since model params are changing, remove the model on disk self.model = None safe_rmtree(self.model_path, gramexdata=False) self.set_opt('params', params) for opt in TRANSFORMS.keys() & self.args.keys(): val = self.args.pop(opt) - # if not isinstance(TRANSFORMS[opt], list): - # if isinstance(val, list) and len(val) == 1: - # val = val[0] self.set_opt(opt, val) self.config_store.flush() From dac5359ca9c02e669bb50c4dd352a1e2373f4688 Mon Sep 17 00:00:00 2001 From: Sanket Verma Date: Wed, 7 Jul 2021 03:23:45 +0530 Subject: [PATCH 16/28] Add space --- gramex/handlers/mlhandler.py | 1 + 1 file changed, 1 insertion(+) diff --git a/gramex/handlers/mlhandler.py b/gramex/handlers/mlhandler.py index 7dd68d6f4..66d7f85a7 100644 --- a/gramex/handlers/mlhandler.py +++ b/gramex/handlers/mlhandler.py @@ -51,6 +51,7 @@ 'cats': [], 'target_col': None } + TRANSFORMERS_DEFAULTS = dict( num_train_epochs=1, per_device_train_batch_size=16, From e9c6bb2a120158b301d82b5f6d88d07696953824 Mon Sep 17 00:00:00 2001 From: Sanket Verma Date: Wed, 7 Jul 2021 15:37:53 +0530 Subject: [PATCH 17/28] Add new install.py --- gramex/install.py | 79 +++++++++++++++++++++-------------------------- 1 file changed, 36 insertions(+), 43 deletions(-) diff --git a/gramex/install.py b/gramex/install.py index 3177ceab2..c1029b663 100644 --- a/gramex/install.py +++ b/gramex/install.py @@ -12,7 +12,6 @@ import shutil import datetime import requests -from glob import glob from shutilwhich import which from pathlib import Path from subprocess import Popen, check_output, CalledProcessError # nosec @@ -124,13 +123,19 @@ gramex service stop init: | - usage: gramex init [--target=DIR] + gramex init [--target=DIR] + gramex init minimal [--target=DIR] Initializes a Gramex project at the current or target dir. Specifically, it: - Sets up a git repo - - Install supporting files for a gramex project + - Install supporting files for a Gramex project from a template + - "gramex init" sets up dependencies for a local system + - "gramex init minimal" sets up minimal dependencies - Runs gramex setup (which runs yarn/npm install and other dependencies) + Options: + --target # Location to install at. Defaults to + mail: | gramex mail # Send mail named gramex mail --list # Lists all keys in config file @@ -606,37 +611,17 @@ def _mkdir(path): os.makedirs(path) -def _copy(source, target, template_data=None): - ''' - Copy single directory or file (as binary) from source to target. - Warn if target exists, or source is not file/directory, and exit. - If template_data is specified, treat source as a Tornado template. - ''' - if os.path.exists(target): - app_log.warning('Skip existing %s', target) - elif os.path.isdir(source): - _mkdir(target) - elif os.path.isfile(source): - app_log.info('Copy file %s', source) - with io.open(source, 'rb') as handle: - result = handle.read() - from mimetypes import guess_type - filetype = guess_type(source)[0] - basetype = 'text' if filetype is None else filetype.split('/')[0] - if template_data is not None: - if basetype in {'text'} or filetype in {'application/javascript'}: - result = Template(result).generate(**template_data) - with io.open(target, 'wb') as handle: - handle.write(result) - else: - app_log.warning('Skip unknown file %s', source) - - def init(args, kwargs): '''Create Gramex scaffolding files.''' - if len(args) > 1: + if len(args) > 2: app_log.error(show_usage('init')) return + if len(args) == 0: + args.append('default') + source_dir = os.path.join(variables['GRAMEXPATH'], 'apps', 'init', args[0]) + if not os.path.exists(source_dir): + app_log.error(f'Unknown init template {args[0]}') + kwargs.setdefault('target', os.getcwd()) app_log.info('Initializing Gramex project at %s', kwargs.target) data = { @@ -666,22 +651,30 @@ def init(args, kwargs): except OSError: data['git_lfs'] = None - # Copy all directories & files (as templates) - source_dir = os.path.join(variables['GRAMEXPATH'], 'apps', 'init') + # Copy all directories & files. Files with '.template.' are treated as templates. for root, dirs, files in os.walk(source_dir): + relpath = os.path.relpath(root, start=source_dir) for name in dirs + files: source = os.path.join(root, name) - relpath = os.path.relpath(root, start=source_dir) - target = os.path.join(kwargs.target, relpath, name.replace('appname', appname)) - _copy(source, target, template_data=data) - for empty_dir in ('img', 'data'): - _mkdir(os.path.join(kwargs.target, 'assets', empty_dir)) - # Copy error files as-is (not as templates) - error_dir = os.path.join(kwargs.target, 'error') - _mkdir(error_dir) - for source in glob(os.path.join(variables['GRAMEXPATH'], 'handlers', '?0?.html')): - target = os.path.join(error_dir, os.path.basename(source)) - _copy(source, target) + targetname = name.replace('$appname', appname) + template_data = None + if '.template.' in name: + targetname, template_data = name.replace('.template.', '.'), data + target = os.path.join(kwargs.target, relpath, targetname) + if os.path.exists(target): + app_log.warning('Skip existing %s', target) + elif os.path.isdir(source): + _mkdir(target) + elif os.path.isfile(source): + app_log.info('Copy file %s', source) + with io.open(source, 'rb') as handle: + result = handle.read() + if template_data is not None: + result = Template(result).generate(**template_data) + with io.open(target, 'wb') as handle: + handle.write(result) + else: + app_log.warning('Skip unknown file %s', source) run_setup(kwargs.target) From 07807c72a332c26b4cac2b66df1a0f81ff84e5c5 Mon Sep 17 00:00:00 2001 From: Sanket Verma Date: Sat, 10 Jul 2021 00:50:51 +0530 Subject: [PATCH 18/28] Remove space --- gramex/handlers/mlhandler.py | 1 - 1 file changed, 1 deletion(-) diff --git a/gramex/handlers/mlhandler.py b/gramex/handlers/mlhandler.py index 66d7f85a7..7dd68d6f4 100644 --- a/gramex/handlers/mlhandler.py +++ b/gramex/handlers/mlhandler.py @@ -51,7 +51,6 @@ 'cats': [], 'target_col': None } - TRANSFORMERS_DEFAULTS = dict( num_train_epochs=1, per_device_train_batch_size=16, From 4ee107d90d946ce6feaaaf06935b235153230410 Mon Sep 17 00:00:00 2001 From: Sanket Verma Date: Sat, 10 Jul 2021 23:38:03 +0530 Subject: [PATCH 19/28] Remove merge conflicts test_mlhandler.py --- tests/test_mlhandler.py | 62 +++++++++++++++++++++++++++++++++-------- 1 file changed, 51 insertions(+), 11 deletions(-) diff --git a/tests/test_mlhandler.py b/tests/test_mlhandler.py index fc48580c1..4008fc946 100644 --- a/tests/test_mlhandler.py +++ b/tests/test_mlhandler.py @@ -35,13 +35,21 @@ def setUpClass(cls): 'mlhandler-config/data.h5', 'mlhandler-incr/config.json', 'mlhandler-incr/data.h5', - 'mlhandler-blank.pkl', - 'mlhandler-incr.pkl', - 'mlhandler-nopath.pkl', + 'mlhandler-incr/mlhandler-incr.pkl', + 'mlhandler-xform/config.json', + 'mlhandler-xform/data.h5', + 'mlhandler-xform/mlhandler-xform.pkl', + 'mlhandler-blank/mlhandler-blank.pkl', + 'mlhandler-nopath/mlhandler-nopath.pkl', + 'mlhandler-badcol/config.json', + 'mlhandler-badcol/data.h5', + 'mlhandler-badcol/mlhandler-badcol.pkl', ]] paths += [op.join(folder, 'model.pkl')] for p in paths: tempfiles[p] = p + circles = op.join(folder, 'circles.csv') + tempfiles[circles] = circles def test_append(self): try: @@ -118,7 +126,12 @@ def test_blank_slate(self): { 'target_col': 'species', 'exclude': ['petal_width'], - 'nums': ['sepal_length', 'sepal_width', 'petal_length'] + 'nums': ['sepal_length', 'sepal_width', 'petal_length'], + 'include': [], + 'pipeline': True, + 'drop_duplicates': True, + 'dropna': True, + 'cats': [] } ) self.assertDictEqual( @@ -253,6 +266,11 @@ def test_get_bulk_score(self): data=self.df.to_json(orient='records'), headers={'Content-Type': 'application/json'}) self.assertGreaterEqual(resp.json()['score'], self.ACC_TOL) + resp = self.get( + '/mlhandler?_action=score&_metric=f1_weighted', method='post', + data=self.df.to_json(orient='records'), + headers={'Content-Type': 'application/json'}) + self.assertGreaterEqual(resp.json()['score'], self.ACC_TOL) def test_get_cache(self): df = pd.DataFrame.from_records(self.get('/mlhandler?_cache=true').json()) @@ -262,22 +280,22 @@ def test_get_model_params(self): params = self.get('/mlhandler?_model').json() self.assertDictEqual(LogisticRegression().get_params(), params) - def test_get_predictions(self, target_col='species'): + def test_get_predictions(self, root="mlhandler", target_col='species'): resp = self.get( - '/mlhandler?sepal_length=5.9&sepal_width=3&petal_length=5.1&petal_width=1.8') + f'/{root}?sepal_length=5.9&sepal_width=3&petal_length=5.1&petal_width=1.8') self.assertEqual(resp.json(), [ {'sepal_length': 5.9, 'sepal_width': 3.0, 'petal_length': 5.1, 'petal_width': 1.8, target_col: 'virginica'} ]) resp = self.get( - '/mlhandler?sepal_width=3&petal_length=5.1&sepal_length=5.9&petal_width=1.8') + f'/{root}?sepal_width=3&petal_length=5.1&sepal_length=5.9&petal_width=1.8') self.assertEqual(resp.json(), [ {'sepal_length': 5.9, 'sepal_width': 3.0, 'petal_length': 5.1, 'petal_width': 1.8, target_col: 'virginica'} ]) - req = '/mlhandler?' + req = f'/{root}?' samples = [] target = [] for row in self.df.sample(n=5).to_dict(orient='records'): @@ -299,6 +317,18 @@ def test_get_predictions_post_file(self): pred = pd.DataFrame.from_records(resp.json())['species'] self.assertGreaterEqual(accuracy_score(target, pred), self.ACC_TOL) + def test_get_predictions_duplicates(self): + df = self.df.drop_duplicates() + df = pd.concat([df, df], axis=0, ignore_index=True) + target = df.pop('species') + buff = StringIO() + df.to_csv(buff, index=False, encoding='utf8') + buff.seek(0) + resp = self.get('/mlhandler?_action=predict', + method='post', files={'file': ('iris.csv', buff)}) + pred = pd.DataFrame.from_records(resp.json())['species'] + self.assertGreaterEqual(accuracy_score(target, pred), self.ACC_TOL) + def test_get_predictions_post_json_file(self): df = self.df.drop_duplicates() target = df.pop('species') @@ -311,8 +341,9 @@ def test_get_predictions_post_json_file(self): self.assertGreaterEqual(accuracy_score(target, pred), self.ACC_TOL) def test_model_default_path(self): - clf = joblib.load(op.join( - gramex.config.variables['GRAMEXDATA'], 'apps', 'mlhandler', 'mlhandler-nopath.pkl')) + clf = joblib.load( + op.join(gramex.config.variables['GRAMEXDATA'], 'apps', 'mlhandler', + 'mlhandler-nopath', 'mlhandler-nopath.pkl')) self.assertIsInstance(clf, Pipeline) self.assertIsInstance(clf.named_steps['transform'], ColumnTransformer) self.assertIsInstance(clf.named_steps['LogisticRegression'], LogisticRegression) @@ -423,7 +454,7 @@ def test_template(self): r = self.get('/mlhandler') self.assertEqual(r.status_code, OK) # Try getting predictions - self.test_get_predictions('target') + self.test_get_predictions(target_col='target') self.test_get_bulk_predictions('target') def test_train(self): @@ -443,3 +474,12 @@ def test_train(self): # TODO: The target_col has to be reset to species for a correct teardown. # But any PUT deletes an existing model and causes subsequent tests to fail. # Find an atomic way to reset configurations. + + def test_datatransform(self): + with open(op.join(op.dirname(__file__), 'circles.csv'), 'r', encoding='utf8') as fin: + resp = self.get('/mltransform?_action=score', method='post', + files={'file': ('circles.csv', fin.read())}) + self.assertEqual(resp.json()['score'], 1) + + def test_invalid_category(self): + self.test_get_predictions('mlhandlerbadcol') From e4f59e07ad48c7c81bcbe36ac0281f5965cdf9fc Mon Sep 17 00:00:00 2001 From: Sanket Verma Date: Sun, 11 Jul 2021 03:34:12 +0530 Subject: [PATCH 20/28] Remove unnecessary space & lines --- gramex/handlers/mlhandler.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/gramex/handlers/mlhandler.py b/gramex/handlers/mlhandler.py index 00acd2395..ad94550c8 100644 --- a/gramex/handlers/mlhandler.py +++ b/gramex/handlers/mlhandler.py @@ -175,7 +175,7 @@ def setup(cls, data=None, model={}, backend="", config_dir='', **kwargs): gramex.service.threadpool.submit( _fit, cls.model, train, target, cls.model_path, cls.name) cls.config_store.flush() - + @classmethod def load_data(cls, default=pd.DataFrame()): try: @@ -471,7 +471,6 @@ def post(self, *path_args, **path_kwargs): self.write(json.dumps(res, indent=2, cls=CustomJSONEncoder)) super(MLHandler, self).post(*path_args, **path_kwargs) - def get_cached_arg(self, argname): val = self.get_arg(argname, self.get_opt(argname)) self.set_opt(argname, val) From 2f00a8e8e896554fa09f90c4f1a4e0c0d4610564 Mon Sep 17 00:00:00 2001 From: Sanket Verma Date: Tue, 13 Jul 2021 19:19:57 +0530 Subject: [PATCH 21/28] Restore formhandler.py to original version & Minor Change to mlhandler.py --- gramex/handlers/formhandler.py | 1 - gramex/handlers/mlhandler.py | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/gramex/handlers/formhandler.py b/gramex/handlers/formhandler.py index 771693fd2..9bf9f4af5 100644 --- a/gramex/handlers/formhandler.py +++ b/gramex/handlers/formhandler.py @@ -53,7 +53,6 @@ def setup(cls, **kwargs): cls.headers = conf_kwargs.pop('headers', {}) # Top level formats: key is special. Don't treat it as data cls.formats = conf_kwargs.pop('formats', {}) - cls.task = conf_kwargs.pop('task', {}) default_config = conf_kwargs.pop('default', None) # Remove other known special keys from dataset configuration cls.clear_special_keys(conf_kwargs) diff --git a/gramex/handlers/mlhandler.py b/gramex/handlers/mlhandler.py index ad94550c8..7fb8ee5d6 100644 --- a/gramex/handlers/mlhandler.py +++ b/gramex/handlers/mlhandler.py @@ -122,6 +122,7 @@ def setup(cls, data=None, model={}, backend="", config_dir='', **kwargs): cls.data_store = op.join(cls.config_dir, 'data.h5') cls.template = kwargs.pop('template', DEFAULT_TEMPLATE) + cls.task = kwargs.pop('task') super(MLHandler, cls).setup(**kwargs) try: @@ -141,10 +142,9 @@ def setup(cls, data=None, model={}, backend="", config_dir='', **kwargs): cls._built_transform = staticmethod(lambda x: x) if cls.backend == "transformers": - task = kwargs['task'] if not TRANSFORMERS_INSTALLED: raise ImportError('pip install transformers') - cls.load_transformer(task, model) + cls.load_transformer(cls.task, model) else: default_model_path = op.join(cls.config_dir, slugify(cls.name) + '.pkl') cls.model_path = model.pop('path', default_model_path) From e84a064714ba5aded67e0ab37c95bcfce3fd0ef0 Mon Sep 17 00:00:00 2001 From: Sanket Verma Date: Tue, 13 Jul 2021 19:25:54 +0530 Subject: [PATCH 22/28] Remove unused function --- gramex/handlers/mlhandler.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/gramex/handlers/mlhandler.py b/gramex/handlers/mlhandler.py index 7fb8ee5d6..82efcdc38 100644 --- a/gramex/handlers/mlhandler.py +++ b/gramex/handlers/mlhandler.py @@ -470,11 +470,6 @@ def post(self, *path_args, **path_kwargs): res = yield gramex.service.threadpool.submit(getattr(self, f"_{action}")) self.write(json.dumps(res, indent=2, cls=CustomJSONEncoder)) super(MLHandler, self).post(*path_args, **path_kwargs) - - def get_cached_arg(self, argname): - val = self.get_arg(argname, self.get_opt(argname)) - self.set_opt(argname, val) - return val def get_cached_arg(self, argname): val = self.get_arg(argname, self.get_opt(argname)) From dd3ec9c838f1b3faf2688f6528b0ebb077dc9077 Mon Sep 17 00:00:00 2001 From: Sanket Verma Date: Tue, 20 Jul 2021 20:32:22 +0530 Subject: [PATCH 23/28] Rename dl_utils.py to dl.py --- gramex/dl.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 gramex/dl.py diff --git a/gramex/dl.py b/gramex/dl.py new file mode 100644 index 000000000..064b7d157 --- /dev/null +++ b/gramex/dl.py @@ -0,0 +1,16 @@ +from torch.utils.data import Dataset +import torch + + +class SentimentDataset(Dataset): + def __init__(self, encodings, labels): + self.encodings = encodings + self.labels = labels + + def __getitem__(self, idx): + item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()} + item['labels'] = torch.tensor(self.labels[idx]).to(torch.int64) + return item + + def __len__(self): + return len(self.labels) From d6a7132ab16d205ec81edf862f2f7b4fc04ded17 Mon Sep 17 00:00:00 2001 From: Sanket Verma Date: Tue, 20 Jul 2021 20:33:55 +0530 Subject: [PATCH 24/28] Remove dl_utils.py --- gramex/dl_utils.py | 16 ---------------- 1 file changed, 16 deletions(-) delete mode 100644 gramex/dl_utils.py diff --git a/gramex/dl_utils.py b/gramex/dl_utils.py deleted file mode 100644 index 064b7d157..000000000 --- a/gramex/dl_utils.py +++ /dev/null @@ -1,16 +0,0 @@ -from torch.utils.data import Dataset -import torch - - -class SentimentDataset(Dataset): - def __init__(self, encodings, labels): - self.encodings = encodings - self.labels = labels - - def __getitem__(self, idx): - item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()} - item['labels'] = torch.tensor(self.labels[idx]).to(torch.int64) - return item - - def __len__(self): - return len(self.labels) From 2d2cfb182692c2028b848cb7bcd9bc799a5cad09 Mon Sep 17 00:00:00 2001 From: Sanket Verma Date: Tue, 20 Jul 2021 20:36:25 +0530 Subject: [PATCH 25/28] Fix imports --- gramex/handlers/mlhandler.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/gramex/handlers/mlhandler.py b/gramex/handlers/mlhandler.py index 82efcdc38..a63e21401 100644 --- a/gramex/handlers/mlhandler.py +++ b/gramex/handlers/mlhandler.py @@ -12,6 +12,7 @@ from gramex.http import NOT_FOUND, BAD_REQUEST from gramex.install import _mkdir, safe_rmtree from gramex import cache +from gramex.dl import SentimentDataset import joblib import pandas as pd from sklearn.compose import ColumnTransformer @@ -22,14 +23,9 @@ from tornado.gen import coroutine from tornado.web import HTTPError from sklearn.metrics import get_scorer -try: - from transformers import pipeline - from transformers import AutoModelForSequenceClassification, AutoTokenizer - from transformers import Trainer, TrainingArguments - from gramex.dl_utils import SentimentDataset - TRANSFORMERS_INSTALLED = True -except ImportError: - TRANSFORMERS_INSTALLED = False +from transformers import pipeline +from transformers import AutoModelForSequenceClassification, AutoTokenizer +from transformers import Trainer, TrainingArguments op = os.path MLCLASS_MODULES = [ @@ -142,8 +138,6 @@ def setup(cls, data=None, model={}, backend="", config_dir='', **kwargs): cls._built_transform = staticmethod(lambda x: x) if cls.backend == "transformers": - if not TRANSFORMERS_INSTALLED: - raise ImportError('pip install transformers') cls.load_transformer(cls.task, model) else: default_model_path = op.join(cls.config_dir, slugify(cls.name) + '.pkl') From 326b93d9cf757286c2881fdfd9c82d5c92c034b1 Mon Sep 17 00:00:00 2001 From: Sanket Verma Date: Fri, 6 Aug 2021 04:17:38 +0530 Subject: [PATCH 26/28] Add changes to support new gramex.yaml config. --- gramex/handlers/mlhandler.py | 84 +++++++++++++++++++++++++----------- 1 file changed, 59 insertions(+), 25 deletions(-) diff --git a/gramex/handlers/mlhandler.py b/gramex/handlers/mlhandler.py index a63e21401..34cf03648 100644 --- a/gramex/handlers/mlhandler.py +++ b/gramex/handlers/mlhandler.py @@ -73,7 +73,7 @@ def _fit(model, x, y, path=None, name=None): def _train_transformer(model, data, model_path, **kwargs): - enc = model.tokenizer(data['text'].tolist(), truncation=True, padding=True) + enc = model.tokenizer(data['_text'].values.tolist(), truncation=True, padding=True) labels = SENTIMENT_LENC.transform(data['label']) train_dataset = SentimentDataset(enc, labels) model_output_dir = op.join(op.dirname(model_path), 'results') @@ -83,7 +83,7 @@ def _train_transformer(model, data, model_path, **kwargs): Trainer(model=model.model, args=trargs, train_dataset=train_dataset).train() model.save_pretrained(model_path) move_to_cpu(model) - pred = model(data['text'].tolist()) + pred = model(data['_text'].values.tolist()) res = { 'roc_auc': roc_auc_score( labels, SENTIMENT_LENC.transform([c['label'] for c in pred])) @@ -92,7 +92,7 @@ def _train_transformer(model, data, model_path, **kwargs): def _score_transformer(model, data): - pred = model(data['text'].tolist()) + pred = model(data['_text'].values.tolist()) score = roc_auc_score( *map(SENTIMENT_LENC.transform, (data['label'], [c['label'] for c in pred]))) return {'roc_auc': score} @@ -107,7 +107,8 @@ class MLHandler(FormHandler): @classmethod def setup(cls, data=None, model={}, backend="", config_dir='', **kwargs): cls.slug = slugify(cls.name) - cls.backend = backend + cls.backend = model.get('backend') + cls.sentiment_df = pd.DataFrame() # Create the config store directory if not config_dir: config_dir = op.join(gramex.config.variables['GRAMEXDATA'], 'apps', 'mlhandler', @@ -118,7 +119,7 @@ def setup(cls, data=None, model={}, backend="", config_dir='', **kwargs): cls.data_store = op.join(cls.config_dir, 'data.h5') cls.template = kwargs.pop('template', DEFAULT_TEMPLATE) - cls.task = kwargs.pop('task') + cls.mclass = model.get('class') super(MLHandler, cls).setup(**kwargs) try: @@ -136,24 +137,28 @@ def setup(cls, data=None, model={}, backend="", config_dir='', **kwargs): app_log.warning('MLHandler could not find training data.') data = None cls._built_transform = staticmethod(lambda x: x) + + # store the model kwargs from gramex.yaml into the store + for key in TRANSFORMS: + cls.set_opt(key, model.get(key, cls.get_opt(key))) + # Remove target_col if it appears anywhere in cats or nums + target_col = cls.get_opt('target_col') + cls.set_opt('cats', list(set(cls.get_opt('cats')) - {target_col})) + cls.set_opt('nums', list(set(cls.get_opt('nums')) - {target_col})) - if cls.backend == "transformers": - cls.load_transformer(cls.task, model) - else: - default_model_path = op.join(cls.config_dir, slugify(cls.name) + '.pkl') - cls.model_path = model.pop('path', default_model_path) + cls.set_opt('class', model.get('class')) + cls.set_opt('params', model.get('params', {})) - # store the model kwargs from gramex.yaml into the store - for key in TRANSFORMS: - cls.set_opt(key, model.get(key, cls.get_opt(key))) - # Remove target_col if it appears anywhere in cats or nums - target_col = cls.get_opt('target_col') - cls.set_opt('cats', list(set(cls.get_opt('cats')) - {target_col})) - cls.set_opt('nums', list(set(cls.get_opt('nums')) - {target_col})) - - cls.set_opt('class', model.get('class')) - cls.set_opt('params', model.get('params', {})) + if cls.backend == "transformers": + cls.load_transformer(cls.mclass, model) + if data is not None: + data = cls._filtercols(data) + data = cls._filterrows(data) + cls._concatenate(data) + else: + cls.model_path = model.pop('path', default_model_path) + default_model_path = op.join(cls.config_dir, slugify(cls.name) + '.pkl') if op.exists(cls.model_path): # If the pkl exists, load it cls.model = joblib.load(cls.model_path) elif data is not None: @@ -183,8 +188,7 @@ def load_transformer(cls, task, _model={}): default_model_path = op.join( gramex.config.variables['GRAMEXDATA'], 'apps', 'mlhandler', slugify(cls.name)) - path = _model.get('path', default_model_path) - cls.model_path = path + cls.model_path = _model.get('path', default_model_path) # try loading from model_path kwargs = {} try: @@ -326,6 +330,29 @@ def _assemble_pipeline(cls, data, force=False, mclass='', params=None): return Pipeline([('transform', ct), (model.__class__.__name__, model)]) return cls.model + @classmethod + def _concatenate(cls, data): + cats = set(cls.get_opt('cats', [])) + for cat in cats: + if not data[cat].astype(str).all(): + raise HTTPError(BAD_REQUEST, + reason=f"Columns {cat} should contain string.") + + data.insert(0, column='_text', value='') + + for col in data: + if col in cats: + data['_text'] += data[col] + + cls.sentiment_df = data['_text'].copy() + cls.sentiment_df = cls.sentiment_df.to_frame(name='_text') + if 'label' in data.columns: + cls.sentiment_df['label'] = data['label'] + else: + app_log.error("Column: 'label' missing, training and scoring not available!") + data.drop('_text', axis=1) + cls.store_data(data) + def _transform(self, data, **kwargs): orgdata = self.load_data() for col in data: @@ -383,7 +410,7 @@ def get(self, *path_args, **path_kwargs): elif self.backend == "transformers" and 'text' in self.args: text = self.get_arguments('text') result = yield gramex.service.threadpool.submit(self.model, text) - self.write(json.dumps(result, indent=2)) + self.write(json.dumps(result, indent=2)) else: self._check_model_path() if '_download' in self.args: @@ -447,23 +474,30 @@ def post(self, *path_args, **path_kwargs): if action not in ACTIONS: raise HTTPError(BAD_REQUEST, f'Action {action} not supported.') if self.backend == "transformers": - data = self._parse_data(_cache=False) + data = self.sentiment_df move_to_cpu(self.model) kwargs = {} if action == 'train': + if 'label' not in data.columns: + raise HTTPError(BAD_REQUEST, + reason=f"Missing column named label(target values) from data.") kwargs = self._coerce_transformers_opts() kwargs['model_path'] = self.model_path args = _train_transformer, self.model, data elif action == 'score': + if 'label' not in data.columns: + raise HTTPError(BAD_REQUEST, + reason=f"Missing column named label(target values) from data.") args = _score_transformer, self.model, data elif action == 'predict': - args = self.model, data['text'].tolist() + args = self.model, data['_text'].values.tolist() res = yield gramex.service.threadpool.submit(*args, **kwargs) self.write(json.dumps(res, indent=2, cls=CustomJSONEncoder)) else: res = yield gramex.service.threadpool.submit(getattr(self, f"_{action}")) self.write(json.dumps(res, indent=2, cls=CustomJSONEncoder)) super(MLHandler, self).post(*path_args, **path_kwargs) + def get_cached_arg(self, argname): val = self.get_arg(argname, self.get_opt(argname)) From db0b20c11586ad2b767a4f989976058d87e15690 Mon Sep 17 00:00:00 2001 From: Sanket Verma Date: Fri, 6 Aug 2021 04:21:40 +0530 Subject: [PATCH 27/28] Remove spaces --- gramex/handlers/mlhandler.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/gramex/handlers/mlhandler.py b/gramex/handlers/mlhandler.py index 34cf03648..1526c7d3b 100644 --- a/gramex/handlers/mlhandler.py +++ b/gramex/handlers/mlhandler.py @@ -336,8 +336,8 @@ def _concatenate(cls, data): for cat in cats: if not data[cat].astype(str).all(): raise HTTPError(BAD_REQUEST, - reason=f"Columns {cat} should contain string.") - + reason=f"Columns {cat} should contain string.") + data.insert(0, column='_text', value='') for col in data: @@ -350,8 +350,8 @@ def _concatenate(cls, data): cls.sentiment_df['label'] = data['label'] else: app_log.error("Column: 'label' missing, training and scoring not available!") - data.drop('_text', axis=1) - cls.store_data(data) + data.drop('_text', axis=1) + cls.store_data(data) def _transform(self, data, **kwargs): orgdata = self.load_data() From 7c59db38ea3744f432fb75feb6df1de0800611f2 Mon Sep 17 00:00:00 2001 From: Sanket Verma Date: Tue, 10 Aug 2021 02:51:08 +0530 Subject: [PATCH 28/28] Add changes to run PyTorch and Hugging Face only when requested and fix flake8 errors --- gramex/handlers/mlhandler.py | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/gramex/handlers/mlhandler.py b/gramex/handlers/mlhandler.py index 1526c7d3b..4431dc86f 100644 --- a/gramex/handlers/mlhandler.py +++ b/gramex/handlers/mlhandler.py @@ -12,7 +12,6 @@ from gramex.http import NOT_FOUND, BAD_REQUEST from gramex.install import _mkdir, safe_rmtree from gramex import cache -from gramex.dl import SentimentDataset import joblib import pandas as pd from sklearn.compose import ColumnTransformer @@ -23,9 +22,6 @@ from tornado.gen import coroutine from tornado.web import HTTPError from sklearn.metrics import get_scorer -from transformers import pipeline -from transformers import AutoModelForSequenceClassification, AutoTokenizer -from transformers import Trainer, TrainingArguments op = os.path MLCLASS_MODULES = [ @@ -75,9 +71,11 @@ def _fit(model, x, y, path=None, name=None): def _train_transformer(model, data, model_path, **kwargs): enc = model.tokenizer(data['_text'].values.tolist(), truncation=True, padding=True) labels = SENTIMENT_LENC.transform(data['label']) + from gramex.dl import SentimentDataset train_dataset = SentimentDataset(enc, labels) model_output_dir = op.join(op.dirname(model_path), 'results') model_log_dir = op.join(op.dirname(model_path), 'logs') + from transformers import Trainer, TrainingArguments trargs = TrainingArguments( output_dir=model_output_dir, logging_dir=model_log_dir, **kwargs) Trainer(model=model.model, args=trargs, train_dataset=train_dataset).train() @@ -108,7 +106,7 @@ class MLHandler(FormHandler): def setup(cls, data=None, model={}, backend="", config_dir='', **kwargs): cls.slug = slugify(cls.name) cls.backend = model.get('backend') - cls.sentiment_df = pd.DataFrame() + cls.sentiment_df = pd.DataFrame() # Create the config store directory if not config_dir: config_dir = op.join(gramex.config.variables['GRAMEXDATA'], 'apps', 'mlhandler', @@ -137,7 +135,7 @@ def setup(cls, data=None, model={}, backend="", config_dir='', **kwargs): app_log.warning('MLHandler could not find training data.') data = None cls._built_transform = staticmethod(lambda x: x) - + # store the model kwargs from gramex.yaml into the store for key in TRANSFORMS: cls.set_opt(key, model.get(key, cls.get_opt(key))) @@ -149,7 +147,6 @@ def setup(cls, data=None, model={}, backend="", config_dir='', **kwargs): cls.set_opt('class', model.get('class')) cls.set_opt('params', model.get('params', {})) - if cls.backend == "transformers": cls.load_transformer(cls.mclass, model) if data is not None: @@ -157,8 +154,9 @@ def setup(cls, data=None, model={}, backend="", config_dir='', **kwargs): data = cls._filterrows(data) cls._concatenate(data) else: - cls.model_path = model.pop('path', default_model_path) default_model_path = op.join(cls.config_dir, slugify(cls.name) + '.pkl') + cls.model_path = model.pop('path', default_model_path) + if op.exists(cls.model_path): # If the pkl exists, load it cls.model = joblib.load(cls.model_path) elif data is not None: @@ -191,6 +189,8 @@ def load_transformer(cls, task, _model={}): cls.model_path = _model.get('path', default_model_path) # try loading from model_path kwargs = {} + from transformers import pipeline + from transformers import AutoModelForSequenceClassification, AutoTokenizer try: kwargs['model'] = AutoModelForSequenceClassification.from_pretrained(cls.model_path) kwargs['tokenizer'] = AutoTokenizer.from_pretrained(cls.model_path) @@ -199,7 +199,7 @@ def load_transformer(cls, task, _model={}): app_log.warning(f'{err}') model = pipeline(task, **kwargs) cls.model = model - + @classmethod def store_data(cls, df, append=False): df.to_hdf(cls.data_store, format="table", key="data", append=append) @@ -336,7 +336,7 @@ def _concatenate(cls, data): for cat in cats: if not data[cat].astype(str).all(): raise HTTPError(BAD_REQUEST, - reason=f"Columns {cat} should contain string.") + reason=f"Columns {cat} should contain string.") data.insert(0, column='_text', value='') @@ -479,15 +479,17 @@ def post(self, *path_args, **path_kwargs): kwargs = {} if action == 'train': if 'label' not in data.columns: + app_log.error("Column: 'label' missing, training and scoring not available!") raise HTTPError(BAD_REQUEST, - reason=f"Missing column named label(target values) from data.") + reason=print("Missing column named label(target values) from data.")) kwargs = self._coerce_transformers_opts() kwargs['model_path'] = self.model_path args = _train_transformer, self.model, data elif action == 'score': if 'label' not in data.columns: + app_log.error("Column: 'label' missing, training and scoring not available!") raise HTTPError(BAD_REQUEST, - reason=f"Missing column named label(target values) from data.") + reason=print("Missing column named label(target values) from data.")) args = _score_transformer, self.model, data elif action == 'predict': args = self.model, data['_text'].values.tolist() @@ -495,9 +497,8 @@ def post(self, *path_args, **path_kwargs): self.write(json.dumps(res, indent=2, cls=CustomJSONEncoder)) else: res = yield gramex.service.threadpool.submit(getattr(self, f"_{action}")) - self.write(json.dumps(res, indent=2, cls=CustomJSONEncoder)) + self.write(json.dumps(res, indent=2, cls=CustomJSONEncoder)) super(MLHandler, self).post(*path_args, **path_kwargs) - def get_cached_arg(self, argname): val = self.get_arg(argname, self.get_opt(argname))