python/mxnet/model.py

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

# pylint: disable=fixme, invalid-name, too-many-arguments, too-many-locals, too-many-lines
# pylint: disable=too-many-branches, too-many-statements
"""MXNet model module"""

import os
import logging
from collections import namedtuple
import numpy as np

from . import ndarray as nd
from . import symbol as sym
from . import kvstore as kvs
from .device import cpu

BASE_ESTIMATOR = object

try:
    from sklearn.base import BaseEstimator
    BASE_ESTIMATOR = BaseEstimator
except ImportError:
    SKLEARN_INSTALLED = False

# Parameter to pass to batch_end_callback
BatchEndParam = namedtuple('BatchEndParams',
                           ['epoch',
                            'nbatch',
                            'eval_metric',
                            'locals'])

def _create_sparse_kvstore(kvstore):
    """Create kvstore assuming some parameters' storage types are row_sparse.

    Parameters
    ----------
    kvstore : KVStore or str
        The kvstore.

    Returns
    -------
    kvstore : KVStore
    update_on_kvstore : bool. Always True.
    """
    # always update on kvstore
    if isinstance(kvstore, kvs.KVStore):
        kv = kvstore
    elif isinstance(kvstore, str):
        kv = kvs.create(kvstore)
    else:
        raise TypeError(f"Cannot create '{kvstore}' KVStore with row_sparse parameters. "
                        "The type must be KVStore or str.")
    assert kv.is_capable(kvs.KVStoreBase.OPTIMIZER), \
        "KVStore with sparse weight requires optimizer support. " \
        "However, type(kv) does not support optimizer. " \
        "Please consider other kvstore backends (e.g. dist_device) instead."
    return (kv, True)

def _create_kvstore(kvstore, num_device, arg_params):
    """Create kvstore
    This function select and create a proper kvstore if given the kvstore type.

    Parameters
    ----------
    kvstore : KVStore or str
        The kvstore.
    num_device : int
        The number of devices
    arg_params : dict of str to `NDArray`.
        Model parameter, dict of name to `NDArray` of net's weights.
    """
    update_on_kvstore = bool(int(os.getenv('MXNET_UPDATE_ON_KVSTORE', "1")))
    if kvstore is None:
        kv = None
    elif isinstance(kvstore, kvs.KVStoreBase):
        kv = kvstore
    elif isinstance(kvstore, str):
        # create kvstore using the string type
        if num_device == 1 and 'dist' not in kvstore:
            # no need to use kv for single device and single machine
            kv = None
        else:
            kv = kvs.create(kvstore)
            if kvstore == 'local':
                # automatically select a proper local
                max_size = max(np.prod(param.shape) for param in
                               arg_params.values())
                if max_size > 1024 * 1024 * 16:
                    update_on_kvstore = False
    else:
        raise TypeError('kvstore must be KVStore, str or None')

    if kv is None:
        update_on_kvstore = False
    else:
        update_on_kvstore &= kv.is_capable(kvs.KVStoreBase.OPTIMIZER)

    return (kv, update_on_kvstore)

def _initialize_kvstore(kvstore, param_arrays, arg_params, param_names, update_on_kvstore):
    """Initialize kvstore"""
    for idx, param_on_devs in enumerate(param_arrays):
        name = param_names[idx]
        if not update_on_kvstore or arg_params[name].stype != 'default':
            kvstore.init(name, arg_params[name])
        else:
            kvstore.broadcast(name, arg_params[name], out=param_on_devs)

def _update_params_on_kvstore_nccl(param_arrays, grad_arrays, kvstore, param_names):
    """Perform update of param_arrays from grad_arrays on NCCL kvstore."""
    valid_indices = [index for index, grad_list in
                     enumerate(grad_arrays) if grad_list[0] is not None]
    valid_grad_arrays = [grad_arrays[i] for i in valid_indices]
    valid_param_arrays = [param_arrays[i] for i in valid_indices]
    valid_param_names = [param_names[i] for i in valid_indices]
    size = len(valid_grad_arrays)
    start = 0
    # Use aggregation by default only with NCCL
    default_batch = '16'
    batch = int(os.getenv('MXNET_UPDATE_AGGREGATION_SIZE', default_batch))
    while start < size:
        end = start + batch if start + batch < size else size
        # push gradient, priority is negative index
        # pull back the weights
        kvstore.pushpull(valid_param_names[start:end], valid_grad_arrays[start:end],
                         out=valid_param_arrays[start:end], priority=-start)
        start = end

def _update_params_on_kvstore(param_arrays, grad_arrays, kvstore, param_names):
    """Perform update of param_arrays from grad_arrays on kvstore."""
    for index, pair in enumerate(zip(param_arrays, grad_arrays)):
        arg_list, grad_list = pair
        if grad_list[0] is None:
            continue
        name = param_names[index]
        # push gradient, priority is negative index
        # pull back the weights
        if grad_list[0].stype == 'default' and arg_list[0].stype == 'default':
            kvstore.pushpull(name, grad_list, out=arg_list, priority=-index)
        else:
            kvstore.push(name, grad_list, priority=-index)
            kvstore.pull(name, out=arg_list, priority=-index)

def _update_params(param_arrays, grad_arrays, updater, num_device,
                   kvstore=None, param_names=None):
    """Perform update of param_arrays from grad_arrays not on kvstore."""
    updates = [[] for _ in range(num_device)]
    for i, pair in enumerate(zip(param_arrays, grad_arrays)):
        arg_list, grad_list = pair
        if grad_list[0] is None:
            continue
        index = i
        if kvstore:
            name = param_names[index]
            # push gradient, priority is negative index
            if grad_list[0].stype == 'default' and arg_list[0].stype == 'default':
                kvstore.pushpull(name, grad_list, priority=-index)
            else:
                kvstore.push(name, grad_list, priority=-index)
                kvstore.pull(name, out=grad_list, priority=-index)
        for k, p in enumerate(zip(arg_list, grad_list)):
            # faked an index here, to make optimizer create diff
            # state for the same index but on diff devs, TODO(mli)
            # use a better solution later
            w, g = p
            updates[k].append((index*num_device+k, g, w))
    for dev_updates in updates:
        # update params if param_arrays and grad_arrays are not empty
        if dev_updates:
            i, w, g = zip(*dev_updates)
            updater(i, w, g)


def save_checkpoint(prefix, epoch, symbol, arg_params, aux_params, remove_amp_cast=True):
    """Checkpoint the model data into file.

    Parameters
    ----------
    prefix : str
        Prefix of model name.
    epoch : int
        The epoch number of the model.
    symbol : Symbol
        The input Symbol.
    arg_params : dict of str to NDArray
        Model parameter, dict of name to NDArray of net's weights.
    aux_params : dict of str to NDArray
        Model parameter, dict of name to NDArray of net's auxiliary states.
    remove_amp_cast : bool, optional
        Whether to remove the amp_cast and amp_multicast operators, before saving the model.
    Notes
    -----
    - ``prefix-symbol.json`` will be saved for symbol.
    - ``prefix-epoch.params`` will be saved for parameters.
    """
    if symbol is not None:
        symbol.save(f'{prefix}-symbol.json', remove_amp_cast=remove_amp_cast)

    save_dict = {(f'arg:{k}') : v.as_in_context(cpu()) for k, v in arg_params.items()}
    save_dict.update({(f'aux:{k}') : v.as_in_context(cpu()) for k, v in aux_params.items()})
    param_name = f'{prefix}-{epoch:04}.params'
    nd.save(param_name, save_dict)
    logging.info('Saved checkpoint to "{}"'.format(param_name))


def load_params(prefix, epoch):
    """Load params from a file
    """
    save_dict = nd.load(f'{prefix}-{epoch:04}.params')
    arg_params = {}
    aux_params = {}
    if not save_dict:
        logging.warning("Params file '%s' is empty", f'{prefix}-{epoch:04}.params')
        return (arg_params, aux_params)
    for k, v in save_dict.items():
        tp, name = k.split(":", 1)
        if tp == "arg":
            arg_params[name] = v
        if tp == "aux":
            aux_params[name] = v
    return (arg_params, aux_params)

def load_checkpoint(prefix, epoch):
    """Load model checkpoint from file.

    Parameters
    ----------
    prefix : str
        Prefix of model name.
    epoch : int
        Epoch number of model we would like to load.

    Returns
    -------
    symbol : Symbol
        The symbol configuration of computation network.
    arg_params : dict of str to NDArray
        Model parameter, dict of name to NDArray of net's weights.
    aux_params : dict of str to NDArray
        Model parameter, dict of name to NDArray of net's auxiliary states.

    Notes
    -----
    - Symbol will be loaded from ``prefix-symbol.json``.
    - Parameters will be loaded from ``prefix-epoch.params``.
    """
    symbol = sym.load(f'{prefix}-symbol.json')
    arg_params, aux_params = load_params(prefix, epoch)
    return (symbol, arg_params, aux_params)