# utils.py

import tensorflow as tf
from tensorflow.python import pywrap_tensorflow
from tensorflow.python.platform import tf_logging as logging
from tensorflow.python.training import saver as tf_saver
import time
import os


def write_results(acc, tag, params, fpath=None, results_dir='results'):
    """Append one result line to a results file.

    Args:
      acc: Accuracy value to record.
      tag: Label identifying the run.
      params: Parameters of the run; written using their string form.
      fpath: Path of the file to append to. If falsy, `results.txt` inside
        `results_dir` is used.
      results_dir: Directory holding the default results file. It is created
        (including parents) when `fpath` is not given and it does not exist.
    """
    if not fpath:
        # exist_ok avoids the check-then-create race when several processes
        # report into the same directory at the same time.
        os.makedirs(results_dir, exist_ok=True)
        fpath = os.path.join(results_dir, 'results.txt')
    with open(fpath, 'a') as f:
        f.write('Acc:{} Tag:{} Params: {} \n'.format(acc, tag, params))


def average_gradients(tower_grads):
    """Calculate the average gradient for each shared variable across all towers.

    Note that this function provides a synchronization point across all towers.

    Args:
      tower_grads: List of lists of (gradient, variable) tuples. The outer list
        is over towers; each inner list holds the (gradient, variable) pairs
        of one tower, in the same variable order for every tower.
    Returns:
       List of pairs of (gradient, variable) where the gradient has been averaged
       across the towers that produced one. A variable for which every tower's
       gradient is None is returned as (None, variable).
    """
    average_grads = []
    for grad_and_vars in zip(*tower_grads):
        # Note that each grad_and_vars looks like the following:
        #   ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))

        # The Variables are redundant because they are shared across towers,
        # so the first tower's pointer to the Variable stands for all of them.
        v = grad_and_vars[0][1]

        # Add a leading 'tower' dimension to every gradient. Entries may be
        # None (e.g. a variable that received no gradient); those cannot be
        # turned into tensors, so they are left out of the average.
        grads = [tf.expand_dims(g, 0) for g, _ in grad_and_vars
                 if g is not None]
        if not grads:
            # No tower produced a gradient: pass the pair through unchanged
            # instead of failing on an empty concat.
            average_grads.append((None, v))
            continue

        # Average over the 'tower' dimension.
        grad = tf.concat(axis=0, values=grads)
        grad = tf.reduce_mean(grad, 0)
        average_grads.append((grad, v))
    return average_grads


def montage_tf(imgs, num_h, num_w):
    """Tiles a batch of images into a single grid for use in image_summaries.

    Args:
        imgs: Tensor of images; it is unstacked into individual images
        num_h: Number of grid rows (images per column)
        num_w: Number of images per row

    Returns:
        The grid built from the first num_h*num_w images, with an extra
        leading dimension of size 1
    """
    tiles = tf.unstack(imgs)
    # Each row joins num_w consecutive images along axis 1.
    rows = [
        tf.concat(axis=1, values=tiles[row * num_w:(row + 1) * num_w])
        for row in range(num_h)
    ]
    # The rows are then joined along axis 0 to form the full grid.
    grid = tf.concat(axis=0, values=rows)
    return tf.expand_dims(grid, 0)


def remove_missing(var_list, model_path):
    """Keeps only the variables that have a tensor in the given checkpoint.

    Args:
        var_list: Either a dictionary mapping checkpoint names to variables, or
          an iterable of variables (their `op.name` is used as the name).
        model_path: Path of the checkpoint to inspect.

    Returns:
        A dictionary mapping names to variables, restricted to the names the
        checkpoint contains. A warning is logged for every dropped name.
    """
    reader = pywrap_tensorflow.NewCheckpointReader(model_path)

    if isinstance(var_list, dict):
        name_to_var = var_list
    else:
        name_to_var = {v.op.name: v for v in var_list}

    present = {}
    for name, variable in name_to_var.items():
        if not reader.has_tensor(name):
            logging.warning(
                'Variable %s missing in checkpoint %s', name, model_path)
            continue
        present[name] = variable
    return present


def assign_from_checkpoint_fn(model_path, var_list, ignore_missing_vars=False,
                              reshape_variables=False):
    """Builds a function that restores the given variables from a checkpoint.

    Args:
        model_path: The full path to the model checkpoint. To get latest checkpoint
          use `model_path = tf.train.latest_checkpoint(checkpoint_dir)`
        var_list: A list of `Variable` objects or a dictionary mapping names in the
          checkpoint to the corresponding variables to initialize. It is handed
          to the `Saver` constructor (after filtering, see below).
        ignore_missing_vars: Boolean, if True the variables that are missing in
          the checkpoint are dropped with a warning before the `Saver` is built.
        reshape_variables: Boolean, forwarded to the `Saver` as its `reshape`
          argument.

    Returns:
        A function that takes a single argument, a `tf.Session`, and restores
        the variables from `model_path` into it.
    """
    restorable = (remove_missing(var_list, model_path)
                  if ignore_missing_vars else var_list)

    # The Saver is created once here; the returned function only runs restore.
    saver = tf_saver.Saver(restorable, reshape=reshape_variables)

    def restore_fn(session):
        saver.restore(session, model_path)

    return restore_fn


def get_variables_to_train(trainable_scopes=None):
    """Returns a list of variables to train.

    Args:
      trainable_scopes: Optional comma-separated string of scope names. If
        None, all trainable variables are returned; otherwise the trainable
        variables collected for each listed scope, in the order given.
    Returns:
      A list of variables to train by the optimizer.
    """
    if trainable_scopes is None:
        # Use the compat.v1 endpoint, like the scoped branch below, so both
        # paths work on the same TensorFlow versions.
        variables_to_train = tf.compat.v1.trainable_variables()
    else:
        scopes = [scope.strip() for scope in trainable_scopes.split(',')]

        variables_to_train = []
        for scope in scopes:
            variables = tf.compat.v1.get_collection(
                tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES, scope)
            variables_to_train.extend(variables)

    print('Variables to train: {}'.format([v.op.name for v in variables_to_train]))

    return variables_to_train


def get_checkpoint_path(checkpoint_dir):
    """Returns the model checkpoint path recorded for a checkpoint directory.

    Args:
      checkpoint_dir: Directory whose checkpoint state is looked up.
    Returns:
      The `model_checkpoint_path` of the checkpoint state, or None (after
      printing a message) when no checkpoint state is available.
    """
    state = tf.train.get_checkpoint_state(checkpoint_dir)
    if state:
        return state.model_checkpoint_path
    print("No checkpoint in {}".format(checkpoint_dir))
    return None


def wait_for_new_checkpoint(checkpoint_dir,
                            last_checkpoint=None,
                            seconds_to_sleep=1,
                            timeout=None):
    """Waits until a new checkpoint file is found.

    Args:
      checkpoint_dir: The directory in which checkpoints are saved.
      last_checkpoint: The last checkpoint path used or `None` if we're expecting
        a checkpoint for the first time.
      seconds_to_sleep: The number of seconds to sleep for before looking for a
        new checkpoint.
      timeout: The maximum amount of time to wait. If left as `None`, then the
        process will wait indefinitely.
    Returns:
      a new checkpoint path (the checkpoint's file name joined onto
      `checkpoint_dir`), or None if the timeout was reached.
    """
    logging.info('Waiting for new checkpoint at %s', checkpoint_dir)
    stop_time = time.time() + timeout if timeout is not None else None
    while True:
        ckpt = tf.train.get_checkpoint_state(checkpoint_dir)
        if ckpt is None:
            checkpoint_path = None
        else:
            # Keep only the file name recorded in the checkpoint state and
            # re-anchor it under checkpoint_dir. os.path.basename honours the
            # platform's path separators, unlike splitting on '/'.
            ckpt_id = os.path.basename(ckpt.model_checkpoint_path)
            checkpoint_path = os.path.join(checkpoint_dir, ckpt_id)

        if checkpoint_path is None or checkpoint_path == last_checkpoint:
            # Give up early when the next sleep would run past the deadline.
            if stop_time is not None and time.time() + seconds_to_sleep > stop_time:
                return None
            time.sleep(seconds_to_sleep)
        else:
            logging.info('Found new checkpoint at %s', checkpoint_path)

            return checkpoint_path