Add optimization_flop.py
Arexh committed Apr 28, 2020
1 parent 984c36b commit 063d6ae
Showing 6 changed files with 288 additions and 98 deletions.
5 changes: 2 additions & 3 deletions flop/factorize.py
@@ -1,7 +1,6 @@
from tensorflow.python import pywrap_tensorflow
import tensorflow as tf
import numpy as np
import factorize
import copy
import os
import re
@@ -43,13 +42,13 @@ def save_factorized_model(bert_config_file, init_checkpoint, output_dir):
tvar_names.append(var.name)
for key in var_to_shape_map:
if re.match(bias_pattern, key):
q = factorize.bias_map(key)
q = bias_map(key)
q_var = [v for v in tvar if v.name == q][0]
tf.logging.info("Tensor: %s %s", q, "*INIT_FROM_CKPT*")
sess.run(tf.assign(q_var, reader.get_tensor(key)))
tvar_names.remove(q)
elif re.match(kernel_pattern, key):
p, q = factorize.kernel_map(key)
p, q = kernel_map(key)
p_var = [v for v in tvar if v.name == p][0]
q_var = [v for v in tvar if v.name == q][0]
u, s, v = np.linalg.svd(reader.get_tensor(key))
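Note on the factorize.py hunk: only the np.linalg.svd call is visible; how the two factors are assigned to p_var and q_var is cut off. A minimal sketch of an SVD split with W ≈ P @ Q, assuming the singular values are shared between the factors (the helper name svd_split and the optional rank truncation are illustrative, not part of this commit):

import numpy as np

def svd_split(weight, rank=None):
    # Factor a 2-D kernel W into P, Q with W ~= P @ Q, splitting
    # sqrt(S) between the two factors.
    u, s, vh = np.linalg.svd(weight, full_matrices=False)
    if rank is not None:  # optional truncation to a target rank
        u, s, vh = u[:, :rank], s[:rank], vh[:rank, :]
    sqrt_s = np.sqrt(s)
    p = u * sqrt_s            # [in_dim, rank]
    q = sqrt_s[:, None] * vh  # [rank, out_dim]
    return p, q

# e.g. p_init, q_init = svd_split(reader.get_tensor(key)), followed by
# sess.run(tf.assign(p_var, p_init)) and the same for q_var.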
66 changes: 18 additions & 48 deletions flop/layers.py
@@ -20,11 +20,12 @@
from __future__ import print_function

import tensorflow as tf
import math

import common
import nn
from tensorflow.python.layers import base # pylint: disable=g-direct-tensorflow-import
from tensorflow.contrib.layers.python.layers import utils as layer_utils
# from tensorflow.contrib.layers.python.layers import utils as layer_utils
from tensorflow.python.ops import variables as tf_variables # pylint: disable=g-direct-tensorflow-import


@@ -35,13 +36,8 @@ class FlopFullyConnected(base.Layer):
"""Base implementation of a fully connected layer with FLOP.
Args:
x: Input, float32 tensor.
num_outputs: Int representing size of output tensor.
activation: If None, a linear activation is used.
bias_initializer: Initalizer of the bias vector.
bias_regularizer: Optional regularizer for the bias vector.
log_alpha_initializer: Specified initializer of the log_alpha term.
is_training: Boolean specifying whether it is training or eval.
use_bias: Boolean specifying whether bias vector should be used.
eps: Small epsilon value to prevent math op saturation.
beta: The beta parameter, which controls the "temperature" of
the distribution. Defaults to 2/3 from the above paper.
@@ -55,33 +51,25 @@
"""

def __init__(self,
num_outputs,
activation,
bias_initializer,
bias_regularizer,
log_alpha_initializer,
activity_regularizer=None,
is_training=True,
trainable=True,
use_bias=True,
eps=common.EPSILON,
beta=common.BETA,
limit_l=common.LIMIT_L,
limit_r=common.LIMIT_R,
init_mean=0.5,
init_std=0.01,
eps=1e-6,
beta=1.0,
limit_l=-0.1,
limit_r=1.1,
name="flop_mask",
**kwargs):
super(FlopFullyConnected, self).__init__(
trainable=trainable,
name=name,
activity_regularizer=activity_regularizer,
**kwargs)
self.num_outputs = num_outputs
self.activation = activation
self.bias_initializer = bias_initializer
self.bias_regularizer = bias_regularizer
self.log_alpha_initializer = log_alpha_initializer
self.is_training = is_training
self.use_bias = use_bias
self.init_mean = init_mean
self.init_std = init_std
self.eps = eps
self.beta = beta
self.limit_l = limit_l
@@ -90,37 +78,23 @@ def __init__(self,
def build(self, input_shape):
input_shape = input_shape.as_list()

assert input_shape[0] == input_shape[1]

input_hidden_size = input_shape[1]
diag_size = input_shape[0]

if not self.log_alpha_initializer:
# default log alpha set s.t. \alpha / (\alpha + 1) = .1
self.log_alpha_initializer = tf.random_normal_initializer(
mean=2.197, stddev=0.01, dtype=self.dtype)
mean = math.log(1 - self.init_mean) - math.log(self.init_mean)
self.log_alpha_initializer = tf.random_normal_initializer(
mean=mean, stddev=self.init_std, dtype=self.dtype)

self.log_alpha = tf.get_variable(
"log_alpha",
shape=diag_size,
shape=input_hidden_size,
initializer=self.log_alpha_initializer,
dtype=self.dtype,
trainable=True)

layer_utils.add_variable_to_collection(
self.log_alpha,
[THETA_LOGALPHA_COLLECTION], None)

if self.use_bias:
self.bias = self.add_variable(
name="bias",
shape=(self.num_outputs,),
initializer=self.bias_initializer,
regularizer=self.bias_regularizer,
trainable=True,
dtype=self.dtype)
else:
self.bias = None
# layer_utils.add_variable_to_collection(
# self.log_alpha,
# [THETA_LOGALPHA_COLLECTION], None)

self.built = True

def call(self, inputs):
@@ -139,10 +113,6 @@ def call(self, inputs):
limit_l=self.limit_l,
limit_r=self.limit_r)

if self.use_bias:
x = tf.nn.bias_add(x, self.bias)
if self.activation is not None:
return self.activation(x)
return x


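Note on the FlopFullyConnected layer: the call() body is truncated above and forwards eps, beta, limit_l and limit_r to a helper in nn.py that this diff does not show. For reference, a sketch of the standard hard concrete gate (the L0-regularization parameterization of Louizos et al., which the defaults beta=1.0, limit_l=-0.1, limit_r=1.1 suggest); the helper names and the sign convention on log_alpha are assumptions:

import tensorflow as tf

def hard_concrete_sample(log_alpha, beta=1.0, limit_l=-0.1, limit_r=1.1, eps=1e-6):
    # Stochastic gate in [0, 1], one value per entry of log_alpha (training path).
    u = tf.random_uniform(tf.shape(log_alpha), minval=eps, maxval=1.0 - eps)
    s = tf.sigmoid((tf.log(u) - tf.log(1.0 - u) + log_alpha) / beta)
    s_bar = s * (limit_r - limit_l) + limit_l  # stretch to (limit_l, limit_r)
    return tf.clip_by_value(s_bar, 0.0, 1.0)   # hard clip back to [0, 1]

def hard_concrete_mean(log_alpha, limit_l=-0.1, limit_r=1.1):
    # Deterministic gate for the eval path (is_training=False).
    s_bar = tf.sigmoid(log_alpha) * (limit_r - limit_l) + limit_l
    return tf.clip_by_value(s_bar, 0.0, 1.0)

# Inside FlopFullyConnected.call, the output would then be roughly
# inputs * hard_concrete_sample(self.log_alpha, ...) during training.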
126 changes: 90 additions & 36 deletions flop/modeling_flop.py
@@ -1,4 +1,5 @@
from modeling import *
import layers


class BertModelHardConcrete(BertModel):
@@ -78,7 +79,7 @@ def __init__(self,

# Run the stacked transformer.
# `sequence_output` shape = [batch_size, seq_length, hidden_size].
self.all_encoder_layers = transformer_model_train(
self.all_encoder_layers = transformer_model_flop(
input_tensor=self.embedding_output,
attention_mask=attention_mask,
hidden_size=config.hidden_size,
@@ -89,7 +90,8 @@ def __init__(self,
hidden_dropout_prob=config.hidden_dropout_prob,
attention_probs_dropout_prob=config.attention_probs_dropout_prob,
initializer_range=config.initializer_range,
do_return_all_layers=True)
do_return_all_layers=True,
is_training=is_training)

self.sequence_output = self.all_encoder_layers[-1]
# The "pooler" converts the encoded sequence tensor of shape
@@ -109,20 +111,21 @@ def __init__(self,
kernel_initializer=create_initializer(config.initializer_range))


def attention_layer_train(from_tensor,
to_tensor,
attention_mask=None,
num_attention_heads=1,
size_per_head=512,
query_act=None,
key_act=None,
value_act=None,
attention_probs_dropout_prob=0.0,
initializer_range=0.02,
do_return_2d_tensor=False,
batch_size=None,
from_seq_length=None,
to_seq_length=None):
def attention_layer_flop(from_tensor,
to_tensor,
attention_mask=None,
num_attention_heads=1,
size_per_head=512,
query_act=None,
key_act=None,
value_act=None,
attention_probs_dropout_prob=0.0,
initializer_range=0.02,
do_return_2d_tensor=False,
batch_size=None,
from_seq_length=None,
to_seq_length=None,
is_training=True):

def transpose_for_scores(input_tensor, batch_size, num_attention_heads,
seq_length, width):
@@ -169,9 +172,16 @@ def transpose_for_scores(input_tensor, batch_size, num_attention_heads,
name="query_p",
kernel_initializer=create_initializer(initializer_range))

# Attention: log_alpha_initializer, eps, beta, limit_l, limit_r!
query_layer_mask = layers.FlopFullyConnected(
name="query_g",
log_alpha_initializer=None,
is_training=is_training)

query_layer_mask_output = query_layer_mask(query_layer_p)

query_layer = tf.layers.dense(
query_layer_p,
query_layer_mask_output,
num_attention_heads * size_per_head,
activation=query_act,
name="query_q",
Expand All @@ -194,8 +204,16 @@ def transpose_for_scores(input_tensor, batch_size, num_attention_heads,
name="key_p",
kernel_initializer=create_initializer(initializer_range))

# Attention: log_alpha_initializer, eps, beta, limit_l, limit_r!
key_layer_mask = layers.FlopFullyConnected(
name="key_g",
log_alpha_initializer=None,
is_training=is_training)

key_layer_mask_output = key_layer_mask(key_layer_p)

key_layer = tf.layers.dense(
key_layer_p,
key_layer_mask_output,
num_attention_heads * size_per_head,
activation=key_act,
name="key_q",
Expand All @@ -218,8 +236,16 @@ def transpose_for_scores(input_tensor, batch_size, num_attention_heads,
name="value_p",
kernel_initializer=create_initializer(initializer_range))

# Attention: log_alpha_initializer, eps, beta, limit_l, limit_r!
value_layer_mask = layers.FlopFullyConnected(
name="value_g",
log_alpha_initializer=None,
is_training=is_training)

value_layer_mask_output = value_layer_mask(value_layer_p)

value_layer = tf.layers.dense(
value_layer_p,
value_layer_mask_output,
num_attention_heads * size_per_head,
activation=value_act,
name="value_q",
@@ -298,17 +324,18 @@ def transpose_for_scores(input_tensor, batch_size, num_attention_heads,
return context_layer


def transformer_model_train(input_tensor,
attention_mask=None,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
intermediate_act_fn=gelu,
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
initializer_range=0.02,
do_return_all_layers=False):
def transformer_model_flop(input_tensor,
attention_mask=None,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
intermediate_act_fn=gelu,
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
initializer_range=0.02,
do_return_all_layers=False,
is_training=True):
if hidden_size % num_attention_heads != 0:
raise ValueError(
"The hidden size (%d) is not a multiple of the number of attention "
@@ -340,7 +367,7 @@ def transformer_model_train(input_tensor,
with tf.variable_scope("attention"):
attention_heads = []
with tf.variable_scope("self"):
attention_head = attention_layer_train(
attention_head = attention_layer_flop(
from_tensor=layer_input,
to_tensor=layer_input,
attention_mask=attention_mask,
@@ -351,7 +378,8 @@ def transformer_model_train(input_tensor,
do_return_2d_tensor=True,
batch_size=batch_size,
from_seq_length=seq_length,
to_seq_length=seq_length)
to_seq_length=seq_length,
is_training=is_training)
attention_heads.append(attention_head)

attention_output = None
@@ -373,8 +401,17 @@ def transformer_model_train(input_tensor,
name="dense_p",
kernel_initializer=create_initializer(initializer_range))

# Attention: log_alpha_initializer, eps, beta, limit_l, limit_r!
attention_output_mask = layers.FlopFullyConnected(
name="dense_g",
log_alpha_initializer=None,
is_training=is_training)

attention_output_mask_output = attention_output_mask(
attention_output_p)

attention_output = tf.layers.dense(
attention_output_p,
attention_output_mask_output,
hidden_size,
name="dense_q",
kernel_initializer=create_initializer(initializer_range))
@@ -400,8 +437,17 @@ def transformer_model_train(input_tensor,
name='dense_p',
kernel_initializer=create_initializer(initializer_range))

# Attention: log_alpha_initializer, eps, beta, limit_l, limit_r!
intermediate_output_mask = layers.FlopFullyConnected(
name="dense_g",
log_alpha_initializer=None,
is_training=is_training)

intermediate_output_mask_output = intermediate_output_mask(
intermediate_output_p)

intermediate_output = tf.layers.dense(
intermediate_output_p,
intermediate_output_mask_output,
intermediate_size,
activation=intermediate_act_fn,
name='dense_q',
@@ -423,8 +469,16 @@ def transformer_model_train(input_tensor,
name="dense_p",
kernel_initializer=create_initializer(initializer_range))

# Attention: log_alpha_initializer, eps, beta, limit_l, limit_r!
layer_output_mask = layers.FlopFullyConnected(
name="dense_g",
log_alpha_initializer=None,
is_training=is_training)

layer_output_mask_output = layer_output_mask(layer_output_p)

layer_output = tf.layers.dense(
layer_output_p,
layer_output_mask_output,
hidden_size,
name="dense_q",
kernel_initializer=create_initializer(initializer_range))
@@ -530,7 +584,7 @@ def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument

is_training = (mode == tf.estimator.ModeKeys.TRAIN)

(total_loss, per_example_loss, logits, probabilities) = create_model(
(total_loss, per_example_loss, logits, probabilities) = create_model_train(
bert_config, is_training, input_ids, input_mask, segment_ids, label_ids,
num_labels, use_one_hot_embeddings)

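The same pattern repeats above for query/key/value, the attention output, the intermediate layer and the layer output: a dense *_p projection, a FlopFullyConnected gate named *_g over the inner dimension, then a dense *_q projection, so the original weight is effectively P @ diag(g) @ Q and inner units whose gate collapses to zero can be pruned. A condensed sketch of that pattern, assuming FlopFullyConnected multiplies its input element-wise by the gate; the wrapper name flop_dense and the rank argument are illustrative:

import tensorflow as tf
import layers  # flop/layers.py, i.e. the FlopFullyConnected shown above

def flop_dense(x, rank, out_size, name, initializer_range=0.02,
               activation=None, is_training=True):
    # y = ((x @ P) * g) @ Q: factorized projection with a prunable gate g.
    inner = tf.layers.dense(
        x, rank, name=name + "_p",
        kernel_initializer=tf.truncated_normal_initializer(stddev=initializer_range))
    gate = layers.FlopFullyConnected(
        name=name + "_g", log_alpha_initializer=None, is_training=is_training)
    inner = gate(inner)  # element-wise hard concrete mask over the rank dimension
    return tf.layers.dense(
        inner, out_size, activation=activation, name=name + "_q",
        kernel_initializer=tf.truncated_normal_initializer(stddev=initializer_range))

# e.g. query_layer = flop_dense(from_tensor_2d, rank,
#                               num_attention_heads * size_per_head,
#                               name="query", is_training=is_training)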
3 changes: 1 addition & 2 deletions flop/nn.py
@@ -20,8 +20,7 @@

import tensorflow as tf

from state_of_sparsity.layers.l0_regularization import common
from state_of_sparsity.layers.utils import layer_utils
import common


def matmul_train(