Optimizing a TensorFlow Model

Optimizing the hyperparameters of a TensorFlow model is no harder than any other optimization. The only difficulty is that hyperparameters are set at multiple levels. For example, the learning rate is set in the training function, while the number of neurons in a given layer is set while constructing the model.

Let's say we want to optimize the hyperparameters of a convolutional neural network over a bunch of parameters, including the activation function per layer, the number of neurons in each layer and even the number of layers. First, we need a function that builds the model.

import tensorflow as tf
from tensorflow import layers

def cnn_model(inputs, targets, dropout_keep_prob, params):
    """Build the CNN graph described by ``params`` and return its output tensor.

    The network is a stack of ``params["num_conv_layers"]`` blocks, each made
    of two identical convolutions followed by a max pooling. The result goes
    through dropout and a flatten, then ``params["num_fc_layers"]`` fully
    connected layers (each followed by dropout) and a final output layer.

    Args:
        inputs: Tensor fed to the first convolution (presumably a 4-D image
            batch, since ``conv2d`` is applied to it directly -- confirm
            against the caller).
        targets: Tensor of targets; only the static size of its second
            dimension is read, to size the output layer.
        dropout_keep_prob: Keep probability forwarded to every
            ``tf.nn.dropout`` call.
        params: Mapping holding ``num_conv_layers``, ``num_fc_layers`` and,
            for every layer index ``i``, the ``conv_{i}_*``, ``mp_{i}_*`` and
            ``fc_{i}_*`` entries read below.

    Returns:
        The output of the final fully connected layer, which uses an
        identity activation.
    """
    net = inputs

    # Convolutional part: the number of blocks comes from the parameter set.
    for block in range(params["num_conv_layers"]):
        conv_settings = {
            "filters": params["conv_{}_num_outputs".format(block)],
            "kernel_size": params["conv_{}_kernel_size".format(block)],
            "strides": 1,
            "padding": "SAME",
            "activation": params["conv_{}_activation_fn".format(block)],
        }
        with tf.variable_scope("conv_{}".format(block)):
            # Two consecutive convolutions sharing the same hyperparameters.
            for _ in range(2):
                net = layers.conv2d(net, **conv_settings)

        with tf.variable_scope("mp_{}".format(block)):
            pool_size = params["mp_{}_kernel_size".format(block)]
            net = layers.max_pooling2d(net,
                                       pool_size=pool_size,
                                       strides=1,
                                       padding="VALID")

    # The dropout keep probability is supplied at train time.
    net = tf.nn.dropout(net, keep_prob=dropout_keep_prob)
    net = tf.contrib.layers.flatten(net)

    # Fully connected part: the number of layers comes from the parameter set.
    for layer in range(params["num_fc_layers"]):
        with tf.variable_scope("fc_{}".format(layer)):
            width = params["fc_{}_num_outputs".format(layer)]
            activation = params["fc_{}_activation_fn".format(layer)]
            net = tf.contrib.layers.fully_connected(net, width,
                                                    activation_fn=activation)
            net = tf.nn.dropout(net, keep_prob=dropout_keep_prob)

    # The output layer has one unit per column of ``targets``.
    num_output = int(targets.get_shape()[1])
    with tf.variable_scope("output_layer"):
        net = tf.contrib.layers.fully_connected(net, num_output,
                                                activation_fn=tf.identity)

    return net

Then, we need a function to train the model that also has parameters to optimize such as the learning rate, the decay rate and the dropout keep probability. (No, it is not the ideal train function, it is just a demo.)

def score_cnn(X, y, params):
    """Train the CNN described by ``params`` and return its validation loss.

    The data is split 80/20 into training and validation sets. The model is
    trained for a fixed number of passes over the training set with Adam and
    an exponentially decayed learning rate, then evaluated on the validation
    set with dropout disabled.

    Args:
        X: Array of inputs; ``X.shape[1:]`` defines the input placeholder.
        y: Array of targets; ``y.shape[1]`` is used as the number of classes
            (presumably one-hot encoded labels -- confirm against the caller).
        params: Mapping of hyperparameters. This function reads
            ``initial_learning_rate``, ``decay_learning_rate``,
            ``decay_steps`` and ``dropout_keep_prob``; the rest is consumed
            by ``cnn_model``.

    Returns:
        A dict ``{"loss": valid_loss}`` where ``valid_loss`` is the mean
        per-sample cross-entropy over the validation set.
    """
    train_steps = 20
    batch_size = 128
    num_classes = y.shape[1]

    lr_init = params["initial_learning_rate"]
    lr_decay = params["decay_learning_rate"]
    decay_steps = params["decay_steps"]

    # NOTE(review): train_test_split is not imported in the visible part of
    # this file; presumably sklearn.model_selection.train_test_split --
    # confirm and import it.
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2)

    # Build every call in its own graph so that repeated evaluations (one per
    # sampled parameter set) do not collide on variable names, and use the
    # session as a context manager so that it is always closed.
    with tf.Graph().as_default(), tf.Session() as sess:
        X_ = tf.placeholder(tf.float32, shape=(None,) + X.shape[1:])
        y_ = tf.placeholder(tf.float32, shape=(None, num_classes))
        keep_prob_ = tf.placeholder(tf.float32)
        lr_ = tf.placeholder(tf.float32)

        logits = cnn_model(X_, y_, keep_prob_, params)

        loss_func = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=y_))
        optimizer_func = tf.train.AdamOptimizer(lr_).minimize(loss_func)

        sess.run(tf.global_variables_initializer())

        for step in range(train_steps):
            # Force float division so the decay exponent is never truncated.
            lr = lr_init * lr_decay ** (step / float(decay_steps))
            for i in range(0, X_train.shape[0], batch_size):
                feed_dict = {lr_: lr,
                             X_: X_train[i:i + batch_size],
                             y_: y_train[i:i + batch_size],
                             keep_prob_: params["dropout_keep_prob"]}
                # Run the optimizer on every mini-batch, not only the last one.
                sess.run(optimizer_func, feed_dict=feed_dict)

        # Weight each batch mean by its size: the last batch may be smaller,
        # and the validation set may hold fewer samples than one full batch.
        total_loss = 0.0
        for i in range(0, X_valid.shape[0], batch_size):
            X_batch = X_valid[i:i + batch_size]
            feed_dict = {X_: X_batch,
                         y_: y_valid[i:i + batch_size],
                         keep_prob_: 1.0}
            batch_loss = sess.run(loss_func, feed_dict=feed_dict)
            total_loss += batch_loss * X_batch.shape[0]
        valid_loss = float(total_loss / X_valid.shape[0])

    return {"loss" : valid_loss}

The flexibility of the last pieces of code comes at a price: the number of parameters to set in the search space is quite large. The next table summarizes all the parameters that need to be set, along with their types.

| Model                  | Type    | Training              | Type    |
|------------------------|---------|-----------------------|---------|
| num_conv_layers        | integer | initial_learning_rate | float   |
| conv_{i}_num_outputs   | integer | decay_learning_rate   | float   |
| conv_{i}_kernel_size   | integer | decay_steps           | integer |
| conv_{i}_activation_fn | choice  | dropout_keep_prob     | float   |
| mp_{i}_kernel_size     | integer |                       |         |
| num_fc_layers          | integer |                       |         |
| fc_{i}_num_outputs     | integer |                       |         |
| fc_{i}_activation_fn   | choice  |                       |         |

Since there are so many hyperparameters, let's just define a function that will create the search space. The four training hyperparameters will sit at the top level of our space and the two defining the number of layers will constitute our conditions. All others will be set for these conditions.

import chocolate as choco

# Bounds on the number of layers explored by create_space(). They are used as
# the (exclusive) stop of range(1, ...), so the search space contains 1 to 7
# convolution blocks and 1 to 2 fully connected layers.
max_num_conv_layers = 8
max_num_fc_layers = 3

def create_space():
    """Build the conditional search space for the CNN and its training.

    The four training hyperparameters sit at the top level. The keys
    ``num_conv_layers`` and ``num_fc_layers`` are conditions: each maps a
    layer count (from 1 up to, but excluding, the matching ``max_num_*``
    constant) to the per-layer parameters needed for that count.

    Returns:
        A dict describing the search space.
    """
    def conv_layer_params(layer):
        # Parameters of one convolution block and its max pooling.
        return {
            "conv_{}_num_outputs".format(layer): choco.quantized_log(low=3, high=10, step=1, base=2),
            "conv_{}_kernel_size".format(layer): choco.quantized_uniform(low=1, high=7, step=1),
            "conv_{}_activation_fn".format(layer): choco.choice([tf.nn.relu, tf.nn.elu, tf.nn.tanh]),
            "mp_{}_kernel_size".format(layer): choco.quantized_uniform(low=2, high=5, step=1),
        }

    def fc_layer_params(layer):
        # Parameters of one fully connected layer.
        return {
            "fc_{}_num_outputs".format(layer): choco.quantized_log(low=3, high=10, step=1, base=2),
            "fc_{}_activation_fn".format(layer): choco.choice([tf.nn.relu, tf.nn.elu, tf.nn.tanh]),
        }

    def build_branches(stop, layer_params):
        # One branch per layer count; each branch gets fresh distributions.
        branches = {}
        for depth in range(1, stop):
            branch = {}
            for layer in range(depth):
                branch.update(layer_params(layer))
            branches[depth] = branch
        return branches

    search_space = {
        "initial_learning_rate": choco.log(low=-5, high=-2, base=10),
        "decay_learning_rate": choco.uniform(low=0.7, high=1.0),
        "decay_steps": choco.quantized_log(low=2, high=4, step=1, base=10),
        "dropout_keep_prob": choco.uniform(low=0.5, high=0.95),
    }
    search_space["num_conv_layers"] = build_branches(max_num_conv_layers,
                                                     conv_layer_params)
    search_space["num_fc_layers"] = build_branches(max_num_fc_layers,
                                                   fc_layer_params)
    return search_space

Guess how large the largest conditional branch of this search space is: it has 36 parameters. 36 parameters is quite a lot to optimize by hand. The entire tree has 124 parameters! That is why we built Chocolate.

Oh yeah, I forgot about the last bit of code. The one that does the trick.

if __name__ == "__main__":
    # NOTE(review): some_dataset is not defined in the visible part of this
    # file; presumably a stand-in for the user's own data loader.
    features, labels = some_dataset()

    # Describe the search space, then attach a Bayes sampler to a SQLite
    # database at the given URL.
    search_space = create_space()
    connection = choco.SQLiteConnection(url="sqlite:///db.db")
    sampler = choco.Bayes(connection, search_space, random_state=42, skip=0)

    # Draw one candidate, score it and report the result to the sampler.
    token, candidate = sampler.next()
    sampler.update(token, score_cnn(features, labels, candidate))

Nah, there was absolutely nothing new here compared to the previous tutorials.