TensorFlow for Deep Learning Research

9 minute read


Notes from CS 20: TensorFlow for Deep Learning Research, taught by Chip Huyen at Stanford University.

Overview of TensorFlow

To put part of a graph on a specific CPU or GPU:

# Creates a graph.
with tf.device('/gpu:2'):
  a = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], name='a')
  b = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], name='b')
  c = tf.multiply(a, b)

# Creates a session with log_device_placement set to True.
sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))

# Runs the op.
print(sess.run(c))

tf.Graph()

g = tf.Graph()

with g.as_default():
    x = tf.add(3, 5)

with tf.Session(graph=g) as sess:
    sess.run(x)

Multiple graphs

g1 = tf.get_default_graph()
g2 = tf.Graph()

# add ops to the default graph
with g1.as_default():
    a = tf.constant(3)

# add ops to the user created graph
with g2.as_default():
    b = tf.constant(5)

Operations

Visualize with TensorBoard

import tensorflow as tf

a = tf.constant(2)
b = tf.constant(3)
x = tf.add(a, b)

writer = tf.summary.FileWriter('./graphs', tf.get_default_graph())
with tf.Session() as sess:
    # writer = tf.summary.FileWriter('./graphs', sess.graph) 
    print(sess.run(x))
writer.close() # close the writer when you’re done using it

Run TensorBoard

$ python3 [yourprogram].py
$ tensorboard --logdir="./graphs" --port 6006

Variables

s = tf.get_variable("scalar", initializer=tf.constant(2)) 
m = tf.get_variable("matrix", initializer=tf.constant([[0, 1], [2, 3]]))
W = tf.get_variable("big_matrix", shape=(784, 10), initializer=tf.zeros_initializer())

Initialize variables

# The easiest way is initializing all variables at once:
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

# Initialize only a subset of variables:
with tf.Session() as sess:
    sess.run(tf.variables_initializer([a, b]))

# Initialize a single variable
W = tf.Variable(tf.zeros([784,10]))
with tf.Session() as sess:
    sess.run(W.initializer)

Each session maintains its own copy of variables

W = tf.Variable(10)

sess1 = tf.Session()
sess2 = tf.Session()

sess1.run(W.initializer)
sess2.run(W.initializer)

print(sess1.run(W.assign_add(10))) 	# >> 20
print(sess2.run(W.assign_sub(2)))  # >> 8

print(sess1.run(W.assign_add(100))) # >> 120
print(sess2.run(W.assign_sub(50)))  # >> -42

sess1.close()
sess2.close()

Placeholders

# create a placeholder for a vector of 3 elements, type tf.float32
a = tf.placeholder(tf.float32, shape=[3])

b = tf.constant([5, 5, 5], tf.float32)

# use the placeholder as you would a constant or a variable
c = a + b  # short for tf.add(a, b)

with tf.Session() as sess:
    print(sess.run(c, feed_dict={a: [1, 2, 3]}))

# >> [6, 7, 8]

Lazy loading: a VERY BAD example

x = tf.Variable(10, name='x')
y = tf.Variable(20, name='y')

writer = tf.summary.FileWriter('./graphs/lazy_loading', tf.get_default_graph())
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for _ in range(10):
        # someone decides to be clever and save one line of code, but every
        # iteration adds a brand-new "add" node to the graph
        sess.run(tf.add(x, y))
writer.close()
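For contrast, the normal-loading version from the same course notes defines the add op once, so the graph contains a single add node no matter how many times it runs:

x = tf.Variable(10, name='x')
y = tf.Variable(20, name='y')
z = tf.add(x, y)  # the op is created once, before the session starts

writer = tf.summary.FileWriter('./graphs/normal_loading', tf.get_default_graph())
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for _ in range(10):
        sess.run(z)
writer.close()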

Linear and Logistic Regression

Implementing Huber loss

$L_{\delta}(y, f(x)) = \begin{cases} \frac{1}{2}(y-f(x))^2, & \mbox{for } |y-f(x)|\le \delta, \newline \delta |y-f(x)| - \frac{1}{2}\delta^2, & \mbox{otherwise}. \end{cases}$

def huber_loss(labels, predictions, delta=14.0):
    residual = tf.abs(labels - predictions)
    def f1(): return 0.5 * tf.square(residual)
    def f2(): return delta * residual - 0.5 * tf.square(delta)
    return tf.cond(residual < delta, f1, f2)

tf.data.Dataset

tf.data.Dataset.from_tensor_slices((features, labels))
dataset = tf.data.Dataset.from_tensor_slices((data[:,0], data[:,1]))

print(dataset.output_types)    # >> (tf.float32, tf.float32)
print(dataset.output_shapes)   # >> (TensorShape([]), TensorShape([]))

Create Dataset from files

tf.data.TextLineDataset(filenames)
tf.data.FixedLengthRecordDataset(filenames)
tf.data.TFRecordDataset(filenames)

tf.data.Iterator

iterator = dataset.make_one_shot_iterator()
X, Y = iterator.get_next()         # X is the birth rate, Y is the life expectancy

with tf.Session() as sess:
    print(sess.run([X, Y]))		# >> [1.822, 74.82825]
    print(sess.run([X, Y]))		# >> [3.869, 70.81949]
    print(sess.run([X, Y]))		# >> [3.911, 72.15066]

##########

iterator = dataset.make_initializable_iterator()
...
for i in range(100): 
    sess.run(iterator.initializer) 
    total_loss = 0
    try:
        while True:
            sess.run([optimizer]) 
    except tf.errors.OutOfRangeError:
        pass

Eager execution

Boilerplate

x = tf.placeholder(tf.float32, shape=[1, 1])
m = tf.matmul(x, x)

print(m)
# Tensor("MatMul:0", shape=(1, 1), dtype=float32)

with tf.Session() as sess:
  m_out = sess.run(m, feed_dict={x: [[2.]]})
print(m_out)
# [[4.]]

##########

x = [[2.]]  # No need for placeholders!
m = tf.matmul(x, x)

print(m)  # No sessions!
# tf.Tensor([[4.]], shape=(1, 1), dtype=float32)

Lazy Loading

x = tf.random_uniform([2, 2])

with tf.Session() as sess:
  for i in range(x.shape[0]):
    for j in range(x.shape[1]):
      print(sess.run(x[i, j]))
      
##########

x = tf.random_uniform([2, 2])

for i in range(x.shape[0]):
  for j in range(x.shape[1]):
    print(x[i, j])

Tensors Act Like NumPy Arrays

import numpy as np

x = tf.constant([1.0, 2.0, 3.0])

# Tensors are backed by NumPy arrays
assert type(x.numpy()) == np.ndarray
squared = np.square(x) # Tensors are compatible with NumPy functions
 
# Tensors are iterable!
for i in x:
  print(i)

Gradients

import tensorflow.contrib.eager as tfe  # these examples assume eager execution is enabled

def square(x):
  return x ** 2

grad = tfe.gradients_function(square)

print(square(3.))    # tf.Tensor(9., shape=(), dtype=float32)
print(grad(3.))      # [tf.Tensor(6., shape=(), dtype=float32)]


x = tfe.Variable(2.0)
def loss(y):
  return (y - x ** 2) ** 2

grad = tfe.implicit_gradients(loss)

print(loss(7.))  # tf.Tensor(9., shape=(), dtype=float32)
print(grad(7.))  # [(<tf.Tensor: -24.0, shape=(), dtype=float32>, <tf.Variable 'Variable:0', shape=() dtype=float32, numpy=2.0>)]

Variable sharing and managing experiments

Word Embedding

  1. CBOW: use neighbors to predict center
  2. Skip-Gram: use center to predict neighbors (a pair-generation sketch follows the list)
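For concreteness, a minimal sketch (not the course's code; skip_gram_pairs is a hypothetical helper) of generating skip-gram (center, context) training pairs from a tokenized sentence:

def skip_gram_pairs(tokens, window=1):
    """Yield (center, context) pairs: the center word predicts its neighbors."""
    for i, center in enumerate(tokens):
        for j in range(max(0, i - window), min(len(tokens), i + window + 1)):
            if j != i:
                yield center, tokens[j]

print(list(skip_gram_pairs(['the', 'quick', 'brown', 'fox'])))
# [('the', 'quick'), ('quick', 'the'), ('quick', 'brown'), ...]

Flipping the direction of prediction (neighbors predict the center) gives CBOW.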

Structure TensorFlow model

class SkipGramModel:
    """ Build the graph for word2vec model """
    def __init__(self, params):
        pass

    def _import_data(self):
        """ Step 1: import data """
        pass

    def _create_embedding(self):
        """ Step 2: define weights. In word2vec, it's actually the weights that we care about """
        pass

    def _create_loss(self):
        """ Step 3 + 4: define the inference + the loss function """
        pass

    def _create_optimizer(self):
        """ Step 5: define optimizer """
        pass

Name scope (TensorFlow doesn’t know which nodes should be grouped together unless you tell it)

with tf.name_scope(name_of_that_scope):
    # declare op_1
    # declare op_2
    # ...
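For example, a hypothetical grouping (the scope names 'data' and 'loss' are illustrative) that TensorBoard will collapse into two expandable nodes:

with tf.name_scope('data'):
    X = tf.placeholder(tf.float32, shape=[None, 784], name='X')
    Y = tf.placeholder(tf.float32, shape=[None, 10], name='Y')

with tf.name_scope('loss'):
    w = tf.Variable(tf.zeros([784, 10]), name='weights')
    logits = tf.matmul(X, w)
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(labels=Y, logits=logits))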

Variable scope

tf.get_variable(<name>, <shape>, <initializer>)

If a variable with <name> already exists, reuse it
If not, initialize it with <shape> using <initializer>

##########

def fully_connected(x, output_dim, scope):
    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE) as scope:
        w = tf.get_variable("weights", [x.shape[1], output_dim], initializer=tf.random_normal_initializer())
        b = tf.get_variable("biases", [output_dim], initializer=tf.constant_initializer(0.0))
        return tf.matmul(x, w) + b

def two_hidden_layers(x):
    h1 = fully_connected(x, 50, 'h1')
    h2 = fully_connected(h1, 10, 'h2')
    return h2

with tf.variable_scope('two_layers') as scope:
    logits1 = two_hidden_layers(x1)
    logits2 = two_hidden_layers(x2)

tf.train.Saver

  1. Only saves variables, not the graph
  2. Checkpoints map variable names to tensors (a minimal sketch follows the list)
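A minimal save/restore sketch (the checkpoint path is illustrative); the training-loop example under tf.summary below shows the same calls combined with summaries:

saver = tf.train.Saver()  # defaults to saving all variables

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # writes checkpoints/model-100.index, .data-*, .meta
    saver.save(sess, 'checkpoints/model', global_step=100)

with tf.Session() as sess:
    # restore loads the saved variable values; no initializer needed
    saver.restore(sess, 'checkpoints/model-100')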

tf.summary

tf.summary.scalar("loss", self.loss)
tf.summary.histogram("histogram loss", self.loss)
summary_op = tf.summary.merge_all()

saver = tf.train.Saver() # defaults to saving all variables

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    ckpt = tf.train.get_checkpoint_state(os.path.dirname('checkpoints/checkpoint'))
    if ckpt and ckpt.model_checkpoint_path:
        saver.restore(sess, ckpt.model_checkpoint_path)

    writer = tf.summary.FileWriter('./graphs', sess.graph)
    for index in range(10000):
        ...
        loss_batch, _, summary = sess.run([loss, optimizer, summary_op])
        writer.add_summary(summary, global_step=index)

        if (index + 1) % 1000 == 0:
            saver.save(sess, 'checkpoints/skip-gram', index)

Control randomization

tf.reset_default_graph()
tf.set_random_seed(2)
c = tf.random_uniform([], -10, 10)
d = tf.random_uniform([], -10, 10)

with tf.Session() as sess:
    print(sess.run(c))  # >> -4.007516
    print(sess.run(d))  # >> -2.9833937

with tf.Session() as sess:
    print(sess.run(c))  # >> -4.007516  (a new session replays the same sequence)
    print(sess.run(d))  # >> -2.9833937

Introduction to ConvNets && ConvNets in TensorFlow && Convolutional Neural Networks

Convolutional networks are tailor-made for computer vision tasks.

They exploit:

  1. Hierarchical nature of features
  2. Translation invariance of features

“Understanding” what a convnet learns is non-trivial, but some clever approaches exist.
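As a minimal sketch of a convolutional layer in TF 1.x (the shapes and layer sizes here are illustrative, not a specific course model):

images = tf.placeholder(tf.float32, shape=[None, 28, 28, 1])

# 32 filters of size 5x5; 'same' padding keeps the 28x28 spatial size
conv1 = tf.layers.conv2d(images, filters=32, kernel_size=5,
                         padding='same', activation=tf.nn.relu)

# 2x2 max pooling with stride 2 halves the spatial size: 28x28 -> 14x14
pool1 = tf.layers.max_pooling2d(conv1, pool_size=2, strides=2)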

TFRecord:

  1. The recommended format for TensorFlow
  2. Binary file format (a serialized tf.train.Example protobuf object)
  3. Makes better use of the disk cache
  4. Faster to move around
  5. Can handle data of different types

Convert to TFRecord format

# Step 1: create a writer to write tfrecord to that file
writer = tf.python_io.TFRecordWriter(out_file)

# Step 2: get serialized shape and values of the image
shape, binary_image = get_image_binary(image_file)

# Step 3: create a tf.train.Features object
features = tf.train.Features(feature={'label': _int64_feature(label),
                                      'shape': _bytes_feature(shape),
                                      'image': _bytes_feature(binary_image)})

# Step 4: create a sample containing the features defined above
sample = tf.train.Example(features=features)

# Step 5: write the sample to the tfrecord file
writer.write(sample.SerializeToString())
writer.close()


def _int64_feature(value):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

def _bytes_feature(value):
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

Read TFRecord

def _parse_function(tfrecord_serialized):
    features = {'label': tf.FixedLenFeature([], tf.int64),
                'shape': tf.FixedLenFeature([], tf.string),
                'image': tf.FixedLenFeature([], tf.string)}

    parsed_features = tf.parse_single_example(tfrecord_serialized, features)

    return parsed_features['label'], parsed_features['shape'], parsed_features['image']

dataset = tf.data.TFRecordDataset(tfrecord_files)
dataset = dataset.map(_parse_function)

Variational Auto-Encoders

Recurrent Neural Networks

Construct Cells

cell = tf.nn.rnn_cell.GRUCell(hidden_size)

Stack Multiple Cells

layers = [tf.nn.rnn_cell.GRUCell(size) for size in hidden_sizes]
cells = tf.nn.rnn_cell.MultiRNNCell(layers)
output, out_state = tf.nn.dynamic_rnn(cells, seq, length, initial_state)

Dealing with variable sequence length

# Approach 1:
# Maintain a mask (True for real, False for padded tokens)

full_loss = tf.nn.softmax_cross_entropy_with_logits(logits=preds, labels=labels)
loss = tf.reduce_mean(tf.boolean_mask(full_loss, mask))


# Approach 2:
# Tell the model the real sequence length so it only predicts labels for the real tokens

rnn_cells = tf.nn.rnn_cell.MultiRNNCell(
    [tf.nn.rnn_cell.GRUCell(hidden_size) for _ in range(num_layers)])
# real length of each sequence: count the time steps that have any nonzero entry
# (this assumes padded positions are all-zero vectors)
length = tf.reduce_sum(tf.reduce_max(tf.sign(seq), 2), 1)
output, out_state = tf.nn.dynamic_rnn(rnn_cells, seq, length, initial_state)

Clip gradients with tf.clip_by_global_norm

trainables = tf.trainable_variables()
gradients = tf.gradients(cost, trainables)
# take gradients of cost w.r.t. all trainable variables

clipped_gradients, _ = tf.clip_by_global_norm(gradients, max_grad_norm)
# clip the gradients by a pre-defined max norm

optimizer = tf.train.AdamOptimizer(learning_rate)
train_op = optimizer.apply_gradients(zip(clipped_gradients, trainables))
# apply the clipped (not the raw) gradients with the optimizer

Seq2seq with Attention

Beam search: on each step of the decoder, keep track of the k most probable partial translations (a toy sketch follows the list below)

  1. k is the beam size (in practice around 5 to 10)
  2. Not guaranteed to find optimal solution
  3. But much more efficient
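A toy, framework-free sketch of the idea (the step_log_probs callback standing in for a real decoder is hypothetical):

import heapq
import numpy as np

def beam_search(step_log_probs, vocab_size, k=5, max_len=10):
    """Keep only the k most probable partial sequences at each decoding step."""
    beams = [(0.0, [])]  # (cumulative log-prob, token sequence)
    for _ in range(max_len):
        candidates = []
        for score, seq in beams:
            log_probs = step_log_probs(seq)  # next-token log-probs given the prefix
            for token in range(vocab_size):
                candidates.append((score + log_probs[token], seq + [token]))
        beams = heapq.nlargest(k, candidates, key=lambda c: c[0])
    return beams

# toy decoder: the same distribution regardless of the prefix
dist = np.log(np.array([0.5, 0.3, 0.2]))
print(beam_search(lambda seq: dist, vocab_size=3, k=2, max_len=3))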

BLEU (Bilingual Evaluation Understudy) compares a machine-written translation to one or more human-written reference translations and computes a similarity score (a simplified sketch follows the list) based on:

  1. n-gram precision (usually up to 3 or 4-grams)
  2. Penalty for too-short system translations
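A simplified single-reference sketch of those two ingredients (real BLEU aggregates counts over a whole corpus and typically applies smoothing):

import math
from collections import Counter

def ngrams(tokens, n):
    return Counter(tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1))

def bleu(candidate, reference, max_n=4):
    # modified n-gram precision: candidate counts clipped by reference counts
    precisions = []
    for n in range(1, max_n + 1):
        cand, ref = ngrams(candidate, n), ngrams(reference, n)
        overlap = sum(min(count, ref[gram]) for gram, count in cand.items())
        precisions.append(overlap / max(1, sum(cand.values())))
    if min(precisions) == 0:
        return 0.0
    # brevity penalty: punish candidates shorter than the reference
    bp = min(1.0, math.exp(1 - len(reference) / len(candidate)))
    return bp * math.exp(sum(math.log(p) for p in precisions) / max_n)

print(bleu('the cat sat on the mat'.split(), 'the cat sat on a mat'.split()))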

BLEU is useful but imperfect:

  1. There are many valid ways to translate a sentence
  2. So a good translation can get a poor BLEU score because it has low n-gram overlap with the human translation

Attention provides a solution to the bottleneck problem: the encoder no longer has to squeeze the entire source sentence into a single fixed-size vector.

Core idea: on each step of the decoder, focus on a particular part of the source sequence.
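A minimal NumPy sketch of dot-product attention (shapes are illustrative; this is the generic mechanism, not the course's exact model):

import numpy as np

def softmax(x):
    e = np.exp(x - x.max())
    return e / e.sum()

def attend(decoder_state, encoder_states):
    """decoder_state: (d,), encoder_states: (T, d) -> context vector of shape (d,)."""
    scores = encoder_states @ decoder_state  # one alignment score per source position
    weights = softmax(scores)                # attention distribution over the source
    return weights @ encoder_states          # weighted sum of encoder states = context

encoder_states = np.random.randn(7, 4)  # 7 source positions, hidden size 4
decoder_state = np.random.randn(4)
print(attend(decoder_state, encoder_states))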

Beyond RNNs: Transformer, Tensor2Tensor

Dialogue Agents

Reinforcement Learning in TensorFlow

Keras