# solvers.py
# ----------
# Licensing Information: You are free to use or extend these projects for
# educational purposes provided that (1) you do not distribute or publish
# solutions, (2) you retain this notice, and (3) you provide clear
# attribution to UC Berkeley, including a link to http://ai.berkeley.edu.
#
# Attribution Information: The Pacman AI projects were developed at UC Berkeley.
# The core projects and autograders were primarily created by John DeNero
# (denero@cs.berkeley.edu) and Dan Klein (klein@cs.berkeley.edu).
# Student side autograding was added by Brad Miller, Nick Hay, and
# Pieter Abbeel (pabbeel@cs.berkeley.edu).


import numpy as np
import tensorflow as tf
import tensorflow_util as tfu
from tensorflow_util import MinibatchIndefinitelyGenerator
import plotUtil
import util

def categorical_crossentropy(predictions, targets):
    return tf.reduce_mean(-tf.reduce_sum(targets * tf.log(tf.clip_by_value(predictions, 1e-10, float('inf'))), reduction_indices=[1]))


def squared_error(predictions, targets):
    return tf.reduce_mean(tf.reduce_sum(tf.square(targets - predictions), reduction_indices=[1]))

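# Illustrative sketch (not part of the assignment): both losses reduce over the
# feature dimension of each data point and then average over the batch. Assuming
# a TensorFlow 1.x session obtained via tfu.get_session(), as used below:
#
#   predictions = tf.constant([[0.7, 0.2, 0.1], [0.1, 0.8, 0.1]])
#   targets = tf.constant([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]])
#   tfu.get_session().run(categorical_crossentropy(predictions, targets))
#   # ~ mean(-log(0.7), -log(0.8)) ~ 0.29
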
class Solver(object):
    """
    Solver abstract class.
    """
    def solve(self, input_train_data, target_train_data, input_val_data, target_val_data, model, callback=None):
        raise NotImplementedError

class GradientDescentSolver(Solver):
    def __init__(self, learning_rate, iterations, momentum=0, weight_decay=1e-3, loss_function=None, plot=0):
        """
        Gradient descent solver for optimizing a model given some data.

        Args:
            learning_rate: also known as alpha. Used for parameter updates.
            iterations: number of gradient steps (i.e. updates) to perform when
                solving a model.
            momentum: also known as mu. Used for velocity updates.
            weight_decay: coefficient for l2 regularization on the loss.
            loss_function: loss function to use for the objective being optimized.
            plot: whether to show a plot of the loss for every iteration.
        """
        if learning_rate < 0:
            raise ValueError('learning_rate should be a non-negative number, %r given' % learning_rate)
        self.learning_rate = learning_rate
        if not isinstance(iterations, int) or iterations < 0:
            raise ValueError('iterations should be a non-negative integer, %r given' % iterations)
        self.iterations = iterations
        if not (0 <= momentum <= 1):
            raise ValueError('momentum should be between 0 and 1 (inclusive), %r given' % momentum)
        self.momentum = momentum
        if weight_decay < 0:
            raise ValueError('weight_decay should be a non-negative number, %r given' % weight_decay)
        self.weight_decay = weight_decay
        self.loss_function = loss_function or categorical_crossentropy
        self.plot = plot
    def get_updates_without_momentum(self, loss_tensor, param_vars):
        """
        Question 4: Returns the gradient descent updates when no momentum is used.

        Args:
            loss_tensor: loss tensor used to compute the gradients.
            param_vars: list of parameter variables.

        Returns:
            A list of tuples, where each tuple is an update of the form
            (param_var, new_param_tensor) indicating that, at runtime, the
            parameter param_var should be updated with new_param_tensor.

        Your implementation should use the gradient tensors (provided below)
        and the member variable self.learning_rate.
        """
        grad_tensors = tf.gradients(loss_tensor, param_vars)
        updates = []
        "*** YOUR CODE HERE ***"
        for i in range(len(param_vars)):
            # Vanilla gradient descent step: param <- param - learning_rate * grad
            param_var = param_vars[i]
            new_param_tensor = param_var - self.learning_rate * grad_tensors[i]
            updates.append((param_var, new_param_tensor))
        return updates
    def get_updates_with_momentum(self, loss_tensor, param_vars):
        """
        Question 5: Returns the gradient descent updates when momentum is used.

        Args:
            loss_tensor: loss tensor used to compute the gradients.
            param_vars: list of parameter variables.

        Returns:
            A list of tuples, where each tuple is an update of the form
            (var, new_tensor) indicating that, at runtime, the variable var
            should be updated with new_tensor.

        Your implementation should use the gradient tensors and the velocity
        variables (both provided below), and the member variables
        self.learning_rate and self.momentum.
        """
        grad_tensors = tf.gradients(loss_tensor, param_vars)
        vel_vars = [tf.Variable(np.zeros(param_var.get_shape(), dtype=np.float32)) for param_var in param_vars]
        tfu.get_session().run([vel_var.initializer for vel_var in vel_vars])
        updates = []
        "*** YOUR CODE HERE ***"
        for i in range(len(param_vars)):
            # Velocity update: vel <- momentum * vel - learning_rate * grad
            new_vel = self.momentum * vel_vars[i] - self.learning_rate * grad_tensors[i]
            updates.append((vel_vars[i], new_vel))
            # Parameter update: param <- param + vel
            new_tensor = param_vars[i] + new_vel
            updates.append((param_vars[i], new_tensor))
        return updates
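    # Worked one-dimensional sanity check for the two update rules above (sketch;
    # the numbers are illustrative only): with learning_rate=0.5, momentum=0.9,
    # param=1.0 and grad=0.2,
    #   without momentum: new_param = 1.0 - 0.5 * 0.2       = 0.9
    #   with momentum:    new_vel   = 0.9 * 0.0 - 0.5 * 0.2 = -0.1   (vel starts at 0)
    #                     new_param = 1.0 + (-0.1)          = 0.9
    # The first steps coincide; later steps differ because the velocity accumulates.
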
    def get_loss_tensor(self, prediction_tensor, target_ph, param_vars):
        # Total objective: data loss plus L2 weight decay, where tf.nn.l2_loss(w)
        # computes sum(w ** 2) / 2 for each parameter variable.
        loss_tensor = self.loss_function(prediction_tensor, target_ph)
        loss_tensor += self.weight_decay * sum(tf.nn.l2_loss(param_var) for param_var in param_vars)
        return loss_tensor
    def get_updates(self, loss_tensor, param_vars):
        """
        Returns the gradient descent updates.

        Args:
            loss_tensor: loss tensor used to compute the gradients.
            param_vars: list of parameter variables.

        Returns:
            A list of tuples, where each tuple is an update of the form
            (var, new_tensor) indicating that, at runtime, the variable var
            should be updated with new_tensor.
        """
        if self.momentum == 0:
            return self.get_updates_without_momentum(loss_tensor, param_vars)
        else:
            return self.get_updates_with_momentum(loss_tensor, param_vars)
    def solve(self, input_train_data, target_train_data, input_val_data, target_val_data, model, callback=None):
        """
        Question 6.a: Optimize the model and return the intermediate losses.

        Optimize the model using gradient descent by running the variable
        updates for self.iterations iterations.

        Args:
            input_train_data: a numpy.array with shape (N, R)
            target_train_data: a numpy.array with shape (N, S)
            input_val_data: a numpy.array with shape (M, R)
            target_val_data: a numpy.array with shape (M, S)
            model: the model whose parameters are optimized

        Returns:
            A tuple of lists, where the first list contains the training loss of
            each iteration and the second list contains the validation loss of
            each iteration.

        N and M are the numbers of training and validation points, respectively,
        and R and S are the dimensions for each input and target data point,
        respectively.

        You may not need to fill in both "*** YOUR CODE HERE ***" blanks,
        but they are both provided so you can define variables outside and
        inside the for loop.

        Useful method:
            session.run
        """
        session = tfu.get_session()
        target_ph = tf.placeholder(tf.float32, shape=(None,) + target_train_data.shape[1:])
        placeholders = [model.input_ph, target_ph]
        train_data = [input_train_data, target_train_data]
        val_data = [input_val_data, target_val_data]
        # You may want to initialize some variables that are shared across iterations
        "*** YOUR CODE HERE ***"
        loss_tensor = self.get_loss_tensor(model.prediction_tensor, target_ph, model.get_param_vars(regularizable=True))
        updates = self.get_updates(loss_tensor, model.get_param_vars(trainable=True))
        update_ops = [tf.assign(old_var, new_var_or_tensor) for (old_var, new_var_or_tensor) in updates]
        train_losses = []
        val_losses = []
        for iter_ in range(self.iterations):
            "*** YOUR CODE HERE ***"
            # train_loss should be the loss of this iteration using all of the training data
            # val_loss should be the loss of this iteration using all of the validation data
            train_loss = session.run(loss_tensor, feed_dict={model.input_ph: input_train_data, target_ph: target_train_data})
            session.run(update_ops, feed_dict={model.input_ph: input_train_data, target_ph: target_train_data})
            val_loss = session.run(loss_tensor, feed_dict={model.input_ph: input_val_data, target_ph: target_val_data})

            train_losses.append(train_loss)
            val_losses.append(val_loss)
            if callback is not None: callback(model)
            self.display_progress(iter_, train_losses, val_losses)
        return train_losses, val_losses
    def display_progress(self, iter_, train_losses, val_losses):
        print("Iteration {} of {}".format(iter_, self.iterations))
        print("    training loss = {:.6f}".format(train_losses[-1]))
        print("    validation loss = {:.6f}".format(val_losses[-1]))
        if self.plot and iter_ % self.plot == 0:
            plotUtil.plotTwoCurves(range(len(train_losses)), train_losses,
                                   range(len(val_losses)), val_losses,
                                   label1='training loss',
                                   label2='validation loss', showLegend=True,
                                   figureIdx=2,
                                   figureTitle="%s: Training and Validation Loss" % self.__class__.__name__)

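# Example usage (illustrative sketch only; `model`, `X_train`, `Y_train`,
# `X_val`, and `Y_val` are hypothetical, and the model is assumed to expose
# input_ph, prediction_tensor, and get_param_vars as used above). Note that
# `plot` doubles as the plotting interval in display_progress, so plot=10
# refreshes the loss curves every 10 iterations:
#
#   solver = GradientDescentSolver(learning_rate=0.1, iterations=500, momentum=0.9, plot=10)
#   train_losses, val_losses = solver.solve(X_train, Y_train, X_val, Y_val, model)
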
class StochasticGradientDescentSolver(GradientDescentSolver):
    def __init__(self, learning_rate, iterations, momentum=0, weight_decay=1e-3, shuffle=None, loss_function=None, plot=0):
        """
        Stochastic gradient descent solver for optimizing a model given some data.

        Args:
            learning_rate: also known as alpha. Used for parameter updates.
            iterations: number of gradient steps (i.e. updates) to perform when
                solving a model.
            momentum: also known as mu. Used for velocity updates.
            weight_decay: coefficient for l2 regularization on the loss.
            shuffle: whether the order of the data points should be randomized
                when iterating over the data.
            loss_function: loss function to use for the objective being optimized.
            plot: whether to show a plot of the loss for every iteration.
        """
        super(StochasticGradientDescentSolver, self).__init__(
            learning_rate, iterations, momentum=momentum, weight_decay=weight_decay, loss_function=loss_function, plot=plot)
        self.shuffle = True if shuffle is None else shuffle
    def solve(self, input_train_data, target_train_data, input_val_data, target_val_data, model, callback=None):
        """
        Question 6.b: Optimize the model and return the intermediate losses.

        Optimize the model using stochastic gradient descent by running the
        variable updates for self.iterations iterations.

        Args:
            input_train_data: a numpy.array with shape (N, R)
            target_train_data: a numpy.array with shape (N, S)
            input_val_data: a numpy.array with shape (M, R)
            target_val_data: a numpy.array with shape (M, S)
            model: the model whose parameters are optimized

        Returns:
            A tuple of lists, where the first list contains the training loss of
            each iteration and the second list contains the validation loss of
            each iteration. The validation loss should be computed using the
            same amount of data as the training loss, but using the validation
            data.

        N and M are the numbers of training and validation points, respectively,
        and R and S are the dimensions for each input and target data point,
        respectively.

        Here, gradient descent is stochastic, meaning that you don't need to
        use all the data at once before you update the model parameters.
        Instead, you update the model parameters as you iterate over the data.
        You must use MinibatchIndefinitelyGenerator to iterate over the data;
        otherwise your solution might differ from the autograder's. You will
        need to instantiate two generators (one for the training data and
        another one for the validation data) and you should do it before the
        for loop. You should read the docstring of
        MinibatchIndefinitelyGenerator in tensorflow_util.py to figure out
        how to use it. Make sure to pass in self.shuffle when you instantiate
        the generator. You will have to choose a proper batch size too.

        Useful member variables and methods:
            self.shuffle
            session.run(...)
            generator.next()
        """
        session = tfu.get_session()
        target_ph = tf.placeholder(tf.float32, shape=(None,) + target_train_data.shape[1:])
        placeholders = [model.input_ph, target_ph]
        train_data = [input_train_data, target_train_data]
        val_data = [input_val_data, target_val_data]
        # You may want to initialize some variables that are shared across iterations
        "*** YOUR CODE HERE ***"
        # Plain SGD uses a batch size of 1: one data point per update.
        train_gen = MinibatchIndefinitelyGenerator(train_data, 1, self.shuffle)
        val_gen = MinibatchIndefinitelyGenerator(val_data, 1, self.shuffle)

        loss_tensor = self.get_loss_tensor(model.prediction_tensor, target_ph, model.get_param_vars(regularizable=True))
        updates = self.get_updates(loss_tensor, model.get_param_vars(trainable=True))
        update_ops = [tf.assign(old_var, new_var_or_tensor) for (old_var, new_var_or_tensor) in updates]
        train_losses = []
        val_losses = []
        for iter_ in range(self.iterations):
            "*** YOUR CODE HERE ***"
            # train_loss should be the loss of this iteration using only the training data that was used for the updates
            # val_loss should be the loss of this iteration using the same amount of data used for the updates, but using the validation data instead
            input_batch, target_batch = train_gen.next()
            train_loss = session.run(loss_tensor, feed_dict={model.input_ph: input_batch, target_ph: target_batch})
            session.run(update_ops, feed_dict={model.input_ph: input_batch, target_ph: target_batch})
            input_val_batch, target_val_batch = val_gen.next()
            val_loss = session.run(loss_tensor, feed_dict={model.input_ph: input_val_batch, target_ph: target_val_batch})

            train_losses.append(train_loss)
            val_losses.append(val_loss)
            if callback is not None: callback(model)
            self.display_progress(iter_, train_losses, val_losses)
        return train_losses, val_losses

class MinibatchStochasticGradientDescentSolver(GradientDescentSolver):
    def __init__(self, learning_rate, iterations, batch_size, momentum=0, weight_decay=1e-3, shuffle=None, loss_function=None, plot=0):
        """
        Minibatch stochastic gradient descent solver for optimizing a model given some data.

        Args:
            learning_rate: also known as alpha. Used for parameter updates.
            iterations: number of gradient steps (i.e. updates) to perform when
                solving a model.
            batch_size: minibatch size to use when iterating the training and
                validation data.
            momentum: also known as mu. Used for velocity updates.
            weight_decay: coefficient for l2 regularization on the loss.
            shuffle: whether the order of the data points should be randomized
                when iterating over the data.
            loss_function: loss function to use for the objective being optimized.
            plot: whether to show a plot of the loss for every iteration.
        """
        super(MinibatchStochasticGradientDescentSolver, self).__init__(
            learning_rate, iterations, momentum=momentum, weight_decay=weight_decay, loss_function=loss_function, plot=plot)
        self.shuffle = True if shuffle is None else shuffle
        if not isinstance(batch_size, int) or batch_size < 0:
            raise ValueError('batch_size should be a non-negative integer, %r given' % batch_size)
        self.batch_size = batch_size
    def solve(self, input_train_data, target_train_data, input_val_data, target_val_data, model, callback=None):
        """
        Question 6.c: Optimize the model and return the intermediate losses.

        Optimize the model using minibatch stochastic gradient descent by
        running the variable updates for self.iterations iterations.

        Args:
            input_train_data: a numpy.array with shape (N, R)
            target_train_data: a numpy.array with shape (N, S)
            input_val_data: a numpy.array with shape (M, R)
            target_val_data: a numpy.array with shape (M, S)
            model: the model whose parameters are optimized

        Returns:
            A tuple of lists, where the first list contains the training loss of
            each iteration and the second list contains the validation loss of
            each iteration. The validation loss should be computed using the
            same amount of data as the training loss, but using the validation
            data.

        N and M are the numbers of training and validation points, respectively,
        and R and S are the dimensions for each input and target data point,
        respectively.

        For minibatch stochastic gradient descent, you will need to iterate
        over the data in minibatches. As before, you must use
        MinibatchIndefinitelyGenerator to iterate over the data. You will
        need to instantiate two generators (one for the training data and
        another one for the validation data) and you should do it before the
        for loop. You should read the docstring of
        MinibatchIndefinitelyGenerator in tensorflow_util.py to figure out
        how to use it. Make sure to pass in self.batch_size and self.shuffle
        when you instantiate the generator.

        Useful member variables and methods:
            self.batch_size
            self.shuffle
            session.run(...)
            generator.next()
        """
        session = tfu.get_session()
        target_ph = tf.placeholder(tf.float32, shape=(None,) + target_train_data.shape[1:])
        placeholders = [model.input_ph, target_ph]
        train_data = [input_train_data, target_train_data]
        val_data = [input_val_data, target_val_data]
        # You may want to initialize some variables that are shared across iterations
        "*** YOUR CODE HERE ***"
        train_gen = MinibatchIndefinitelyGenerator(train_data, self.batch_size, self.shuffle)
        val_gen = MinibatchIndefinitelyGenerator(val_data, self.batch_size, self.shuffle)

        loss_tensor = self.get_loss_tensor(model.prediction_tensor, target_ph, model.get_param_vars(regularizable=True))
        updates = self.get_updates(loss_tensor, model.get_param_vars(trainable=True))
        update_ops = [tf.assign(old_var, new_var_or_tensor) for (old_var, new_var_or_tensor) in updates]
        train_losses = []
        val_losses = []
        for iter_ in range(self.iterations):
            "*** YOUR CODE HERE ***"
            # train_loss should be the loss of this iteration using only the training data that was used for the updates
            # val_loss should be the loss of this iteration using the same amount of data used for the updates, but using the validation data instead
            input_batch, target_batch = train_gen.next()
            train_loss = session.run(loss_tensor, feed_dict={model.input_ph: input_batch, target_ph: target_batch})
            session.run(update_ops, feed_dict={model.input_ph: input_batch, target_ph: target_batch})
            input_val_batch, target_val_batch = val_gen.next()
            val_loss = session.run(loss_tensor, feed_dict={model.input_ph: input_val_batch, target_ph: target_val_batch})

            train_losses.append(train_loss)
            val_losses.append(val_loss)

            if callback is not None: callback(model)
            self.display_progress(iter_, train_losses, val_losses)
        return train_losses, val_losses
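
# Example usage (illustrative sketch only; the data arrays and `model` are
# hypothetical, and the model is assumed to expose input_ph, prediction_tensor,
# and get_param_vars as the solvers above require):
#
#   solver = MinibatchStochasticGradientDescentSolver(
#       learning_rate=0.01, iterations=1000, batch_size=32, momentum=0.9)
#   train_losses, val_losses = solver.solve(
#       input_train_data, target_train_data, input_val_data, target_val_data, model)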