@@ -0,0 +1,28 @@
from .datasets import datasets  # noqa: F401,F403
from .testing import MathTest, MathTestVariable  # noqa: F401,F403

# MODULE 0
from .module import *  # noqa: F401,F403

# MODULE 1
from .scalar import *  # noqa: F401,F403
from .autodiff import *  # noqa: F401,F403

# MODULE 2
from .tensor import *  # noqa: F401,F403
from .tensor_data import *  # noqa: F401,F403
from .tensor_functions import *  # noqa: F401,F403
from .tensor_ops import *  # noqa: F401,F403

# MODULE 3
from .fast_ops import *  # noqa: F401,F403
from .cuda_ops import *  # noqa: F401,F403

# MODULE 4
from .nn import *  # noqa: F401,F403
from .fast_conv import *  # noqa: F401,F403

from .optim import *  # noqa: F401,F403

version = "0.2"
@@ -0,0 +1,346 @@
|
||||
from collections import defaultdict
|
||||
|
||||
variable_count = 1
|
||||
|
||||
|
||||
# ## Module 1
|
||||
|
||||
# Variable is the main class for autodifferentiation logic for scalars
|
||||
# and tensors.
|
||||
|
||||
|
||||
class Variable:
|
||||
"""
|
||||
Attributes:
|
||||
history (:class:`History` or None) : the Function calls that created this variable or None if constant
|
||||
derivative (variable type): the derivative with respect to this variable
|
||||
grad (variable type) : alias for derivative, used for tensors
|
||||
name (string) : a globally unique name of the variable
|
||||
"""
|
||||
|
||||
def __init__(self, history, name=None):
|
||||
global variable_count
|
||||
assert history is None or isinstance(history, History), history
|
||||
|
||||
self.history = history
|
||||
self._derivative = None
|
||||
|
||||
        # This is a bit simplistic, but it makes things easier.
|
||||
variable_count += 1
|
||||
self.unique_id = "Variable" + str(variable_count)
|
||||
|
||||
# For debugging can have a name.
|
||||
if name is not None:
|
||||
self.name = name
|
||||
else:
|
||||
self.name = self.unique_id
|
||||
self.used = 0
|
||||
|
||||
def requires_grad_(self, val):
|
||||
"""
|
||||
Set the requires_grad flag to `val` on variable.
|
||||
|
||||
Ensures that operations on this variable will trigger
|
||||
backpropagation.
|
||||
|
||||
Args:
|
||||
val (bool): whether to require grad
|
||||
"""
|
||||
self.history = History()
|
||||
|
||||
def backward(self, d_output=None):
|
||||
"""
|
||||
Calls autodiff to fill in the derivatives for the history of this object.
|
||||
|
||||
Args:
|
||||
d_output (number, opt): starting derivative to backpropagate through the model
|
||||
(typically left out, and assumed to be 1.0).
|
||||
"""
|
||||
if d_output is None:
|
||||
d_output = 1.0
|
||||
backpropagate(self, d_output)
|
||||
|
||||
@property
|
||||
def derivative(self):
|
||||
return self._derivative
|
||||
|
||||
def is_leaf(self):
|
||||
"True if this variable created by the user (no `last_fn`)"
|
||||
return self.history.last_fn is None
|
||||
|
||||
def accumulate_derivative(self, val):
|
||||
"""
|
||||
        Add `val` to the derivative accumulated on this variable.
|
||||
Should only be called during autodifferentiation on leaf variables.
|
||||
|
||||
Args:
|
||||
val (number): value to be accumulated
|
||||
"""
|
||||
assert self.is_leaf(), "Only leaf variables can have derivatives."
|
||||
if self._derivative is None:
|
||||
self._derivative = self.zeros()
|
||||
self._derivative += val
|
||||
|
||||
def zero_derivative_(self): # pragma: no cover
|
||||
"""
|
||||
Reset the derivative on this variable.
|
||||
"""
|
||||
self._derivative = self.zeros()
|
||||
|
||||
def zero_grad_(self): # pragma: no cover
|
||||
"""
|
||||
Reset the derivative on this variable.
|
||||
"""
|
||||
self.zero_derivative_()
|
||||
|
||||
def expand(self, x):
|
||||
"Placeholder for tensor variables"
|
||||
return x
|
||||
|
||||
# Helper functions for children classes.
|
||||
|
||||
def __radd__(self, b):
|
||||
return self + b
|
||||
|
||||
def __rmul__(self, b):
|
||||
return self * b
|
||||
|
||||
def zeros(self):
|
||||
return 0.0
|
||||
|
||||
|
||||
# Some helper functions for handling optional tuples.
|
||||
|
||||
|
||||
def wrap_tuple(x):
|
||||
"Turn a possible value into a tuple"
|
||||
if isinstance(x, tuple):
|
||||
return x
|
||||
return (x,)
|
||||
|
||||
|
||||
def unwrap_tuple(x):
|
||||
"Turn a singleton tuple into a value"
|
||||
if len(x) == 1:
|
||||
return x[0]
|
||||
return x
|
||||
|
||||
|
||||
# Classes for Functions.
|
||||
|
||||
|
||||
class Context:
|
||||
"""
|
||||
Context class is used by `Function` to store information during the forward pass.
|
||||
|
||||
Attributes:
|
||||
no_grad (bool) : do not save gradient information
|
||||
saved_values (tuple) : tuple of values saved for backward pass
|
||||
saved_tensors (tuple) : alias for saved_values
|
||||
"""
|
||||
|
||||
def __init__(self, no_grad=False):
|
||||
self._saved_values = None
|
||||
self.no_grad = no_grad
|
||||
|
||||
def save_for_backward(self, *values):
|
||||
"""
|
||||
Store the given `values` if they need to be used during backpropagation.
|
||||
|
||||
Args:
|
||||
values (list of values) : values to save for backward
|
||||
"""
|
||||
if self.no_grad:
|
||||
return
|
||||
self._saved_values = values
|
||||
|
||||
@property
|
||||
def saved_values(self):
|
||||
assert not self.no_grad, "Doesn't require grad"
|
||||
assert self._saved_values is not None, "Did you forget to save values?"
|
||||
return unwrap_tuple(self._saved_values)
|
||||
|
||||
@property
|
||||
def saved_tensors(self): # pragma: no cover
|
||||
return self.saved_values
|
||||
|
||||
|
||||
class History:
|
||||
"""
|
||||
`History` stores the history of `Function` operations that was
|
||||
used to construct the current Variable.
|
||||
|
||||
Attributes:
|
||||
last_fn (:class:`FunctionBase`) : The last Function that was called.
|
||||
ctx (:class:`Context`): The context for that Function.
|
||||
inputs (list of inputs) : The inputs that were given when `last_fn.forward` was called.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, last_fn=None, ctx=None, inputs=None):
|
||||
self.last_fn = last_fn
|
||||
self.ctx = ctx
|
||||
self.inputs = inputs
|
||||
|
||||
def backprop_step(self, d_output):
|
||||
"""
|
||||
Run one step of backpropagation by calling chain rule.
|
||||
|
||||
Args:
|
||||
d_output : a derivative with respect to this variable
|
||||
|
||||
Returns:
|
||||
list of numbers : a derivative with respect to `inputs`
|
||||
"""
|
||||
|
||||
return self.last_fn.chain_rule(self.ctx, self.inputs, d_output)
|
||||
|
||||
|
||||
class FunctionBase:
|
||||
"""
|
||||
A function that can act on :class:`Variable` arguments to
|
||||
produce a :class:`Variable` output, while tracking the internal history.
|
||||
|
||||
    Called by :func:`FunctionBase.apply`.
|
||||
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
def variable(raw, history):
|
||||
# Implement by children class.
|
||||
raise NotImplementedError()
|
||||
|
||||
@classmethod
|
||||
def apply(cls, *vals):
|
||||
"""
|
||||
Apply is called by the user to run the Function.
|
||||
Internally it does three things:
|
||||
|
||||
a) Creates a Context for the function call.
|
||||
b) Calls forward to run the function.
|
||||
c) Attaches the Context to the History of the new variable.
|
||||
|
||||
There is a bit of internal complexity in our implementation
|
||||
to handle both scalars and tensors.
|
||||
|
||||
Args:
|
||||
vals (list of Variables or constants) : The arguments to forward
|
||||
|
||||
Returns:
|
||||
`Variable` : The new variable produced
|
||||
|
||||
"""
|
||||
# Go through the variables to see if any needs grad.
|
||||
raw_vals = []
|
||||
need_grad = False
|
||||
for v in vals:
|
||||
if isinstance(v, Variable):
|
||||
if v.history is not None:
|
||||
need_grad = True
|
||||
v.used += 1
|
||||
raw_vals.append(v.get_data())
|
||||
else:
|
||||
raw_vals.append(v)
|
||||
|
||||
# Create the context.
|
||||
ctx = Context(not need_grad)
|
||||
|
||||
# Call forward with the variables.
|
||||
c = cls.forward(ctx, *raw_vals)
|
||||
        assert isinstance(c, cls.data_type), "Expected return type %s, got %s" % (
|
||||
cls.data_type,
|
||||
type(c),
|
||||
)
|
||||
|
||||
# Create a new variable from the result with a new history.
|
||||
back = None
|
||||
if need_grad:
|
||||
back = History(cls, ctx, vals)
|
||||
return cls.variable(cls.data(c), back)
|
||||
|
||||
@classmethod
|
||||
def chain_rule(cls, ctx, inputs, d_output):
|
||||
"""
|
||||
Implement the derivative chain-rule.
|
||||
|
||||
Args:
|
||||
ctx (:class:`Context`) : The context from running forward
|
||||
inputs (list of args) : The args that were passed to :func:`FunctionBase.apply` (e.g. :math:`x, y`)
|
||||
d_output (number) : The `d_output` value in the chain rule.
|
||||
|
||||
Returns:
|
||||
list of (`Variable`, number) : A list of non-constant variables with their derivatives
|
||||
(see `is_constant` to remove unneeded variables)
|
||||
|
||||
"""
|
||||
# Tip: Note when implementing this function that
|
||||
# cls.backward may return either a value or a tuple.
|
||||
|
||||
d = cls.backward(ctx, d_output)
|
||||
for i, input in enumerate(inputs):
|
||||
if not is_constant(input):
|
||||
yield (input, d[i] if isinstance(d, tuple) else d)
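
# Illustrative sketch (not part of this file): how a concrete Function is
# typically built on top of FunctionBase. `ScalarFunction` and the scalar
# wrapper it produces are assumed to live elsewhere in the package; the names
# here are for illustration only.
#
#   class Mul(ScalarFunction):
#       @staticmethod
#       def forward(ctx, a, b):
#           ctx.save_for_backward(a, b)
#           return a * b
#
#       @staticmethod
#       def backward(ctx, d_output):
#           a, b = ctx.saved_values
#           return d_output * b, d_output * a
#
#   # Mul.apply(x, y) creates a Context, runs forward, and attaches a History,
#   # so chain_rule can later route d_output back to x and y.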
|
||||
|
||||
|
||||
# Algorithms for backpropagation
|
||||
|
||||
|
||||
def is_constant(val):
|
||||
return not isinstance(val, Variable) or val.history is None
|
||||
|
||||
|
||||
def topological_sort(variable):
|
||||
"""
|
||||
Computes the topological order of the computation graph.
|
||||
|
||||
Args:
|
||||
variable (:class:`Variable`): The right-most variable
|
||||
|
||||
Returns:
|
||||
list of Variables : Non-constant Variables in topological order
|
||||
starting from the right.
|
||||
"""
|
||||
|
||||
sorted = []
|
||||
visited = set()
|
||||
|
||||
def visit(var):
|
||||
if var.unique_id in visited:
|
||||
return
|
||||
if not var.is_leaf():
|
||||
for input in var.history.inputs:
|
||||
if not is_constant(input):
|
||||
visit(input)
|
||||
visited.add(var.unique_id)
|
||||
sorted.insert(0, var)
|
||||
|
||||
visit(variable)
|
||||
return sorted
|
||||
|
||||
|
||||
def backpropagate(variable, deriv):
|
||||
"""
|
||||
Runs backpropagation on the computation graph in order to
|
||||
    compute derivatives for the leaf nodes.
|
||||
|
||||
See :doc:`backpropagate` for details on the algorithm.
|
||||
|
||||
Args:
|
||||
variable (:class:`Variable`): The right-most variable
|
||||
deriv (number) : Its derivative that we want to propagate backward to the leaves.
|
||||
|
||||
    No return. Should write its results to the derivative values of each leaf through `accumulate_derivative`.
|
||||
"""
|
||||
|
||||
sorted = topological_sort(variable)
|
||||
|
||||
d_dict = defaultdict(float)
|
||||
d_dict[variable.unique_id] = deriv
|
||||
for var in sorted:
|
||||
d = d_dict[var.unique_id]
|
||||
if not var.is_leaf():
|
||||
for v, d_part in var.history.backprop_step(d):
|
||||
d_dict[v.unique_id] += d_part
|
||||
else:
|
||||
var.accumulate_derivative(d)
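
# Illustrative sketch of driving backpropagation (Scalar is assumed to be the
# scalar Variable subclass defined in minitorch.scalar):
#
#   x = Scalar(2.0)
#   y = Scalar(3.0)
#   out = x * y + x
#   out.backward()                # calls backpropagate(out, 1.0)
#   assert x.derivative == 4.0    # d(out)/dx = y + 1
#   assert y.derivative == 2.0    # d(out)/dy = x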
|
||||
|
||||
@@ -0,0 +1,512 @@
|
||||
# Credit (not tested): https://github.com/ethanglaser/minitorch/blob/8bfdac956d/module4/minitorch/cuda_ops.py
|
||||
|
||||
from numba import cuda
|
||||
import numba
|
||||
from .tensor_data import (
|
||||
to_index,
|
||||
index_to_position,
|
||||
TensorData,
|
||||
broadcast_index,
|
||||
shape_broadcast,
|
||||
MAX_DIMS,
|
||||
)
|
||||
|
||||
# This code will CUDA compile fast versions of your tensor_data functions.
|
||||
# If you get an error, read the docs for NUMBA as to what is allowed
|
||||
# in these functions.
|
||||
|
||||
to_index = cuda.jit(device=True)(to_index)
|
||||
index_to_position = cuda.jit(device=True)(index_to_position)
|
||||
broadcast_index = cuda.jit(device=True)(broadcast_index)
|
||||
|
||||
THREADS_PER_BLOCK = 32
|
||||
|
||||
|
||||
def tensor_map(fn):
|
||||
"""
|
||||
CUDA higher-order tensor map function. ::
|
||||
|
||||
fn_map = tensor_map(fn)
|
||||
fn_map(out, ... )
|
||||
|
||||
Args:
|
||||
fn: function mappings floats-to-floats to apply.
|
||||
out (array): storage for out tensor.
|
||||
out_shape (array): shape for out tensor.
|
||||
out_strides (array): strides for out tensor.
|
||||
        out_size (int): size for out tensor.
|
||||
in_storage (array): storage for in tensor.
|
||||
in_shape (array): shape for in tensor.
|
||||
in_strides (array): strides for in tensor.
|
||||
|
||||
Returns:
|
||||
None : Fills in `out`
|
||||
"""
|
||||
|
||||
def _map(out, out_shape, out_strides, out_size, in_storage, in_shape, in_strides):
|
||||
out_i = numba.cuda.blockIdx.x * THREADS_PER_BLOCK + numba.cuda.threadIdx.x
|
||||
if out_i < out.size:
|
||||
out_index = numba.cuda.local.array(MAX_DIMS, numba.int32)
|
||||
in_index = numba.cuda.local.array(MAX_DIMS, numba.int32)
|
||||
to_index(out_i, out_shape, out_index)
|
||||
broadcast_index(out_index, out_shape, in_shape, in_index)
|
||||
in_position = index_to_position(in_index, in_strides)
|
||||
out_position = index_to_position(out_index, out_strides)
|
||||
out[out_position] = fn(in_storage[in_position])
|
||||
|
||||
return cuda.jit()(_map)
|
||||
|
||||
|
||||
def map(fn):
|
||||
# CUDA compile your kernel
|
||||
f = tensor_map(cuda.jit(device=True)(fn))
|
||||
|
||||
def ret(a, out=None):
|
||||
if out is None:
|
||||
out = a.zeros(a.shape)
|
||||
|
||||
# Instantiate and run the cuda kernel.
|
||||
threadsperblock = THREADS_PER_BLOCK
|
||||
blockspergrid = (out.size + THREADS_PER_BLOCK - 1) // THREADS_PER_BLOCK
|
||||
f[blockspergrid, threadsperblock](*out.tuple(), out.size, *a.tuple())
|
||||
return out
|
||||
|
||||
return ret
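
# Illustrative usage sketch (assumes a CUDA device and the rest of the
# package; `operators.neg` and a CUDA-backed tensor `a` are assumed):
#
#   neg_map = CudaOps.map(operators.neg)
#   out = neg_map(a)
#
# Each CUDA thread handles one output position, so the kernel is launched with
# ceil(out.size / THREADS_PER_BLOCK) blocks of THREADS_PER_BLOCK threads.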
|
||||
|
||||
|
||||
def tensor_zip(fn):
|
||||
"""
|
||||
CUDA higher-order tensor zipWith (or map2) function ::
|
||||
|
||||
fn_zip = tensor_zip(fn)
|
||||
fn_zip(out, ...)
|
||||
|
||||
Args:
|
||||
fn: function mappings two floats to float to apply.
|
||||
out (array): storage for `out` tensor.
|
||||
out_shape (array): shape for `out` tensor.
|
||||
out_strides (array): strides for `out` tensor.
|
||||
        out_size (int): size for `out` tensor.
|
||||
a_storage (array): storage for `a` tensor.
|
||||
a_shape (array): shape for `a` tensor.
|
||||
a_strides (array): strides for `a` tensor.
|
||||
b_storage (array): storage for `b` tensor.
|
||||
b_shape (array): shape for `b` tensor.
|
||||
b_strides (array): strides for `b` tensor.
|
||||
|
||||
Returns:
|
||||
None : Fills in `out`
|
||||
"""
|
||||
|
||||
def _zip(
|
||||
out,
|
||||
out_shape,
|
||||
out_strides,
|
||||
out_size,
|
||||
a_storage,
|
||||
a_shape,
|
||||
a_strides,
|
||||
b_storage,
|
||||
b_shape,
|
||||
b_strides,
|
||||
):
|
||||
|
||||
out_i = numba.cuda.blockIdx.x * THREADS_PER_BLOCK + numba.cuda.threadIdx.x
|
||||
if out_i < out.size:
|
||||
out_index = numba.cuda.local.array(MAX_DIMS, numba.int32)
|
||||
a_index = numba.cuda.local.array(MAX_DIMS, numba.int32)
|
||||
b_index = numba.cuda.local.array(MAX_DIMS, numba.int32)
|
||||
|
||||
to_index(out_i, out_shape, out_index)
|
||||
broadcast_index(out_index, out_shape, a_shape, a_index)
|
||||
broadcast_index(out_index, out_shape, b_shape, b_index)
|
||||
a_position, b_position = (
|
||||
index_to_position(a_index, a_strides),
|
||||
index_to_position(b_index, b_strides),
|
||||
)
|
||||
out_position = index_to_position(out_index, out_strides)
|
||||
out[out_position] = fn(a_storage[a_position], b_storage[b_position])
|
||||
|
||||
return cuda.jit()(_zip)
|
||||
|
||||
|
||||
def zip(fn):
|
||||
f = tensor_zip(cuda.jit(device=True)(fn))
|
||||
|
||||
def ret(a, b):
|
||||
c_shape = shape_broadcast(a.shape, b.shape)
|
||||
out = a.zeros(c_shape)
|
||||
threadsperblock = THREADS_PER_BLOCK
|
||||
blockspergrid = (out.size + (threadsperblock - 1)) // threadsperblock
|
||||
f[blockspergrid, threadsperblock](
|
||||
*out.tuple(), out.size, *a.tuple(), *b.tuple()
|
||||
)
|
||||
return out
|
||||
|
||||
return ret
|
||||
|
||||
|
||||
def _sum_practice(out, a, size):
|
||||
"""
|
||||
This is a practice sum kernel to prepare for reduce.
|
||||
|
||||
Given an array of length :math:`n` and out of size :math:`n // blockDIM`
|
||||
it should sum up each blockDim values into an out cell.
|
||||
|
||||
[a_1, a_2, ..., a_100]
|
||||
|
||||
|
|
||||
|
||||
[a_1 +...+ a_32, a_32 + ... + a_64, ... ,]
|
||||
|
||||
Note: Each block must do the sum using shared memory!
|
||||
|
||||
Args:
|
||||
out (array): storage for `out` tensor.
|
||||
a (array): storage for `a` tensor.
|
||||
size (int): length of a.
|
||||
|
||||
"""
|
||||
BLOCK_DIM = 32
|
||||
local_idx = numba.cuda.threadIdx.x
|
||||
block_idx = numba.cuda.blockIdx.x
|
||||
shared_block = numba.cuda.shared.array(BLOCK_DIM, numba.float64)
|
||||
offset = 1
|
||||
|
||||
if block_idx * THREADS_PER_BLOCK + local_idx < size:
|
||||
shared_block[local_idx] = a[block_idx * THREADS_PER_BLOCK + local_idx]
|
||||
else:
|
||||
shared_block[local_idx] = 0
|
||||
|
||||
while offset < BLOCK_DIM:
|
||||
numba.cuda.syncthreads()
|
||||
if local_idx % (offset * 2) == 0:
|
||||
shared_block[local_idx] += shared_block[local_idx + offset]
|
||||
offset *= 2
|
||||
|
||||
    numba.cuda.syncthreads()
    if local_idx == 0:
        out[block_idx] = shared_block[0]
|
||||
|
||||
|
||||
jit_sum_practice = cuda.jit()(_sum_practice)
|
||||
|
||||
|
||||
def sum_practice(a):
|
||||
(size,) = a.shape
|
||||
threadsperblock = THREADS_PER_BLOCK
|
||||
blockspergrid = (size // THREADS_PER_BLOCK) + 1
|
||||
out = TensorData([0.0 for i in range(2)], (2,))
|
||||
out.to_cuda_()
|
||||
jit_sum_practice[blockspergrid, threadsperblock](
|
||||
out.tuple()[0], a._tensor._storage, size
|
||||
)
|
||||
return out
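
# Illustrative sketch (hypothetical sizes): with THREADS_PER_BLOCK = 32 and a
# 1-D CUDA tensor `a` of length 50,
#
#   out = sum_practice(a)
#
# launches 2 blocks; block 0 reduces a[0:32] in shared memory and block 1
# reduces a[32:50] (padding with zeros), so the returned TensorData of shape
# (2,) holds two partial sums whose total equals a.sum().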
|
||||
|
||||
|
||||
def tensor_reduce(fn):
|
||||
"""
|
||||
CUDA higher-order tensor reduce function.
|
||||
|
||||
Args:
|
||||
fn: reduction function maps two floats to float.
|
||||
out (array): storage for `out` tensor.
|
||||
out_shape (array): shape for `out` tensor.
|
||||
out_strides (array): strides for `out` tensor.
|
||||
        out_size (int): size for `out` tensor.
|
||||
a_storage (array): storage for `a` tensor.
|
||||
a_shape (array): shape for `a` tensor.
|
||||
a_strides (array): strides for `a` tensor.
|
||||
reduce_dim (int): dimension to reduce out
|
||||
|
||||
Returns:
|
||||
None : Fills in `out`
|
||||
"""
|
||||
|
||||
def _reduce(
|
||||
out,
|
||||
out_shape,
|
||||
out_strides,
|
||||
out_size,
|
||||
a_storage,
|
||||
a_shape,
|
||||
a_strides,
|
||||
reduce_dim,
|
||||
reduce_value,
|
||||
):
|
||||
BLOCK_DIM = 1024
|
||||
reduce_size = a_shape[reduce_dim]
|
||||
local_idx = numba.cuda.threadIdx.x
|
||||
block_idx = numba.cuda.blockIdx.x
|
||||
shared_block = numba.cuda.shared.array(BLOCK_DIM, numba.float64)
|
||||
offset = 1
|
||||
|
||||
out_index = numba.cuda.local.array(MAX_DIMS, numba.int32)
|
||||
to_index(block_idx, out_shape, out_index)
|
||||
out_position = index_to_position(out_index, out_strides)
|
||||
|
||||
if local_idx < reduce_size:
|
||||
out_index[reduce_dim] = local_idx
|
||||
shared_block[local_idx] = a_storage[index_to_position(out_index, a_strides)]
|
||||
else:
|
||||
shared_block[local_idx] = reduce_value
|
||||
|
||||
while offset < BLOCK_DIM:
|
||||
numba.cuda.syncthreads()
|
||||
if local_idx % (offset * 2) == 0:
|
||||
shared_block[local_idx] = fn(
|
||||
shared_block[local_idx], shared_block[local_idx + offset]
|
||||
)
|
||||
offset *= 2
|
||||
|
||||
numba.cuda.syncthreads()
|
||||
if local_idx == 0:
|
||||
out[out_position] = shared_block[local_idx]
|
||||
|
||||
return cuda.jit()(_reduce)
|
||||
|
||||
|
||||
def reduce(fn, start=0.0):
|
||||
"""
|
||||
Higher-order tensor reduce function. ::
|
||||
|
||||
fn_reduce = reduce(fn)
|
||||
out = fn_reduce(a, dim)
|
||||
|
||||
Simple version ::
|
||||
|
||||
for j:
|
||||
out[1, j] = start
|
||||
for i:
|
||||
out[1, j] = fn(out[1, j], a[i, j])
|
||||
|
||||
|
||||
Args:
|
||||
fn: function from two floats-to-float to apply
|
||||
a (:class:`Tensor`): tensor to reduce over
|
||||
dim (int): int of dim to reduce
|
||||
|
||||
Returns:
|
||||
:class:`Tensor` : new tensor
|
||||
"""
|
||||
f = tensor_reduce(cuda.jit(device=True)(fn))
|
||||
|
||||
def ret(a, dim):
|
||||
out_shape = list(a.shape)
|
||||
out_shape[dim] = (a.shape[dim] - 1) // 1024 + 1
|
||||
out_a = a.zeros(tuple(out_shape))
|
||||
|
||||
threadsperblock = 1024
|
||||
blockspergrid = out_a.size
|
||||
f[blockspergrid, threadsperblock](
|
||||
*out_a.tuple(), out_a.size, *a.tuple(), dim, start
|
||||
)
|
||||
|
||||
return out_a
|
||||
|
||||
return ret
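
# Illustrative usage sketch (names assumed from the surrounding package):
#
#   sum_reduce = CudaOps.reduce(operators.add, 0.0)
#   out = sum_reduce(a, 0)    # reduce dimension 0 of a CUDA-backed tensor
#
# One block of 1024 threads is launched per output cell, so a dimension longer
# than 1024 is only partially reduced by a single call (out_shape[dim] keeps
# one cell per 1024-element chunk).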
|
||||
|
||||
|
||||
def _mm_practice(out, a, b, size):
|
||||
"""
|
||||
This is a practice square MM kernel to prepare for matmul.
|
||||
|
||||
Given a storage `out` and two storage `a` and `b`. Where we know
|
||||
both are shape [size, size] with strides [size, 1].
|
||||
|
||||
Size is always < 32.
|
||||
|
||||
Requirements:
|
||||
|
||||
* All data must be first moved to shared memory.
|
||||
* Only read each cell in `a` and `b` once.
|
||||
* Only write to global memory once per kernel.
|
||||
|
||||
Compute ::
|
||||
|
||||
for i:
|
||||
for j:
|
||||
for k:
|
||||
out[i, j] += a[i, k] * b[k, j]
|
||||
|
||||
Args:
|
||||
out (array): storage for `out` tensor.
|
||||
a (array): storage for `a` tensor.
|
||||
b (array): storage for `a` tensor.
|
||||
size (int): size of the square
|
||||
|
||||
"""
|
||||
BLOCK_DIM = 32
|
||||
shared_a = numba.cuda.shared.array((BLOCK_DIM, BLOCK_DIM), numba.float64)
|
||||
shared_b = numba.cuda.shared.array((BLOCK_DIM, BLOCK_DIM), numba.float64)
|
||||
|
||||
y = numba.cuda.threadIdx.y
|
||||
x = numba.cuda.threadIdx.x
|
||||
if x < size and y < size:
|
||||
shared_a[y, x] = a[y * size + x]
|
||||
shared_b[y, x] = b[y * size + x]
|
||||
else:
|
||||
shared_a[y, x] = 0
|
||||
shared_b[y, x] = 0
|
||||
numba.cuda.syncthreads()
|
||||
|
||||
if y < size and x < size:
|
||||
temp = 0
|
||||
for val in range(size):
|
||||
temp += shared_a[y, val] * shared_b[val, x]
|
||||
out[y * size + x] = temp
|
||||
|
||||
|
||||
jit_mm_practice = cuda.jit()(_mm_practice)
|
||||
|
||||
|
||||
def mm_practice(a, b):
|
||||
|
||||
(size, _) = a.shape
|
||||
threadsperblock = (THREADS_PER_BLOCK, THREADS_PER_BLOCK)
|
||||
blockspergrid = 1
|
||||
out = TensorData([0.0 for i in range(size * size)], (size, size))
|
||||
out.to_cuda_()
|
||||
jit_mm_practice[blockspergrid, threadsperblock](
|
||||
out.tuple()[0], a._tensor._storage, b._tensor._storage, size
|
||||
)
|
||||
return out
|
||||
|
||||
|
||||
@cuda.jit()
|
||||
def tensor_matrix_multiply(
|
||||
out,
|
||||
out_shape,
|
||||
out_strides,
|
||||
out_size,
|
||||
a_storage,
|
||||
a_shape,
|
||||
a_strides,
|
||||
b_storage,
|
||||
b_shape,
|
||||
b_strides,
|
||||
):
|
||||
"""
|
||||
CUDA tensor matrix multiply function.
|
||||
|
||||
Requirements:
|
||||
|
||||
* All data must be first moved to shared memory.
|
||||
* Only read each cell in `a` and `b` once.
|
||||
* Only write to global memory once per kernel.
|
||||
|
||||
Should work for any tensor shapes that broadcast as long as ::
|
||||
|
||||
assert a_shape[-1] == b_shape[-2]
|
||||
|
||||
Args:
|
||||
out (array): storage for `out` tensor
|
||||
out_shape (array): shape for `out` tensor
|
||||
out_strides (array): strides for `out` tensor
|
||||
        out_size (int): size for `out` tensor.
|
||||
a_storage (array): storage for `a` tensor
|
||||
a_shape (array): shape for `a` tensor
|
||||
a_strides (array): strides for `a` tensor
|
||||
b_storage (array): storage for `b` tensor
|
||||
b_shape (array): shape for `b` tensor
|
||||
b_strides (array): strides for `b` tensor
|
||||
|
||||
Returns:
|
||||
None : Fills in `out`
|
||||
"""
|
||||
a_batch_stride = a_strides[0] if a_shape[0] > 1 else 0
|
||||
b_batch_stride = b_strides[0] if b_shape[0] > 1 else 0
|
||||
BLOCK_DIM = 32
|
||||
shared_a = numba.cuda.shared.array((BLOCK_DIM, BLOCK_DIM), numba.float64)
|
||||
shared_b = numba.cuda.shared.array((BLOCK_DIM, BLOCK_DIM), numba.float64)
|
||||
|
||||
y = numba.cuda.threadIdx.y
|
||||
x = numba.cuda.threadIdx.x
|
||||
block_x = numba.cuda.blockIdx.x * BLOCK_DIM
|
||||
block_y = numba.cuda.blockIdx.y * BLOCK_DIM
|
||||
z = numba.cuda.blockIdx.z
|
||||
|
||||
temp = 0
|
||||
for block_index in range((a_shape[-1] + (BLOCK_DIM - 1)) // BLOCK_DIM):
|
||||
block_mid = block_index * BLOCK_DIM
|
||||
if (block_mid + x) < a_shape[-1] and (block_y + y) < a_shape[-2]:
|
||||
shared_a[y, x] = a_storage[
|
||||
z * a_batch_stride
|
||||
+ (block_mid + x) * a_strides[-1]
|
||||
+ (block_y + y) * a_strides[-2]
|
||||
]
|
||||
else:
|
||||
shared_a[y, x] = 0
|
||||
if (block_x + x) < b_shape[-1] and (block_mid + y) < b_shape[-2]:
|
||||
shared_b[y, x] = b_storage[
|
||||
z * b_batch_stride
|
||||
+ (block_x + x) * b_strides[-1]
|
||||
+ (block_mid + y) * b_strides[-2]
|
||||
]
|
||||
else:
|
||||
shared_b[y, x] = 0
|
||||
numba.cuda.syncthreads()
|
||||
|
||||
        for val in range(BLOCK_DIM):
            temp += shared_a[y, val] * shared_b[val, x]
        # Make sure all threads finish with this tile before the next loop
        # iteration overwrites shared_a and shared_b.
        numba.cuda.syncthreads()
|
||||
|
||||
if (block_y + y) < out_shape[-2] and (block_x + x) < out_shape[-1]:
|
||||
out[
|
||||
z * out_strides[0]
|
||||
+ (block_y + y) * out_strides[-2]
|
||||
+ (block_x + x) * out_strides[-1]
|
||||
] = temp
|
||||
|
||||
|
||||
def matrix_multiply(a, b):
|
||||
"""
|
||||
Tensor matrix multiply
|
||||
Should work for any tensor shapes that broadcast in the first n-2 dims and
|
||||
have ::
|
||||
assert a.shape[-1] == b.shape[-2]
|
||||
|
||||
Args:
|
||||
a (:class:`Tensor`): tensor a
|
||||
b (:class:`Tensor`): tensor b
|
||||
|
||||
Returns:
|
||||
:class:`Tensor` : new tensor
|
||||
"""
|
||||
|
||||
# Make these always be a 3 dimensional multiply
|
||||
both_2d = 0
|
||||
if len(a.shape) == 2:
|
||||
a = a.contiguous().view(1, a.shape[0], a.shape[1])
|
||||
both_2d += 1
|
||||
if len(b.shape) == 2:
|
||||
b = b.contiguous().view(1, b.shape[0], b.shape[1])
|
||||
both_2d += 1
|
||||
both_2d = both_2d == 2
|
||||
|
||||
ls = list(shape_broadcast(a.shape[:-2], b.shape[:-2]))
|
||||
ls.append(a.shape[-2])
|
||||
ls.append(b.shape[-1])
|
||||
assert a.shape[-1] == b.shape[-2]
|
||||
out = a.zeros(tuple(ls))
|
||||
|
||||
# One block per batch, extra rows, extra col
|
||||
blockspergrid = (
|
||||
(out.shape[1] + (THREADS_PER_BLOCK - 1)) // THREADS_PER_BLOCK,
|
||||
(out.shape[2] + (THREADS_PER_BLOCK - 1)) // THREADS_PER_BLOCK,
|
||||
out.shape[0],
|
||||
)
|
||||
threadsperblock = (THREADS_PER_BLOCK, THREADS_PER_BLOCK, 1)
|
||||
|
||||
tensor_matrix_multiply[blockspergrid, threadsperblock](
|
||||
*out.tuple(), out.size, *a.tuple(), *b.tuple()
|
||||
)
|
||||
|
||||
# Undo 3d if we added it.
|
||||
if both_2d:
|
||||
out = out.view(out.shape[1], out.shape[2])
|
||||
return out
|
||||
|
||||
|
||||
class CudaOps:
|
||||
map = map
|
||||
zip = zip
|
||||
reduce = reduce
|
||||
matrix_multiply = matrix_multiply
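
# Illustrative launch-geometry sketch for matrix_multiply above: with
# THREADS_PER_BLOCK = 32 and an output of shape (batch, m, n) = (4, 100, 80),
# blockspergrid = (ceil(100 / 32), ceil(80 / 32), 4) = (4, 3, 4) and
# threadsperblock = (32, 32, 1), so each thread block computes one 32 x 32
# tile of one batch member's output matrix.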
|
||||
@@ -0,0 +1,94 @@
|
||||
from dataclasses import dataclass
import random
import math


def make_pts(N):
    X = []
    for i in range(N):
        x_1 = random.random()
        x_2 = random.random()
        X.append((x_1, x_2))
    return X


@dataclass
class Graph:
    N: int
    X: list
    y: list


def simple(N):
    X = make_pts(N)
    y = []
    for x_1, x_2 in X:
        y1 = 1 if x_1 < 0.5 else 0
        y.append(y1)
    return Graph(N, X, y)


def diag(N):
    X = make_pts(N)
    y = []
    for x_1, x_2 in X:
        y1 = 1 if x_1 + x_2 < 0.5 else 0
        y.append(y1)
    return Graph(N, X, y)


def split(N):
    X = make_pts(N)
    y = []
    for x_1, x_2 in X:
        y1 = 1 if x_1 < 0.2 or x_1 > 0.8 else 0
        y.append(y1)
    return Graph(N, X, y)


def xor(N):
    X = make_pts(N)
    y = []
    for x_1, x_2 in X:
        y1 = 1 if ((x_1 < 0.5 and x_2 > 0.5) or (x_1 > 0.5 and x_2 < 0.5)) else 0
        y.append(y1)
    return Graph(N, X, y)


def circle(N):
    X = make_pts(N)
    y = []
    for x_1, x_2 in X:
        x1, x2 = (x_1 - 0.5, x_2 - 0.5)
        y1 = 1 if x1 * x1 + x2 * x2 > 0.1 else 0
        y.append(y1)
    return Graph(N, X, y)


def spiral(N):
    def x(t):
        return t * math.cos(t) / 20.0

    def y(t):
        return t * math.sin(t) / 20.0

    X = [
        (x(10.0 * (float(i) / (N // 2))) + 0.5, y(10.0 * (float(i) / (N // 2))) + 0.5)
        for i in range(5 + 0, 5 + N // 2)
    ]
    X = X + [
        (y(-10.0 * (float(i) / (N // 2))) + 0.5, x(-10.0 * (float(i) / (N // 2))) + 0.5)
        for i in range(5 + 0, 5 + N // 2)
    ]
    y = [0] * (N // 2) + [1] * (N // 2)
    return Graph(N, X, y)


datasets = {
    "Simple": simple,
    "Diag": diag,
    "Split": split,
    "Xor": xor,
    "Circle": circle,
    "Spiral": spiral,
}
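
# Illustrative usage sketch:
#
#   graph = datasets["Xor"](50)
#   # graph.X: 50 points (x_1, x_2) drawn uniformly from [0, 1)^2
#   # graph.y: the matching 0/1 labels for training the example classifiers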
|
||||
@@ -0,0 +1,333 @@
|
||||
import numpy as np
|
||||
from .tensor_data import (
|
||||
to_index,
|
||||
index_to_position,
|
||||
broadcast_index,
|
||||
MAX_DIMS,
|
||||
)
|
||||
from .tensor_functions import Function
|
||||
from numba import njit, prange
|
||||
|
||||
|
||||
# This code will JIT compile fast versions of your tensor_data functions.
|
||||
# If you get an error, read the docs for NUMBA as to what is allowed
|
||||
# in these functions.
|
||||
to_index = njit(inline="always")(to_index)
|
||||
index_to_position = njit(inline="always")(index_to_position)
|
||||
broadcast_index = njit(inline="always")(broadcast_index)
|
||||
|
||||
|
||||
@njit(parallel=True)
|
||||
def tensor_conv1d(
|
||||
out,
|
||||
out_shape,
|
||||
out_strides,
|
||||
out_size,
|
||||
input,
|
||||
input_shape,
|
||||
input_strides,
|
||||
weight,
|
||||
weight_shape,
|
||||
weight_strides,
|
||||
reverse,
|
||||
):
|
||||
"""
|
||||
1D Convolution implementation.
|
||||
|
||||
Given input tensor of
|
||||
|
||||
`batch, in_channels, width`
|
||||
|
||||
and weight tensor
|
||||
|
||||
`out_channels, in_channels, k_width`
|
||||
|
||||
Computes padded output of
|
||||
|
||||
`batch, out_channels, width`
|
||||
|
||||
`reverse` decides if weight is anchored left (False) or right.
|
||||
(See diagrams)
|
||||
|
||||
Args:
|
||||
out (array): storage for `out` tensor.
|
||||
out_shape (array): shape for `out` tensor.
|
||||
out_strides (array): strides for `out` tensor.
|
||||
out_size (int): size of the `out` tensor.
|
||||
input (array): storage for `input` tensor.
|
||||
input_shape (array): shape for `input` tensor.
|
||||
input_strides (array): strides for `input` tensor.
|
||||
        weight (array): storage for `weight` tensor.
        weight_shape (array): shape for `weight` tensor.
        weight_strides (array): strides for `weight` tensor.
|
||||
reverse (bool): anchor weight at left or right
|
||||
"""
|
||||
batch_, out_channels, out_width = out_shape
|
||||
batch, in_channels, width = input_shape
|
||||
out_channels_, in_channels_, kw = weight_shape
|
||||
|
||||
assert (
|
||||
batch == batch_
|
||||
and in_channels == in_channels_
|
||||
and out_channels == out_channels_
|
||||
)
|
||||
s1 = input_strides
|
||||
s2 = weight_strides
|
||||
for out_i in prange(out_size):
|
||||
out_index = np.zeros(3, dtype=np.int32)
|
||||
to_index(out_i, out_shape, out_index)
|
||||
current_batch, current_out_channels, current_width = out_index
|
||||
val = 0
|
||||
for current_in_channels in prange(in_channels):
|
||||
for current_kw in range(kw):
|
||||
i = current_kw
|
||||
if reverse:
|
||||
i = kw - current_kw - 1
|
||||
w = weight[
|
||||
s2[0] * current_out_channels
|
||||
+ s2[1] * current_in_channels
|
||||
+ s2[2] * i
|
||||
]
|
||||
inc = 0
|
||||
if reverse:
|
||||
if current_width - i >= 0:
|
||||
inc = input[
|
||||
current_batch * s1[0]
|
||||
+ current_in_channels * s1[1]
|
||||
+ (current_width - i) * s1[2]
|
||||
]
|
||||
else:
|
||||
if i + current_width < width:
|
||||
inc = input[
|
||||
current_batch * s1[0]
|
||||
+ current_in_channels * s1[1]
|
||||
+ (i + current_width) * s1[2]
|
||||
]
|
||||
val += w * inc
|
||||
out[
|
||||
current_batch * out_strides[0]
|
||||
+ current_out_channels * out_strides[1]
|
||||
+ current_width * out_strides[2]
|
||||
] = val
|
||||
|
||||
|
||||
class Conv1dFun(Function):
|
||||
@staticmethod
|
||||
def forward(ctx, input, weight):
|
||||
"""
|
||||
Compute a 1D Convolution
|
||||
|
||||
Args:
|
||||
ctx : Context
|
||||
            input (:class:`Tensor`) : batch x in_channel x w
            weight (:class:`Tensor`) : out_channel x in_channel x kw

        Returns:
            (:class:`Tensor`) : batch x out_channel x w
|
||||
"""
|
||||
ctx.save_for_backward(input, weight)
|
||||
batch, in_channels, w = input.shape
|
||||
out_channels, in_channels2, kw = weight.shape
|
||||
assert in_channels == in_channels2
|
||||
|
||||
# Run convolution
|
||||
output = input.zeros((batch, out_channels, w))
|
||||
tensor_conv1d(
|
||||
*output.tuple(), output.size, *input.tuple(), *weight.tuple(), False
|
||||
)
|
||||
return output
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, grad_output):
|
||||
input, weight = ctx.saved_values
|
||||
batch, in_channels, w = input.shape
|
||||
out_channels, in_channels, kw = weight.shape
|
||||
grad_weight = grad_output.zeros((in_channels, out_channels, kw))
|
||||
new_input = input.permute(1, 0, 2)
|
||||
new_grad_output = grad_output.permute(1, 0, 2)
|
||||
tensor_conv1d(
|
||||
*grad_weight.tuple(),
|
||||
grad_weight.size,
|
||||
*new_input.tuple(),
|
||||
*new_grad_output.tuple(),
|
||||
False,
|
||||
)
|
||||
grad_weight = grad_weight.permute(1, 0, 2)
|
||||
|
||||
grad_input = input.zeros((batch, in_channels, w))
|
||||
new_weight = weight.permute(1, 0, 2)
|
||||
tensor_conv1d(
|
||||
*grad_input.tuple(),
|
||||
grad_input.size,
|
||||
*grad_output.tuple(),
|
||||
*new_weight.tuple(),
|
||||
True,
|
||||
)
|
||||
return grad_input, grad_weight
|
||||
|
||||
|
||||
conv1d = Conv1dFun.apply
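
# Illustrative usage sketch (the rand constructor is assumed from the
# surrounding package; shapes are hypothetical):
#
#   x = minitorch.rand((batch, in_channels, width))
#   w = minitorch.rand((out_channels, in_channels, k_width))
#   out = conv1d(x, w)    # shape: batch x out_channels x width
#
# conv2d below follows the same pattern with h x w inputs and kh x kw kernels.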
|
||||
|
||||
|
||||
@njit(parallel=True, fastmath=True)
|
||||
def tensor_conv2d(
|
||||
out,
|
||||
out_shape,
|
||||
out_strides,
|
||||
out_size,
|
||||
input,
|
||||
input_shape,
|
||||
input_strides,
|
||||
weight,
|
||||
weight_shape,
|
||||
weight_strides,
|
||||
reverse,
|
||||
):
|
||||
"""
|
||||
2D Convolution implementation.
|
||||
|
||||
Given input tensor of
|
||||
|
||||
`batch, in_channels, height, width`
|
||||
|
||||
and weight tensor
|
||||
|
||||
`out_channels, in_channels, k_height, k_width`
|
||||
|
||||
Computes padded output of
|
||||
|
||||
`batch, out_channels, height, width`
|
||||
|
||||
`Reverse` decides if weight is anchored top-left (False) or bottom-right.
|
||||
(See diagrams)
|
||||
|
||||
|
||||
Args:
|
||||
out (array): storage for `out` tensor.
|
||||
out_shape (array): shape for `out` tensor.
|
||||
out_strides (array): strides for `out` tensor.
|
||||
out_size (int): size of the `out` tensor.
|
||||
input (array): storage for `input` tensor.
|
||||
input_shape (array): shape for `input` tensor.
|
||||
input_strides (array): strides for `input` tensor.
|
||||
        weight (array): storage for `weight` tensor.
        weight_shape (array): shape for `weight` tensor.
        weight_strides (array): strides for `weight` tensor.
|
||||
reverse (bool): anchor weight at top-left or bottom-right
|
||||
"""
|
||||
batch_, out_channels, _, _ = out_shape
|
||||
batch, in_channels, height, width = input_shape
|
||||
out_channels_, in_channels_, kh, kw = weight_shape
|
||||
|
||||
assert (
|
||||
batch == batch_
|
||||
and in_channels == in_channels_
|
||||
and out_channels == out_channels_
|
||||
)
|
||||
|
||||
s1 = input_strides
|
||||
s2 = weight_strides
|
||||
# inners
|
||||
s10, s11, s12, s13 = s1[0], s1[1], s1[2], s1[3]
|
||||
s20, s21, s22, s23 = s2[0], s2[1], s2[2], s2[3]
|
||||
|
||||
for out_i in prange(out_size):
|
||||
out_index = np.zeros(4, dtype=np.int32)
|
||||
to_index(out_i, out_shape, out_index)
|
||||
current_batch, current_out_channels, current_height, current_width = out_index
|
||||
val = 0
|
||||
for current_in_channels in prange(in_channels):
|
||||
for current_kh in range(kh):
|
||||
for current_kw in range(kw):
|
||||
i = current_kh
|
||||
j = current_kw
|
||||
if reverse:
|
||||
j = kw - current_kw - 1
|
||||
i = kh - current_kh - 1
|
||||
w = weight[
|
||||
s20 * current_out_channels
|
||||
+ s21 * current_in_channels
|
||||
+ s22 * i
|
||||
+ s23 * j
|
||||
]
|
||||
inc = 0
|
||||
if reverse:
|
||||
if current_height - i >= 0 and current_width - j >= 0:
|
||||
inc = input[
|
||||
current_batch * s10
|
||||
+ current_in_channels * s11
|
||||
+ (current_height - i) * s12
|
||||
+ (current_width - j) * s13
|
||||
]
|
||||
else:
|
||||
if i + current_height < height and j + current_width < width:
|
||||
inc = input[
|
||||
current_batch * s10
|
||||
+ current_in_channels * s11
|
||||
+ (i + current_height) * s12
|
||||
+ (j + current_width) * s13
|
||||
]
|
||||
val += w * inc
|
||||
out[
|
||||
current_batch * out_strides[0]
|
||||
+ current_out_channels * out_strides[1]
|
||||
+ current_height * out_strides[2]
|
||||
+ current_width * out_strides[3]
|
||||
] = val
|
||||
|
||||
|
||||
class Conv2dFun(Function):
|
||||
@staticmethod
|
||||
def forward(ctx, input, weight):
|
||||
"""
|
||||
Compute a 2D Convolution
|
||||
|
||||
Args:
|
||||
ctx : Context
|
||||
input (:class:`Tensor`) : batch x in_channel x h x w
|
||||
weight (:class:`Tensor`) : out_channel x in_channel x kh x kw
|
||||
|
||||
Returns:
|
||||
(:class:`Tensor`) : batch x out_channel x h x w
|
||||
"""
|
||||
ctx.save_for_backward(input, weight)
|
||||
batch, in_channels, h, w = input.shape
|
||||
out_channels, in_channels2, kh, kw = weight.shape
|
||||
assert in_channels == in_channels2
|
||||
output = input.zeros((batch, out_channels, h, w))
|
||||
tensor_conv2d(
|
||||
*output.tuple(), output.size, *input.tuple(), *weight.tuple(), False
|
||||
)
|
||||
return output
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, grad_output):
|
||||
input, weight = ctx.saved_values
|
||||
batch, in_channels, h, w = input.shape
|
||||
out_channels, in_channels, kh, kw = weight.shape
|
||||
|
||||
grad_weight = grad_output.zeros((in_channels, out_channels, kh, kw))
|
||||
new_input = input.permute(1, 0, 2, 3)
|
||||
new_grad_output = grad_output.permute(1, 0, 2, 3)
|
||||
tensor_conv2d(
|
||||
*grad_weight.tuple(),
|
||||
grad_weight.size,
|
||||
*new_input.tuple(),
|
||||
*new_grad_output.tuple(),
|
||||
False,
|
||||
)
|
||||
grad_weight = grad_weight.permute(1, 0, 2, 3)
|
||||
|
||||
grad_input = input.zeros((batch, in_channels, h, w))
|
||||
new_weight = weight.permute(1, 0, 2, 3)
|
||||
tensor_conv2d(
|
||||
*grad_input.tuple(),
|
||||
grad_input.size,
|
||||
*grad_output.tuple(),
|
||||
*new_weight.tuple(),
|
||||
True,
|
||||
)
|
||||
return grad_input, grad_weight
|
||||
|
||||
|
||||
conv2d = Conv2dFun.apply
|
||||
@@ -0,0 +1,364 @@
|
||||
import numpy as np
|
||||
from numba import njit, prange
|
||||
|
||||
from .tensor_data import (MAX_DIMS, broadcast_index, index_to_position,
|
||||
shape_broadcast, to_index)
|
||||
|
||||
# TIP: Use `NUMBA_DISABLE_JIT=1 pytest tests/ -m task3_1` to run these tests without JIT.
|
||||
|
||||
# This code will JIT compile fast versions of your tensor_data functions.
|
||||
# If you get an error, read the docs for NUMBA as to what is allowed
|
||||
# in these functions.
|
||||
to_index = njit(inline="always")(to_index)
|
||||
index_to_position = njit(inline="always")(index_to_position)
|
||||
broadcast_index = njit(inline="always")(broadcast_index)
|
||||
|
||||
|
||||
def tensor_map(fn):
|
||||
"""
|
||||
NUMBA low_level tensor_map function. See `tensor_ops.py` for description.
|
||||
|
||||
Optimizations:
|
||||
|
||||
* Main loop in parallel
|
||||
* All indices use numpy buffers
|
||||
* When `out` and `in` are stride-aligned, avoid indexing
|
||||
|
||||
Args:
|
||||
fn: function mappings floats-to-floats to apply.
|
||||
out (array): storage for out tensor.
|
||||
out_shape (array): shape for out tensor.
|
||||
out_strides (array): strides for out tensor.
|
||||
in_storage (array): storage for in tensor.
|
||||
in_shape (array): shape for in tensor.
|
||||
in_strides (array): strides for in tensor.
|
||||
|
||||
Returns:
|
||||
None : Fills in `out`
|
||||
"""
|
||||
|
||||
def _map(out, out_shape, out_strides, in_storage, in_shape, in_strides):
|
||||
        for i in prange(len(out)):
            # Allocate index buffers per iteration so parallel iterations do
            # not race on shared state.
            in_index = np.zeros(len(in_shape), dtype=int)
            out_index = np.zeros(len(out_shape), dtype=int)
            to_index(i, out_shape, out_index)
|
||||
broadcast_index(out_index, out_shape, in_shape, in_index)
|
||||
|
||||
x = in_storage[index_to_position(in_index, in_strides)]
|
||||
index = index_to_position(out_index, out_strides)
|
||||
out[index] = fn(x)
|
||||
|
||||
return njit(parallel=True)(_map)
|
||||
|
||||
|
||||
def map(fn):
|
||||
"""
|
||||
Higher-order tensor map function ::
|
||||
|
||||
fn_map = map(fn)
|
||||
fn_map(a, out)
|
||||
out
|
||||
|
||||
Args:
|
||||
fn: function from float-to-float to apply.
|
||||
a (:class:`Tensor`): tensor to map over
|
||||
out (:class:`Tensor`): optional, tensor data to fill in,
|
||||
should broadcast with `a`
|
||||
|
||||
Returns:
|
||||
:class:`Tensor` : new tensor
|
||||
"""
|
||||
|
||||
# This line JIT compiles your tensor_map
|
||||
f = tensor_map(njit()(fn))
|
||||
|
||||
def ret(a, out=None):
|
||||
if out is None:
|
||||
out = a.zeros(a.shape)
|
||||
f(*out.tuple(), *a.tuple())
|
||||
return out
|
||||
|
||||
return ret
|
||||
|
||||
|
||||
def tensor_zip(fn):
|
||||
"""
|
||||
NUMBA higher-order tensor zip function. See `tensor_ops.py` for description.
|
||||
|
||||
|
||||
Optimizations:
|
||||
|
||||
* Main loop in parallel
|
||||
* All indices use numpy buffers
|
||||
* When `out`, `a`, `b` are stride-aligned, avoid indexing
|
||||
|
||||
Args:
|
||||
fn: function maps two floats to float to apply.
|
||||
out (array): storage for `out` tensor.
|
||||
out_shape (array): shape for `out` tensor.
|
||||
out_strides (array): strides for `out` tensor.
|
||||
a_storage (array): storage for `a` tensor.
|
||||
a_shape (array): shape for `a` tensor.
|
||||
a_strides (array): strides for `a` tensor.
|
||||
b_storage (array): storage for `b` tensor.
|
||||
b_shape (array): shape for `b` tensor.
|
||||
b_strides (array): strides for `b` tensor.
|
||||
|
||||
Returns:
|
||||
None : Fills in `out`
|
||||
"""
|
||||
|
||||
def _zip(
|
||||
out,
|
||||
out_shape,
|
||||
out_strides,
|
||||
a_storage,
|
||||
a_shape,
|
||||
a_strides,
|
||||
b_storage,
|
||||
b_shape,
|
||||
b_strides,
|
||||
):
|
||||
        for i in prange(len(out)):
            # Per-iteration index buffers to avoid races across parallel iterations.
            out_index = np.zeros(len(out_shape), dtype=int)
            a_in = np.zeros(len(a_shape), dtype=int)
            b_in = np.zeros(len(b_shape), dtype=int)
            to_index(i, out_shape, out_index)
|
||||
index = index_to_position(out_index, out_strides)
|
||||
|
||||
broadcast_index(out_index, out_shape, a_shape, a_in)
|
||||
a = a_storage[index_to_position(a_in, a_strides)]
|
||||
|
||||
broadcast_index(out_index, out_shape, b_shape, b_in)
|
||||
b = b_storage[index_to_position(b_in, b_strides)]
|
||||
|
||||
out[index] = fn(a, b)
|
||||
|
||||
return njit(parallel=True)(_zip)
|
||||
|
||||
|
||||
def zip(fn):
|
||||
"""
|
||||
Higher-order tensor zip function.
|
||||
|
||||
fn_zip = zip(fn)
|
||||
c = fn_zip(a, b)
|
||||
|
||||
Args:
|
||||
fn: function from two floats-to-float to apply
|
||||
a (:class:`Tensor`): tensor to zip over
|
||||
b (:class:`Tensor`): tensor to zip over
|
||||
|
||||
Returns:
|
||||
:class:`Tensor` : new tensor data
|
||||
"""
|
||||
f = tensor_zip(njit()(fn))
|
||||
|
||||
def ret(a, b):
|
||||
c_shape = shape_broadcast(a.shape, b.shape)
|
||||
out = a.zeros(c_shape)
|
||||
f(*out.tuple(), *a.tuple(), *b.tuple())
|
||||
return out
|
||||
|
||||
return ret
|
||||
|
||||
|
||||
def tensor_reduce(fn):
|
||||
"""
|
||||
NUMBA higher-order tensor reduce function. See `tensor_ops.py` for description.
|
||||
|
||||
Optimizations:
|
||||
|
||||
* Main loop in parallel
|
||||
* All indices use numpy buffers
|
||||
* Inner-loop should not call any functions or write non-local variables
|
||||
|
||||
Args:
|
||||
fn: reduction function mapping two floats to float.
|
||||
out (array): storage for `out` tensor.
|
||||
out_shape (array): shape for `out` tensor.
|
||||
out_strides (array): strides for `out` tensor.
|
||||
a_storage (array): storage for `a` tensor.
|
||||
a_shape (array): shape for `a` tensor.
|
||||
a_strides (array): strides for `a` tensor.
|
||||
reduce_dim (int): dimension to reduce out
|
||||
|
||||
Returns:
|
||||
None : Fills in `out`
|
||||
|
||||
"""
|
||||
|
||||
def _reduce(out, out_shape, out_strides, a_storage, a_shape, a_strides, reduce_dim):
|
||||
        for i in prange(len(out)):
            # Per-iteration index buffer to avoid races across parallel iterations.
            out_index = np.zeros(len(a_shape), dtype=int)
            to_index(i, out_shape, out_index)
|
||||
index = index_to_position(out_index, out_strides)
|
||||
|
||||
for j in range(a_shape[reduce_dim]):
|
||||
a_index = out_index.copy()
|
||||
a_index[reduce_dim] = j
|
||||
|
||||
out[index] = fn(
|
||||
a_storage[index_to_position(a_index, a_strides)], out[index]
|
||||
)
|
||||
|
||||
return njit(parallel=True)(_reduce)
|
||||
|
||||
|
||||
def reduce(fn, start=0.0):
|
||||
"""
|
||||
Higher-order tensor reduce function. ::
|
||||
|
||||
fn_reduce = reduce(fn)
|
||||
out = fn_reduce(a, dim)
|
||||
|
||||
|
||||
Args:
|
||||
fn: function from two floats-to-float to apply
|
||||
a (:class:`Tensor`): tensor to reduce over
|
||||
dim (int): int of dim to reduce
|
||||
|
||||
Returns:
|
||||
:class:`Tensor` : new tensor
|
||||
"""
|
||||
|
||||
f = tensor_reduce(njit()(fn))
|
||||
|
||||
def ret(a, dim):
|
||||
out_shape = list(a.shape)
|
||||
out_shape[dim] = 1
|
||||
|
||||
# Other values when not sum.
|
||||
out = a.zeros(tuple(out_shape))
|
||||
out._tensor._storage[:] = start
|
||||
|
||||
f(*out.tuple(), *a.tuple(), dim)
|
||||
return out
|
||||
|
||||
return ret
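
# Illustrative usage sketch (operators.add is assumed from the surrounding
# package):
#
#   add_reduce = FastOps.reduce(operators.add, 0.0)
#   out = add_reduce(a, 1)    # sums over dim 1; out.shape[1] == 1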
|
||||
|
||||
|
||||
@njit(parallel=True, fastmath=True)
|
||||
def tensor_matrix_multiply(
|
||||
out,
|
||||
out_shape,
|
||||
out_strides,
|
||||
a_storage,
|
||||
a_shape,
|
||||
a_strides,
|
||||
b_storage,
|
||||
b_shape,
|
||||
b_strides,
|
||||
):
|
||||
"""
|
||||
NUMBA tensor matrix multiply function.
|
||||
|
||||
Should work for any tensor shapes that broadcast as long as ::
|
||||
|
||||
assert a_shape[-1] == b_shape[-2]
|
||||
|
||||
Optimizations:
|
||||
|
||||
* Outer loop in parallel
|
||||
* No index buffers or function calls
|
||||
* Inner loop should have no global writes, 1 multiply.
|
||||
|
||||
|
||||
Args:
|
||||
out (array): storage for `out` tensor
|
||||
out_shape (array): shape for `out` tensor
|
||||
out_strides (array): strides for `out` tensor
|
||||
a_storage (array): storage for `a` tensor
|
||||
a_shape (array): shape for `a` tensor
|
||||
a_strides (array): strides for `a` tensor
|
||||
b_storage (array): storage for `b` tensor
|
||||
b_shape (array): shape for `b` tensor
|
||||
b_strides (array): strides for `b` tensor
|
||||
|
||||
Returns:
|
||||
None : Fills in `out`
|
||||
"""
|
||||
a_batch_stride = a_strides[0] if a_shape[0] > 1 else 0
|
||||
b_batch_stride = b_strides[0] if b_shape[0] > 1 else 0
|
||||
|
||||
assert a_shape[-1] == b_shape[-2]
|
||||
|
||||
for i in prange(len(out)):
|
||||
out_0 = i // (out_shape[-1] * out_shape[-2])
|
||||
out_1 = (i % (out_shape[-1] * out_shape[-2])) // out_shape[-1]
|
||||
out_2 = i % out_shape[-1]
|
||||
|
||||
out_i = (
|
||||
out_0 * out_strides[0] +
|
||||
out_1 * out_strides[1] +
|
||||
out_2 * out_strides[2]
|
||||
)
|
||||
|
||||
a_start = out_0 * a_batch_stride + out_1 * a_strides[1]
|
||||
        b_start = out_0 * b_batch_stride + out_2 * b_strides[2]
|
||||
|
||||
temp = 0
|
||||
for position in range(a_shape[-1]):
|
||||
temp += (
|
||||
a_storage[a_start + position * a_strides[2]] *
|
||||
b_storage[b_start + position * b_strides[1]]
|
||||
)
|
||||
out[out_i] = temp
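
# Worked example of the flat-index decomposition above (illustrative): for
# out_shape = (2, 3, 4) and i = 17,
#   out_0 = 17 // (4 * 3)  = 1   (batch)
#   out_1 = (17 % 12) // 4 = 1   (row)
#   out_2 = 17 % 4         = 1   (column)
# so flat position 17 corresponds to out[1, 1, 1].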
|
||||
|
||||
|
||||
|
||||
def matrix_multiply(a, b):
|
||||
"""
|
||||
Batched tensor matrix multiply ::
|
||||
|
||||
for n:
|
||||
for i:
|
||||
for j:
|
||||
for k:
|
||||
out[n, i, j] += a[n, i, k] * b[n, k, j]
|
||||
|
||||
Where n indicates an optional broadcasted batched dimension.
|
||||
|
||||
Should work for tensor shapes of 3 dims ::
|
||||
|
||||
assert a.shape[-1] == b.shape[-2]
|
||||
|
||||
Args:
|
||||
a (:class:`Tensor`): tensor data a
|
||||
b (:class:`Tensor`): tensor data b
|
||||
|
||||
Returns:
|
||||
:class:`Tensor` : new tensor data
|
||||
"""
|
||||
|
||||
# Make these always be a 3 dimensional multiply
|
||||
both_2d = 0
|
||||
if len(a.shape) == 2:
|
||||
a = a.contiguous().view(1, a.shape[0], a.shape[1])
|
||||
both_2d += 1
|
||||
if len(b.shape) == 2:
|
||||
b = b.contiguous().view(1, b.shape[0], b.shape[1])
|
||||
both_2d += 1
|
||||
both_2d = both_2d == 2
|
||||
|
||||
ls = list(shape_broadcast(a.shape[:-2], b.shape[:-2]))
|
||||
ls.append(a.shape[-2])
|
||||
ls.append(b.shape[-1])
|
||||
assert a.shape[-1] == b.shape[-2]
|
||||
out = a.zeros(tuple(ls))
|
||||
|
||||
tensor_matrix_multiply(*out.tuple(), *a.tuple(), *b.tuple())
|
||||
|
||||
# Undo 3d if we added it.
|
||||
if both_2d:
|
||||
out = out.view(out.shape[1], out.shape[2])
|
||||
return out
|
||||
|
||||
|
||||
class FastOps:
|
||||
map = map
|
||||
zip = zip
|
||||
reduce = reduce
|
||||
matrix_multiply = matrix_multiply
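
# Illustrative sketch (assumes the tensor-backend plumbing from the rest of
# minitorch; the helper name may differ in this repo):
#
#   FastBackend = minitorch.make_tensor_backend(FastOps)
#   x = minitorch.rand((8, 4), backend=FastBackend)
#   y = minitorch.rand((4, 5), backend=FastBackend)
#   z = x @ y    # dispatches to tensor_matrix_multiply above, shape (8, 5)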
|
||||
@@ -0,0 +1,162 @@
|
||||
class Module:
|
||||
"""
|
||||
Modules form a tree that store parameters and other
|
||||
submodules. They make up the basis of neural network stacks.
|
||||
|
||||
Attributes:
|
||||
_modules (dict of name x :class:`Module`): Storage of the child modules
|
||||
_parameters (dict of name x :class:`Parameter`): Storage of the module's parameters
|
||||
training (bool): Whether the module is in training mode or evaluation mode
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
self._modules = {}
|
||||
self._parameters = {}
|
||||
self.training = True
|
||||
|
||||
def modules(self):
|
||||
"Return the direct child modules of this module."
|
||||
return self.__dict__["_modules"].values()
|
||||
|
||||
def train(self):
|
||||
"Set the mode of this module and all descendent modules to `train`."
|
||||
|
||||
def _train(module):
|
||||
module.training = True
|
||||
for m in module.modules():
|
||||
_train(m)
|
||||
|
||||
_train(self)
|
||||
|
||||
def eval(self):
|
||||
"Set the mode of this module and all descendent modules to `eval`."
|
||||
|
||||
def _eval(module):
|
||||
module.training = False
|
||||
for m in module.modules():
|
||||
_eval(m)
|
||||
|
||||
_eval(self)
|
||||
|
||||
def named_parameters(self):
|
||||
"""
|
||||
Collect all the parameters of this module and its descendents.
|
||||
|
||||
|
||||
Returns:
|
||||
            list of pairs: Contains the name and :class:`Parameter` of each descendent parameter.
|
||||
"""
|
||||
|
||||
def _named_parameters(module, prefix=""):
|
||||
for name, param in module._parameters.items():
|
||||
yield prefix + name, param
|
||||
for name, module in module._modules.items():
|
||||
yield from _named_parameters(module, prefix + name + ".")
|
||||
|
||||
return list(_named_parameters(self))
|
||||
|
||||
def parameters(self):
|
||||
"Enumerate over all the parameters of this module and its descendents."
|
||||
|
||||
# def _parameters(module):
|
||||
# for param in module._parameters.values():
|
||||
# yield param
|
||||
# for module in module._modules.values():
|
||||
# yield from _parameters(module)
|
||||
|
||||
return [param for _, param in self.named_parameters()]
|
||||
|
||||
def add_parameter(self, k, v):
|
||||
"""
|
||||
Manually add a parameter. Useful helper for scalar parameters.
|
||||
|
||||
Args:
|
||||
k (str): Local name of the parameter.
|
||||
v (value): Value for the parameter.
|
||||
|
||||
Returns:
|
||||
Parameter: Newly created parameter.
|
||||
"""
|
||||
val = Parameter(v, k)
|
||||
self.__dict__["_parameters"][k] = val
|
||||
return val
|
||||
|
||||
def __setattr__(self, key, val):
|
||||
if isinstance(val, Parameter):
|
||||
self.__dict__["_parameters"][key] = val
|
||||
elif isinstance(val, Module):
|
||||
self.__dict__["_modules"][key] = val
|
||||
else:
|
||||
super().__setattr__(key, val)
|
||||
|
||||
def __getattr__(self, key):
|
||||
if key in self.__dict__["_parameters"]:
|
||||
return self.__dict__["_parameters"][key]
|
||||
|
||||
if key in self.__dict__["_modules"]:
|
||||
return self.__dict__["_modules"][key]
|
||||
|
||||
def __call__(self, *args, **kwargs):
|
||||
return self.forward(*args, **kwargs)
|
||||
|
||||
def forward(self):
|
||||
assert False, "Not Implemented"
|
||||
|
||||
def __repr__(self):
|
||||
def _addindent(s_, numSpaces):
|
||||
s = s_.split("\n")
|
||||
if len(s) == 1:
|
||||
return s_
|
||||
first = s.pop(0)
|
||||
s = [(numSpaces * " ") + line for line in s]
|
||||
s = "\n".join(s)
|
||||
s = first + "\n" + s
|
||||
return s
|
||||
|
||||
child_lines = []
|
||||
|
||||
for key, module in self._modules.items():
|
||||
mod_str = repr(module)
|
||||
mod_str = _addindent(mod_str, 2)
|
||||
child_lines.append("(" + key + "): " + mod_str)
|
||||
lines = child_lines
|
||||
|
||||
main_str = self.__class__.__name__ + "("
|
||||
if lines:
|
||||
# simple one-liner info, which most builtin Modules will use
|
||||
main_str += "\n " + "\n ".join(lines) + "\n"
|
||||
|
||||
main_str += ")"
|
||||
return main_str
|
||||
|
||||
|
||||
class Parameter:
|
||||
"""
|
||||
A Parameter is a special container stored in a :class:`Module`.
|
||||
|
||||
It is designed to hold a :class:`Variable`, but we allow it to hold
|
||||
any value for testing.
|
||||
"""
|
||||
|
||||
def __init__(self, x=None, name=None):
|
||||
self.value = x
|
||||
self.name = name
|
||||
if hasattr(x, "requires_grad_"):
|
||||
self.value.requires_grad_(True)
|
||||
if self.name:
|
||||
self.value.name = self.name
|
||||
|
||||
def update(self, x):
|
||||
"Update the parameter value."
|
||||
self.value = x
|
||||
if hasattr(x, "requires_grad_"):
|
||||
self.value.requires_grad_(True)
|
||||
if self.name:
|
||||
self.value.name = self.name
|
||||
|
||||
def __repr__(self):
|
||||
return repr(self.value)
|
||||
|
||||
def __str__(self):
|
||||
return str(self.value)
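
# Illustrative usage sketch (Scalar is assumed from minitorch.scalar, and
# OtherModule is a hypothetical submodule):
#
#   class MyModule(Module):
#       def __init__(self):
#           super().__init__()
#           self.weight = Parameter(Scalar(1.0))
#           self.child = OtherModule()
#
#   m = MyModule()
#   m.named_parameters()    # [("weight", ...), ("child.<param name>", ...)]
#   m.eval()                # sets training = False on m and all descendents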
|
||||
@@ -0,0 +1,60 @@
|
||||
# from .tensor import rand
|
||||
# from .functions import matmul, conv2d
|
||||
# from .module import Module, Parameter
|
||||
|
||||
|
||||
# class tLinear(Module):
|
||||
# def __init__(self, in_size, out_size):
|
||||
# super().__init__()
|
||||
# self.weights = Parameter(rand((in_size, out_size)))
|
||||
# self.bias = Parameter(rand((out_size,)))
|
||||
# self.out_size = out_size
|
||||
|
||||
# def forward(self, x):
|
||||
# batch, in_size = x.shape
|
||||
# return (
|
||||
# self.weights.value.view(1, in_size, self.out_size)
|
||||
# * x.view(batch, in_size, 1)
|
||||
# ).sum(1).view(batch, self.out_size) + self.bias.value.view(1, self.out_size)
|
||||
|
||||
|
||||
# class tLinear2(Module):
|
||||
# def __init__(self, in_size, out_size):
|
||||
# super().__init__()
|
||||
# self.weights = Parameter(rand((in_size, out_size)))
|
||||
# self.bias = Parameter(rand((out_size,)))
|
||||
# self.out_size = out_size
|
||||
|
||||
# def forward(self, x):
|
||||
# batch, in_size = x.shape
|
||||
# return matmul(x, self.weights.value) + self.bias.value.view(1, self.out_size)
|
||||
|
||||
|
||||
# class Dropout(Module):
|
||||
# def __init__(self, rate):
|
||||
# super().__init__()
|
||||
# self.rate = rate
|
||||
|
||||
# def forward(self, x):
|
||||
# return (rand(x.shape) / 2 + 0.5 < self.rate) * x
|
||||
|
||||
|
||||
# class Conv2d(Module):
|
||||
# def __init__(self, in_features, out_features, size):
|
||||
# super().__init__()
|
||||
# size1 = [size[0], size[1], in_features, out_features]
|
||||
# size2 = [size[0], size[1], out_features]
|
||||
# self.weights = Parameter(rand(size1))
|
||||
# self.bias = Parameter(rand(size2))
|
||||
|
||||
# def forward(self, x):
|
||||
# return conv2d(x, self.weights.value, self.bias.value)
|
||||
|
||||
|
||||
# # class MaxPool2d(Module):
|
||||
# # def __init__(self, in_features, out_features, size):
|
||||
# # super().__init__()
|
||||
|
||||
|
||||
# # def forward(self, x):
|
||||
# # return conv2d(x, self.weights.value, self.bias.value)
|
||||
@@ -0,0 +1,175 @@
|
||||
from .fast_ops import FastOps
|
||||
from .tensor_functions import rand, Function
|
||||
from . import operators
|
||||
|
||||
|
||||
def tile(input, kernel):
|
||||
"""
|
||||
Reshape an image tensor for 2D pooling
|
||||
|
||||
Args:
|
||||
input (:class:`Tensor`): batch x channel x height x width
|
||||
kernel ( pair of ints ): height x width of pooling
|
||||
|
||||
Returns:
|
||||
(:class:`Tensor`, int, int) : Tensor of size batch x channel x new_height x new_width x (kernel_height * kernel_width) as well as the new_height and new_width value.
|
||||
"""
|
||||
|
||||
batch, channel, height, width = input.shape
|
||||
kh, kw = kernel
|
||||
assert height % kh == 0
|
||||
assert width % kw == 0
|
||||
|
||||
new_height = int(height / kh)
|
||||
new_width = int(width / kw)
|
||||
|
||||
input = (
    input.contiguous()
    .view(batch, channel, new_height, kh, new_width, kw)
    .permute(0, 1, 2, 4, 3, 5)
    .contiguous()
    .view(batch, channel, new_height, new_width, kh * kw)
)
|
||||
|
||||
return (input, new_height, new_width)
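# Shape sketch (not part of the original file), assuming the `rand`
# constructor from tensor_functions.py: a 1 x 1 x 4 x 6 image tiled with a
# (2, 2) kernel becomes a 1 x 1 x 2 x 3 x 4 tensor.
#
#   t = rand((1, 1, 4, 6))
#   tiled, new_height, new_width = tile(t, (2, 2))
#   assert tiled.shape == (1, 1, 2, 3, 4)
#   assert (new_height, new_width) == (2, 3)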
|
||||
|
||||
|
||||
def avgpool2d(input, kernel):
|
||||
"""
|
||||
Tiled average pooling 2D
|
||||
|
||||
Args:
|
||||
input (:class:`Tensor`): batch x channel x height x width
|
||||
kernel ( pair of ints ): height x width of pooling
|
||||
|
||||
Returns:
|
||||
:class:`Tensor` : pooled tensor
|
||||
"""
|
||||
batch, channel, height, width = input.shape
input, new_height, new_width = tile(input, kernel)
return input.mean(4).view(batch, channel, new_height, new_width)
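# Usage sketch (not part of the original file): pooling a 1 x 1 x 4 x 4 tensor
# with a (2, 2) kernel halves each spatial dimension.
#
#   t = rand((1, 1, 4, 4))
#   assert avgpool2d(t, (2, 2)).shape == (1, 1, 2, 2)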
|
||||
|
||||
|
||||
max_reduce = FastOps.reduce(operators.max, -1e9)
|
||||
|
||||
|
||||
def argmax(input, dim):
|
||||
"""
|
||||
Compute the argmax as a 1-hot tensor.
|
||||
|
||||
Args:
|
||||
input (:class:`Tensor`): input tensor
|
||||
dim (int): dimension to apply argmax
|
||||
|
||||
|
||||
Returns:
|
||||
:class:`Tensor` : tensor with 1 on highest cell in dim, 0 otherwise
|
||||
|
||||
"""
|
||||
out = max_reduce(input, [dim])
|
||||
return out == input
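# Illustrative sketch (not part of the original file), assuming the `tensor`
# constructor from tensor_functions.py:
#
#   t = tensor([[1.0, 5.0, 2.0]])
#   hot = argmax(t, 1)
#   # hot is [[0.0, 1.0, 0.0]]; ties would mark every maximal cell with 1.0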
|
||||
|
||||
|
||||
class Max(Function):
|
||||
@staticmethod
|
||||
def forward(ctx, input, dim):
|
||||
"Forward of max should be max reduction"
|
||||
res = max_reduce(input, [dim])
|
||||
ctx.save_for_backward(input, dim)
|
||||
return res
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, grad_output):
|
||||
"Backward of max should be argmax (see above)"
|
||||
input, dim = ctx.saved_values
|
||||
res = argmax(input, dim)
|
||||
return grad_output * res
|
||||
|
||||
|
||||
max = Max.apply
|
||||
|
||||
|
||||
def softmax(input, dim):
|
||||
r"""
|
||||
Compute the softmax as a tensor.
|
||||
|
||||
.. math::
|
||||
|
||||
z_i = \frac{e^{x_i}}{\sum_i e^{x_i}}
|
||||
|
||||
Args:
|
||||
input (:class:`Tensor`): input tensor
|
||||
dim (int): dimension to apply softmax
|
||||
|
||||
Returns:
|
||||
:class:`Tensor` : softmax tensor
|
||||
"""
|
||||
|
||||
input = input.exp()
|
||||
t = input.sum(dim)
|
||||
input = input / t
|
||||
return input
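# Quick check (not part of the original file): softmax values along `dim` are
# positive and sum to 1.
#
#   t = tensor([[1.0, 2.0, 3.0]])
#   s = softmax(t, 1)
#   # s.sum(1) is approximately [[1.0]]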
|
||||
|
||||
|
||||
def logsoftmax(input, dim):
|
||||
r"""
|
||||
Compute the log of the softmax as a tensor.
|
||||
|
||||
.. math::
|
||||
|
||||
z_i = x_i - \log \sum_i e^{x_i}
|
||||
|
||||
See https://en.wikipedia.org/wiki/LogSumExp#log-sum-exp_trick_for_log-domain_calculations
|
||||
|
||||
Args:
|
||||
input (:class:`Tensor`): input tensor
|
||||
dim (int): dimension to apply log-softmax
|
||||
|
||||
Returns:
|
||||
:class:`Tensor` : log of softmax tensor
|
||||
"""
|
||||
|
||||
t = input.exp()
|
||||
t = t.sum(dim)
|
||||
t = t.log()
|
||||
return input - t
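# Relation sketch (not part of the original file): logsoftmax(t, dim) matches
# softmax(t, dim).log() up to floating point error, while staying in log space.
#
#   t = tensor([[1.0, 2.0, 3.0]])
#   # logsoftmax(t, 1) is close to softmax(t, 1).log()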
|
||||
|
||||
|
||||
def maxpool2d(input, kernel):
|
||||
"""
|
||||
Tiled max pooling 2D
|
||||
|
||||
Args:
|
||||
input (:class:`Tensor`): batch x channel x height x width
|
||||
kernel ( pair of ints ): height x width of pooling
|
||||
|
||||
Returns:
|
||||
:class:`Tensor` : pooled tensor
|
||||
"""
|
||||
batch, channel, height, width = input.shape
input, new_height, new_width = tile(input, kernel)
input = max(input, 4)
input = input.view(batch, channel, new_height, new_width)
return input
|
||||
|
||||
|
||||
def dropout(input, rate, ignore=False):
|
||||
"""
|
||||
Dropout positions based on random noise.
|
||||
|
||||
Args:
|
||||
input (:class:`Tensor`): input tensor
|
||||
rate (float): probability [0, 1) of dropping out each position
|
||||
ignore (bool): skip dropout, i.e. do nothing at all
|
||||
|
||||
Returns:
|
||||
:class:`Tensor` : tensor with random positions dropped out
|
||||
"""
|
||||
if not ignore:
|
||||
rand_tensor = rand(input.shape)
|
||||
rand_drop = rand_tensor > rate
|
||||
return input * rand_drop
|
||||
else:
|
||||
return input
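# Usage sketch (not part of the original file):
#
#   t = rand((2, 3))
#   kept = dropout(t, 0.0)               # rate 0.0 keeps every position
#   same = dropout(t, 0.9, ignore=True)  # ignore=True returns `t` untouched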
|
||||
@@ -0,0 +1,212 @@
|
||||
"""
|
||||
Collection of the core mathematical operators used throughout the code base.
|
||||
"""
|
||||
|
||||
|
||||
import math
|
||||
|
||||
# ## Task 0.1
|
||||
|
||||
# Implementation of a prelude of elementary functions.
|
||||
|
||||
|
||||
def mul(x, y):
|
||||
":math:`f(x, y) = x * y`"
|
||||
return x * y
|
||||
|
||||
|
||||
def id(x):
|
||||
":math:`f(x) = x`"
|
||||
return x
|
||||
|
||||
|
||||
def add(x, y):
|
||||
":math:`f(x, y) = x + y`"
|
||||
return x + y
|
||||
|
||||
|
||||
def neg(x):
|
||||
":math:`f(x) = -x`"
|
||||
return float(-x)
|
||||
|
||||
|
||||
def lt(x, y):
|
||||
":math:`f(x) =` 1.0 if x is less than y else 0.0"
|
||||
return 1.0 if x < y else 0.0
|
||||
|
||||
|
||||
def eq(x, y):
|
||||
":math:`f(x) =` 1.0 if x is equal to y else 0.0"
|
||||
return 1.0 if x == y else 0.0
|
||||
|
||||
|
||||
def max(x, y):
|
||||
":math:`f(x) =` x if x is greater than y else y"
|
||||
return x if x > y else y
|
||||
|
||||
|
||||
def is_close(x, y):
|
||||
":math:`f(x) = |x - y| < 1e-2`"
|
||||
return abs(x - y) < 1e-2
|
||||
|
||||
|
||||
def sigmoid(x):
|
||||
r"""
|
||||
:math:`f(x) = \frac{1.0}{(1.0 + e^{-x})}`
|
||||
|
||||
(See `<https://en.wikipedia.org/wiki/Sigmoid_function>`_ .)
|
||||
|
||||
Calculate as
|
||||
|
||||
:math:`f(x) = \frac{1.0}{(1.0 + e^{-x})}` if x >=0 else :math:`\frac{e^x}{(1.0 + e^{x})}`
|
||||
|
||||
for stability.
|
||||
|
||||
Args:
|
||||
x (float): input
|
||||
|
||||
Returns:
|
||||
float : sigmoid value
|
||||
"""
|
||||
if x >= 0:
    return 1.0 / (1.0 + math.exp(-x))
return math.exp(x) / (1.0 + math.exp(x))
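# Sanity sketch (not part of the original file): the two branches keep
# math.exp from overflowing for large magnitude inputs.
#
#   assert sigmoid(0.0) == 0.5
#   assert abs(sigmoid(100.0) - 1.0) < 1e-6
#   assert sigmoid(-100.0) < 1e-6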
|
||||
|
||||
|
||||
def relu(x):
|
||||
"""
|
||||
:math:`f(x) =` x if x is greater than 0, else 0
|
||||
|
||||
(See `<https://en.wikipedia.org/wiki/Rectifier_(neural_networks)>`_ .)
|
||||
|
||||
Args:
|
||||
x (float): input
|
||||
|
||||
Returns:
|
||||
float : relu value
|
||||
"""
|
||||
return (x > 0) * x
|
||||
|
||||
|
||||
EPS = 1e-6
|
||||
|
||||
|
||||
def log(x):
|
||||
":math:`f(x) = log(x)`"
|
||||
return math.log(x + EPS)
|
||||
|
||||
|
||||
def exp(x):
|
||||
":math:`f(x) = e^{x}`"
|
||||
return math.exp(x)
|
||||
|
||||
|
||||
def log_back(x, d):
|
||||
r"If :math:`f = log` as above, compute d :math:`d \times f'(x)`"
|
||||
return d / x
|
||||
|
||||
|
||||
def inv(x):
|
||||
":math:`f(x) = 1/x`"
|
||||
return 1 / x
|
||||
|
||||
|
||||
def inv_back(x, d):
|
||||
r"If :math:`f(x) = 1/x` compute d :math:`d \times f'(x)`"
|
||||
return -d / x ** 2
|
||||
|
||||
|
||||
def relu_back(x, d):
|
||||
r"If :math:`f = relu` compute d :math:`d \times f'(x)`"
|
||||
return d if x > 0 else 0.0
|
||||
|
||||
|
||||
def sigmoid_back(x, d):
    r"If :math:`f = sigmoid` as above, compute :math:`d \times f'(x)`"
    return d * sigmoid(x) * (1.0 - sigmoid(x))
|
||||
|
||||
|
||||
# ## Task 0.3
|
||||
|
||||
# Small library of elementary higher-order functions for practice.
|
||||
|
||||
|
||||
def map(fn):
|
||||
"""
|
||||
Higher-order map.
|
||||
|
||||
.. image:: figs/Ops/maplist.png
|
||||
|
||||
|
||||
See `<https://en.wikipedia.org/wiki/Map_(higher-order_function)>`_
|
||||
|
||||
Args:
|
||||
fn (one-arg function): Function from one value to one value.
|
||||
|
||||
Returns:
|
||||
function : A function that takes a list, applies `fn` to each element, and returns a
|
||||
new list
|
||||
"""
|
||||
return lambda list: [fn(x) for x in list]
|
||||
|
||||
|
||||
def negList(ls):
|
||||
"Use :func:`map` and :func:`neg` to negate each element in `ls`"
|
||||
return map(neg)(ls)
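# Example sketch (not part of the original file): `map` builds a list
# transformer out of a one-argument function.
#
#   assert map(neg)([1.0, -2.0]) == [-1.0, 2.0]
#   assert negList([1.0, -2.0]) == [-1.0, 2.0]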
|
||||
|
||||
|
||||
def zipWith(fn):
|
||||
"""
|
||||
Higher-order zipwith (or map2).
|
||||
|
||||
.. image:: figs/Ops/ziplist.png
|
||||
|
||||
See `<https://en.wikipedia.org/wiki/Map_(higher-order_function)>`_
|
||||
|
||||
Args:
|
||||
fn (two-arg function): combine two values
|
||||
|
||||
Returns:
|
||||
function : takes two equally sized lists `ls1` and `ls2` and produces a new list by
applying fn(x, y) to each pair of elements.
|
||||
|
||||
"""
|
||||
return lambda ls1, ls2: [fn(x, y) for x, y in zip(ls1, ls2)]
|
||||
|
||||
|
||||
def addLists(ls1, ls2):
|
||||
"Add the elements of `ls1` and `ls2` using :func:`zipWith` and :func:`add`"
|
||||
return zipWith(add)(ls1, ls2)
|
||||
|
||||
|
||||
def reduce(fn, start):
|
||||
r"""
|
||||
Higher-order reduce.
|
||||
|
||||
.. image:: figs/Ops/reducelist.png
|
||||
|
||||
|
||||
Args:
|
||||
fn (two-arg function): combine two values
|
||||
start (float): start value :math:`x_0`
|
||||
|
||||
Returns:
|
||||
function : function that takes a list `ls` of elements
|
||||
:math:`x_1 \ldots x_n` and computes the reduction :math:`fn(x_3, fn(x_2,
|
||||
fn(x_1, x_0)))`
|
||||
"""
|
||||
|
||||
def _reduce(ls, fn, start):
|
||||
iterator = iter(ls)
|
||||
for i in iterator:
|
||||
start = fn(start, i)
|
||||
return start
|
||||
|
||||
return lambda ls: _reduce(ls, fn, start)
|
||||
|
||||
|
||||
def sum(ls):
|
||||
"Sum up a list using :func:`reduce` and :func:`add`."
|
||||
return reduce(add, 0)(ls)
|
||||
|
||||
|
||||
def prod(ls):
|
||||
"Product of a list using :func:`reduce` and :func:`mul`."
|
||||
return reduce(mul, 1)(ls)
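# Example sketch (not part of the original file): `reduce` folds a list with a
# two-argument function and a start value.
#
#   assert reduce(add, 0)([1.0, 2.0, 3.0]) == 6.0
#   assert sum([1.0, 2.0, 3.0]) == 6.0
#   assert prod([1.0, 2.0, 3.0]) == 6.0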
|
||||
@@ -0,0 +1,19 @@
|
||||
class Optimizer:
|
||||
def __init__(self, parameters):
|
||||
self.parameters = parameters
|
||||
|
||||
|
||||
class SGD(Optimizer):
|
||||
def __init__(self, parameters, lr=1.0):
|
||||
super().__init__(parameters)
|
||||
self.lr = lr
|
||||
|
||||
def zero_grad(self):
|
||||
for p in self.parameters:
|
||||
if p.value.derivative is not None:
|
||||
p.value._derivative = None
|
||||
|
||||
def step(self):
|
||||
for p in self.parameters:
|
||||
if p.value.derivative is not None:
|
||||
p.update(p.value - self.lr * p.value.derivative)
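# Hedged usage sketch (not part of the original file); assumes a Module
# subclass instance `net` and a hypothetical loss function:
#
#   optim = SGD(net.parameters(), lr=0.1)
#   optim.zero_grad()
#   loss = compute_loss(net)   # hypothetical
#   loss.backward()
#   optim.step()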
|
||||
@@ -0,0 +1,314 @@
|
||||
from .autodiff import FunctionBase, Variable, History
|
||||
from . import operators
|
||||
import numpy as np
|
||||
|
||||
|
||||
# ## Task 1.1
|
||||
# Central Difference calculation
|
||||
|
||||
|
||||
def central_difference(f, *vals, arg=0, epsilon=1e-6):
|
||||
r"""
|
||||
Computes an approximation to the derivative of `f` with respect to one arg.
|
||||
|
||||
See :doc:`derivative` or https://en.wikipedia.org/wiki/Finite_difference for more details.
|
||||
|
||||
Args:
|
||||
f : arbitrary function from n-scalar args to one value
|
||||
*vals (list of floats): n-float values :math:`x_0 \ldots x_{n-1}`
|
||||
arg (int): the number :math:`i` of the arg to compute the derivative
|
||||
epsilon (float): a small constant
|
||||
|
||||
Returns:
|
||||
float : An approximation of :math:`f'_i(x_0, \ldots, x_{n-1})`
|
||||
"""
|
||||
|
||||
def _modify_arg(e):
|
||||
return [x + e if i == arg else x for i, x in enumerate(vals)]
|
||||
|
||||
return (f(*_modify_arg(epsilon)) - f(*_modify_arg(-epsilon))) / (2 * epsilon)
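# Numerical sketch (not part of the original file): for f(x) = x * x the
# central difference at x = 2 is ((2 + eps)^2 - (2 - eps)^2) / (2 * eps) = 4.
#
#   d = central_difference(lambda x: x * x, 2.0)
#   assert abs(d - 4.0) < 1e-4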
|
||||
|
||||
|
||||
# ## Task 1.2 and 1.4
|
||||
# Scalar Forward and Backward
|
||||
|
||||
|
||||
class Scalar(Variable):
|
||||
"""
|
||||
A reimplementation of scalar values for autodifferentiation
|
||||
tracking. Scalar Variables behave as close as possible to standard
|
||||
Python numbers while also tracking the operations that led to the
|
||||
number's creation. They can only be manipulated by
|
||||
:class:`ScalarFunction`.
|
||||
|
||||
Attributes:
|
||||
data (float): The wrapped scalar value.
|
||||
"""
|
||||
|
||||
def __init__(self, v, back=History(), name=None):
|
||||
super().__init__(back, name=name)
|
||||
self.data = float(v)
|
||||
|
||||
def __repr__(self):
|
||||
return "Scalar(%f)" % self.data
|
||||
|
||||
def __mul__(self, b):
|
||||
return Mul.apply(self, b)
|
||||
|
||||
def __truediv__(self, b):
|
||||
return Mul.apply(self, Inv.apply(b))
|
||||
|
||||
def __rtruediv__(self, b):
|
||||
return Mul.apply(b, Inv.apply(self))
|
||||
|
||||
def __add__(self, b):
|
||||
return Add.apply(self, b)
|
||||
|
||||
def __bool__(self):
|
||||
return bool(self.data)
|
||||
|
||||
def __lt__(self, b):
|
||||
return LT.apply(self, b)
|
||||
|
||||
def __gt__(self, b):
|
||||
return LT.apply(b, self)
|
||||
|
||||
def __eq__(self, b):
|
||||
return EQ.apply(self, b)
|
||||
|
||||
def __sub__(self, b):
|
||||
return Add.apply(self, Neg.apply(b))
|
||||
|
||||
def __neg__(self):
|
||||
return Neg.apply(self)
|
||||
|
||||
def log(self):
|
||||
return Log.apply(self)
|
||||
|
||||
def exp(self):
|
||||
return Exp.apply(self)
|
||||
|
||||
def sigmoid(self):
|
||||
return Sigmoid.apply(self)
|
||||
|
||||
def relu(self):
|
||||
return ReLU.apply(self)
|
||||
|
||||
def get_data(self):
|
||||
"Returns the raw float value"
|
||||
return self.data
|
||||
|
||||
|
||||
class ScalarFunction(FunctionBase):
|
||||
"""
|
||||
A wrapper for a mathematical function that processes and produces
|
||||
Scalar variables.
|
||||
|
||||
This is a static class and is never instantiated. We use `class`
|
||||
here to group together the `forward` and `backward` code.
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
def forward(ctx, *inputs):
|
||||
r"""
|
||||
Forward call, compute :math:`f(x_0 \ldots x_{n-1})`.
|
||||
|
||||
Args:
|
||||
ctx (:class:`Context`): A container object to save
|
||||
any information that may be needed
|
||||
for the call to backward.
|
||||
*inputs (list of floats): n-float values :math:`x_0 \ldots x_{n-1}`.
|
||||
|
||||
Should return a float, the computation of the function :math:`f`.
|
||||
"""
|
||||
pass # pragma: no cover
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, d_out):
|
||||
r"""
|
||||
Backward call, computes :math:`f'_{x_i}(x_0 \ldots x_{n-1}) \times d_{out}`.
|
||||
|
||||
Args:
|
||||
ctx (Context): A container object holding any information saved during the corresponding `forward` call.
|
||||
d_out (float): :math:`d_{out}` term in the chain rule.
|
||||
|
||||
Should return the computation of the derivative function
|
||||
:math:`f'_{x_i}` for each input :math:`x_i` times `d_out`.
|
||||
|
||||
"""
|
||||
pass # pragma: no cover
|
||||
|
||||
# Checks.
|
||||
variable = Scalar
|
||||
data_type = float
|
||||
|
||||
@staticmethod
|
||||
def data(a):
|
||||
return a
|
||||
|
||||
|
||||
# Examples
|
||||
class Add(ScalarFunction):
|
||||
"Addition function :math:`f(x, y) = x + y`"
|
||||
|
||||
@staticmethod
|
||||
def forward(ctx, a, b):
|
||||
return a + b
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, d_output):
|
||||
return d_output, d_output
|
||||
|
||||
|
||||
class Log(ScalarFunction):
|
||||
"Log function :math:`f(x) = log(x)`"
|
||||
|
||||
@staticmethod
|
||||
def forward(ctx, a):
|
||||
ctx.save_for_backward(a)
|
||||
return operators.log(a)
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, d_output):
|
||||
a = ctx.saved_values
|
||||
return operators.log_back(a, d_output)
|
||||
|
||||
|
||||
# To implement.
|
||||
|
||||
|
||||
class Mul(ScalarFunction):
|
||||
"Multiplication function"
|
||||
|
||||
@staticmethod
|
||||
def forward(ctx, a, b):
|
||||
ctx.save_for_backward(a, b)
|
||||
return a * b
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, d_output):
|
||||
a, b = ctx.saved_values
|
||||
return b * d_output, a * d_output
|
||||
|
||||
|
||||
class Inv(ScalarFunction):
|
||||
"Inverse function"
|
||||
|
||||
@staticmethod
|
||||
def forward(ctx, a):
|
||||
ctx.save_for_backward(a)
|
||||
return operators.inv(a)
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, d_output):
|
||||
a = ctx.saved_values
|
||||
return operators.inv_back(a, d_output)
|
||||
|
||||
|
||||
class Neg(ScalarFunction):
|
||||
"Negation function"
|
||||
|
||||
@staticmethod
|
||||
def forward(ctx, a):
|
||||
return operators.neg(a)
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, d_output):
|
||||
return operators.neg(d_output)
|
||||
|
||||
|
||||
class Sigmoid(ScalarFunction):
|
||||
"Sigmoid function"
|
||||
|
||||
@staticmethod
|
||||
def forward(ctx, a):
|
||||
ctx.save_for_backward(a)
|
||||
return operators.sigmoid(a)
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, d_output):
|
||||
a = ctx.saved_values
|
||||
return operators.sigmoid_back(a, d_output)
|
||||
|
||||
|
||||
class ReLU(ScalarFunction):
|
||||
"ReLU function"
|
||||
|
||||
@staticmethod
|
||||
def forward(ctx, a):
|
||||
ctx.save_for_backward(a)
|
||||
return operators.relu(a)
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, d_output):
|
||||
a = ctx.saved_values
|
||||
return operators.relu_back(a, d_output)
|
||||
|
||||
|
||||
class Exp(ScalarFunction):
|
||||
"Exp function"
|
||||
|
||||
@staticmethod
|
||||
def forward(ctx, a):
|
||||
ctx.save_for_backward(a)
|
||||
return operators.exp(a)
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, d_output):
|
||||
a = ctx.saved_values
|
||||
return operators.exp(a) * d_output
|
||||
|
||||
|
||||
class LT(ScalarFunction):
|
||||
"Less-than function :math:`f(x) =` 1.0 if x is less than y else 0.0"
|
||||
|
||||
@staticmethod
|
||||
def forward(ctx, a, b):
|
||||
return operators.lt(a, b)
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, d_output):
|
||||
return 0.0, 0.0
|
||||
|
||||
|
||||
class EQ(ScalarFunction):
|
||||
"Equal function :math:`f(x) =` 1.0 if x is equal to y else 0.0"
|
||||
|
||||
@staticmethod
|
||||
def forward(ctx, a, b):
|
||||
return operators.eq(a, b)
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, d_output):
|
||||
return 0.0, 0.0
|
||||
|
||||
|
||||
def derivative_check(f, *scalars):
|
||||
"""
|
||||
Checks that autodiff works on a python function.
|
||||
Asserts False if derivative is incorrect.
|
||||
|
||||
Parameters:
|
||||
f (function) : function from n-scalars to 1-scalar.
|
||||
*scalars (list of :class:`Scalar`) : n input scalar values.
|
||||
"""
|
||||
for x in scalars:
|
||||
x.requires_grad_(True)
|
||||
out = f(*scalars)
|
||||
out.backward()
|
||||
|
||||
vals = [v for v in scalars]
|
||||
err_msg = """
|
||||
Derivative check at arguments f(%s) and received derivative f'=%f for argument %d,
|
||||
but was expecting derivative f'=%f from central difference."""
|
||||
for i, x in enumerate(scalars):
|
||||
check = central_difference(f, *vals, arg=i)
|
||||
print(str([x.data for x in scalars]), x.derivative, i, check)
|
||||
np.testing.assert_allclose(
|
||||
x.derivative,
|
||||
check.data,
|
||||
1e-2,
|
||||
1e-2,
|
||||
err_msg=err_msg
|
||||
% (str([x.data for x in scalars]), x.derivative, i, check.data),
|
||||
)
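# Hedged usage sketch (not part of the original file): verify autodiff for a
# small composite of Scalars against the central difference.
#
#   derivative_check(lambda x, y: x * y + x, Scalar(2.0), Scalar(3.0))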
|
||||
@@ -0,0 +1,239 @@
|
||||
"""
|
||||
Implementation of the core Tensor object for autodifferentiation.
|
||||
"""
|
||||
|
||||
from .autodiff import Variable
|
||||
from .tensor_data import TensorData
|
||||
from . import operators
|
||||
|
||||
|
||||
# This class is very similar to Scalar so we implemented it for you.
|
||||
|
||||
|
||||
class Tensor(Variable):
|
||||
"""
|
||||
Tensor is a generalization of Scalar in that it is a Variable that
|
||||
handles multidimensional arrays.
|
||||
|
||||
Attributes:
|
||||
|
||||
_tensor (:class:`TensorData`) : the tensor data storage
|
||||
backend : backend object used to implement tensor math (see `tensor_functions.py`)
|
||||
"""
|
||||
|
||||
def __init__(self, v, back=None, name=None, backend=None):
|
||||
assert isinstance(v, TensorData)
|
||||
assert backend is not None
|
||||
super().__init__(back, name=name)
|
||||
self._tensor = v
|
||||
self.backend = backend
|
||||
|
||||
def to_numpy(self):
|
||||
"""
|
||||
Returns:
|
||||
ndarray : converted to numpy array
|
||||
"""
|
||||
return self.contiguous()._tensor._storage.reshape(self.shape)
|
||||
|
||||
# Properties
|
||||
@property
|
||||
def shape(self):
|
||||
"""
|
||||
Returns:
|
||||
tuple : shape of the tensor
|
||||
"""
|
||||
return self._tensor.shape
|
||||
|
||||
@property
|
||||
def size(self):
|
||||
"""
|
||||
Returns:
|
||||
int : size of the tensor
|
||||
"""
|
||||
return self._tensor.size
|
||||
|
||||
@property
|
||||
def dims(self):
|
||||
"""
|
||||
Returns:
|
||||
int : dimensionality of the tensor
|
||||
"""
|
||||
return self._tensor.dims
|
||||
|
||||
def _ensure_tensor(self, b):
|
||||
"Turns a python number into a tensor with the same backend."
|
||||
if isinstance(b, (int, float)):
|
||||
b = Tensor.make([b], (1,), backend=self.backend)
|
||||
else:
|
||||
b._type_(self.backend)
|
||||
return b
|
||||
|
||||
# Functions
|
||||
def __add__(self, b):
|
||||
return self.backend.Add.apply(self, self._ensure_tensor(b))
|
||||
|
||||
def __sub__(self, b):
|
||||
return self.backend.Add.apply(self, -self._ensure_tensor(b))
|
||||
|
||||
def __mul__(self, b):
|
||||
return self.backend.Mul.apply(self, self._ensure_tensor(b))
|
||||
|
||||
def __truediv__(self, b):
|
||||
return self.backend.Mul.apply(
|
||||
self, self.backend.Inv.apply(self._ensure_tensor(b))
|
||||
)
|
||||
|
||||
def __rtruediv__(self, b):
|
||||
return self.backend.Mul.apply(
|
||||
self._ensure_tensor(b), self.backend.Inv.apply(self)
|
||||
)
|
||||
|
||||
def __matmul__(self, b):
|
||||
"Not used until Module 3"
|
||||
return self.backend.MatMul.apply(self, b)
|
||||
|
||||
def __lt__(self, b):
|
||||
return self.backend.LT.apply(self, self._ensure_tensor(b))
|
||||
|
||||
def __eq__(self, b):
|
||||
return self.backend.EQ.apply(self, self._ensure_tensor(b))
|
||||
|
||||
def __gt__(self, b):
|
||||
return self.backend.LT.apply(self._ensure_tensor(b), self)
|
||||
|
||||
def __neg__(self):
|
||||
return self.backend.Neg.apply(self)
|
||||
|
||||
def all(self, dim=None):
|
||||
return self.backend.All.apply(self, dim)
|
||||
|
||||
def is_close(self, y):
|
||||
return self.backend.IsClose.apply(self, y)
|
||||
|
||||
def sigmoid(self):
|
||||
return self.backend.Sigmoid.apply(self)
|
||||
|
||||
def relu(self):
|
||||
return self.backend.ReLU.apply(self)
|
||||
|
||||
def log(self):
|
||||
return self.backend.Log.apply(self)
|
||||
|
||||
def item(self):
|
||||
assert self.size == 1
|
||||
return self[0]
|
||||
|
||||
def exp(self):
|
||||
return self.backend.Exp.apply(self)
|
||||
|
||||
def sum(self, dim=None):
|
||||
"Compute the sum over dimension `dim`"
|
||||
return self.backend.Sum.apply(self, dim)
|
||||
|
||||
def mean(self, dim=None):
|
||||
"Compute the mean over dimension `dim`"
|
||||
if dim is not None:
|
||||
return self.sum(dim) / self.shape[dim]
|
||||
else:
|
||||
return self.sum() / self.size
|
||||
|
||||
def permute(self, *order):
|
||||
"Permute tensor dimensions to *order"
|
||||
return self.backend.Permute.apply(self, order)
|
||||
|
||||
def view(self, *shape):
|
||||
"Change the shape of the tensor to a new shape with the same size"
|
||||
return self.backend.View.apply(self, shape)
|
||||
|
||||
def contiguous(self):
|
||||
"Return a contiguous tensor with the same data"
|
||||
return self.backend.Copy.apply(self)
|
||||
|
||||
def __repr__(self):
|
||||
return self._tensor.to_string()
|
||||
|
||||
def __getitem__(self, key):
|
||||
return self._tensor.get(key)
|
||||
|
||||
def __setitem__(self, key, val):
|
||||
self._tensor.set(key, val)
|
||||
|
||||
@property
|
||||
def grad(self):
|
||||
return self.derivative
|
||||
|
||||
# Internal methods used for autodiff.
|
||||
def _type_(self, backend):
|
||||
self.backend = backend
|
||||
if backend.cuda: # pragma: no cover
|
||||
self._tensor.to_cuda_()
|
||||
|
||||
def _new(self, tensor_data):
|
||||
return Tensor(tensor_data, backend=self.backend)
|
||||
|
||||
@staticmethod
|
||||
def make(storage, shape, strides=None, backend=None):
|
||||
"Create a new tensor from data"
|
||||
return Tensor(TensorData(storage, shape, strides), backend=backend)
|
||||
|
||||
def expand(self, other):
|
||||
"""
|
||||
Method used to allow for backprop over broadcasting.
|
||||
This method is called when the output of `backward`
|
||||
is a different size than the input of `forward`.
|
||||
|
||||
|
||||
Parameters:
|
||||
other (:class:`Tensor`): backward tensor (must broadcast with self)
|
||||
|
||||
Returns:
|
||||
Expanded version of `other` with the right derivatives
|
||||
|
||||
"""
|
||||
|
||||
# Case 1: Both the same shape.
|
||||
if self.shape == other.shape:
|
||||
return other
|
||||
|
||||
# Case 2: Backward is smaller than self. Broadcast up.
|
||||
true_shape = TensorData.shape_broadcast(self.shape, other.shape)
|
||||
buf = self.zeros(true_shape)
|
||||
self.backend._id_map(other, out=buf)
|
||||
if self.shape == true_shape:
|
||||
return buf
|
||||
|
||||
# Case 3: Still different, reduce extra dims.
|
||||
out = buf
|
||||
orig_shape = [1] * (len(out.shape) - len(self.shape)) + list(self.shape)
|
||||
for dim, shape in enumerate(out.shape):
|
||||
if orig_shape[dim] == 1 and shape != 1:
|
||||
out = self.backend._add_reduce(out, dim)
|
||||
assert out.size == self.size, f"{out.shape} {self.shape}"
|
||||
# START CODE CHANGE (2021)
|
||||
return Tensor.make(out._tensor._storage, self.shape, backend=self.backend)
|
||||
# END CODE CHANGE (2021)
|
||||
|
||||
def zeros(self, shape=None):
|
||||
def zero(shape):
|
||||
return Tensor.make(
|
||||
[0] * int(operators.prod(shape)), shape, backend=self.backend
|
||||
)
|
||||
|
||||
if shape is None:
|
||||
out = zero(self.shape)
|
||||
else:
|
||||
out = zero(shape)
|
||||
out._type_(self.backend)
|
||||
return out
|
||||
|
||||
def tuple(self):
|
||||
return self._tensor.tuple()
|
||||
|
||||
def get_data(self):
|
||||
return Tensor(self._tensor, backend=self.backend)
|
||||
|
||||
def backward(self, grad_output=None):
|
||||
if grad_output is None:
|
||||
assert self.shape == (1,), "Must provide grad_output if non-scalar"
|
||||
grad_output = Tensor.make([1.0], (1,), backend=self.backend)
|
||||
super().backward(grad_output)
|
||||
@@ -0,0 +1,243 @@
|
||||
import random
|
||||
from .operators import prod
|
||||
from numpy import array, float64, ndarray
|
||||
import numba
|
||||
|
||||
MAX_DIMS = 32
|
||||
|
||||
|
||||
class IndexingError(RuntimeError):
|
||||
"Exception raised for indexing errors."
|
||||
pass
|
||||
|
||||
|
||||
def index_to_position(index, strides):
|
||||
"""
|
||||
Converts a multidimensional tensor `index` into a single-dimensional position in
|
||||
storage based on strides.
|
||||
|
||||
Args:
|
||||
index (array-like): index tuple of ints
|
||||
strides (array-like): tensor strides
|
||||
|
||||
Returns:
|
||||
int : position in storage
|
||||
"""
|
||||
|
||||
assert len(index) == len(strides)
|
||||
return sum([i * s for i, s in zip(index, strides)])
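# Worked example (not part of the original file): with strides (4, 1), the
# index (1, 2) of a 3 x 4 tensor sits at storage position 1 * 4 + 2 * 1 = 6.
#
#   assert index_to_position([1, 2], [4, 1]) == 6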
|
||||
|
||||
|
||||
def to_index(ordinal, shape, out_index):
|
||||
"""
|
||||
Convert an `ordinal` to an index in the `shape`.
|
||||
Should ensure that enumerating position 0 ... size of a
|
||||
tensor produces every index exactly once. It
|
||||
may not be the inverse of `index_to_position`.
|
||||
|
||||
Args:
|
||||
ordinal (int): ordinal position to convert.
|
||||
shape (tuple): tensor shape.
|
||||
out_index (array): the index corresponding to position.
|
||||
|
||||
Returns:
|
||||
None : Fills in `out_index`.
|
||||
|
||||
"""
|
||||
|
||||
for i, s in enumerate(shape):
    divisor = int(prod(shape[i:])) // int(s)
    index = int(ordinal // divisor)
    ordinal -= index * divisor
    out_index[i] = index
|
||||
|
||||
|
||||
def broadcast_index(big_index, big_shape, shape, out_index):
|
||||
"""
|
||||
Convert a `big_index` into `big_shape` to a smaller `out_index`
|
||||
into `shape` following broadcasting rules. In this case
|
||||
it may be larger or with more dimensions than the `shape`
|
||||
given. Additional dimensions may need to be mapped to 0 or
|
||||
removed.
|
||||
|
||||
Args:
|
||||
big_index (array-like): multidimensional index of bigger tensor
|
||||
big_shape (array-like): tensor shape of bigger tensor
|
||||
shape (array-like): tensor shape of smaller tensor
|
||||
out_index (array-like): multidimensional index of smaller tensor
|
||||
|
||||
Returns:
|
||||
None : Fills in `out_index`.
|
||||
"""
|
||||
|
||||
for i in range(len(shape)):
|
||||
out_index[i] = big_index[len(big_shape) - len(shape) + i] if shape[i] > 1 else 0
|
||||
|
||||
|
||||
def shape_broadcast(shape1, shape2):
|
||||
"""
|
||||
Broadcast two shapes to create a new union shape.
|
||||
|
||||
Args:
|
||||
shape1 (tuple) : first shape
|
||||
shape2 (tuple) : second shape
|
||||
|
||||
Returns:
|
||||
tuple : broadcasted shape
|
||||
|
||||
Raises:
|
||||
IndexingError : if cannot broadcast
|
||||
"""
|
||||
|
||||
out = ()
large, small = (shape1, shape2) if len(shape1) > len(shape2) else (shape2, shape1)
diff = len(large) - len(small)

for i in range(len(large)):
    l = large[i]
    s = l if i < diff else small[i - diff]

    if l != s and l != 1 and s != 1:
        raise IndexingError(f"Cannot broadcast dimensions {l} and {s} in {shape1} and {shape2}.")
    out += (max(l, s),)

return out
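# Broadcasting sketch (not part of the original file): shapes are aligned from
# the right and size-1 dimensions stretch to match.
#
#   assert shape_broadcast((2, 1, 3), (4, 1)) == (2, 4, 3)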
|
||||
|
||||
|
||||
def strides_from_shape(shape):
|
||||
layout = [1]
|
||||
offset = 1
|
||||
for s in reversed(shape):
|
||||
layout.append(s * offset)
|
||||
offset = s * offset
|
||||
return tuple(reversed(layout[:-1]))
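# Worked example (not part of the original file): a contiguous (2, 3, 4)
# tensor has strides (12, 4, 1), since one step along dimension 0 skips
# 3 * 4 = 12 storage positions.
#
#   assert strides_from_shape((2, 3, 4)) == (12, 4, 1)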
|
||||
|
||||
|
||||
class TensorData:
|
||||
def __init__(self, storage, shape, strides=None):
|
||||
if isinstance(storage, ndarray):
|
||||
self._storage = storage
|
||||
else:
|
||||
self._storage = array(storage, dtype=float64)
|
||||
|
||||
if strides is None:
|
||||
strides = strides_from_shape(shape)
|
||||
|
||||
assert isinstance(strides, tuple), "Strides must be tuple"
|
||||
assert isinstance(shape, tuple), "Shape must be tuple"
|
||||
if len(strides) != len(shape):
|
||||
raise IndexingError(f"Len of strides {strides} must match {shape}.")
|
||||
self._strides = array(strides)
|
||||
self._shape = array(shape)
|
||||
self.strides = strides
|
||||
self.dims = len(strides)
|
||||
self.size = int(prod(shape))
|
||||
self.shape = shape
|
||||
assert len(self._storage) == self.size
|
||||
|
||||
def to_cuda_(self): # pragma: no cover
|
||||
if not numba.cuda.is_cuda_array(self._storage):
|
||||
self._storage = numba.cuda.to_device(self._storage)
|
||||
|
||||
def is_contiguous(self):
|
||||
"""
|
||||
Check that the layout is contiguous, i.e. outer dimensions have bigger strides than inner dimensions.
|
||||
|
||||
Returns:
|
||||
bool : True if contiguous
|
||||
"""
|
||||
last = 1e9
|
||||
for stride in self._strides:
|
||||
if stride > last:
|
||||
return False
|
||||
last = stride
|
||||
return True
|
||||
|
||||
@staticmethod
|
||||
def shape_broadcast(shape_a, shape_b):
|
||||
return shape_broadcast(shape_a, shape_b)
|
||||
|
||||
def index(self, index):
|
||||
if isinstance(index, int):
|
||||
index = array([index])
|
||||
if isinstance(index, tuple):
|
||||
index = array(index)
|
||||
|
||||
# Check for errors
|
||||
if index.shape[0] != len(self.shape):
|
||||
raise IndexingError(f"Index {index} must be size of {self.shape}.")
|
||||
for i, ind in enumerate(index):
|
||||
if ind >= self.shape[i]:
|
||||
raise IndexingError(f"Index {index} out of range {self.shape}.")
|
||||
if ind < 0:
|
||||
raise IndexingError(f"Negative indexing for {index} not supported.")
|
||||
|
||||
# Call fast indexing.
|
||||
return index_to_position(array(index), self._strides)
|
||||
|
||||
def indices(self):
|
||||
lshape = array(self.shape)
|
||||
out_index = array(self.shape)
|
||||
for i in range(self.size):
|
||||
to_index(i, lshape, out_index)
|
||||
yield tuple(out_index)
|
||||
|
||||
def sample(self):
|
||||
return tuple((random.randint(0, s - 1) for s in self.shape))
|
||||
|
||||
def get(self, key):
|
||||
return self._storage[self.index(key)]
|
||||
|
||||
def set(self, key, val):
|
||||
self._storage[self.index(key)] = val
|
||||
|
||||
def tuple(self):
|
||||
return (self._storage, self._shape, self._strides)
|
||||
|
||||
def permute(self, *order):
|
||||
"""
|
||||
Permute the dimensions of the tensor.
|
||||
|
||||
Args:
|
||||
order (list): a permutation of the dimensions
|
||||
|
||||
Returns:
|
||||
:class:`TensorData`: a new TensorData with the same storage and a new dimension order.
|
||||
"""
|
||||
assert list(sorted(order)) == list(
|
||||
range(len(self.shape))
|
||||
), f"Must give a position to each dimension. Shape: {self.shape} Order: {order}"
|
||||
|
||||
return TensorData(
|
||||
self._storage,
|
||||
tuple(self.shape[x] for x in order),
|
||||
tuple(self.strides[x] for x in order),
|
||||
)
|
||||
|
||||
def to_string(self):
|
||||
s = ""
|
||||
for index in self.indices():
|
||||
l = ""
|
||||
for i in range(len(index) - 1, -1, -1):
|
||||
if index[i] == 0:
|
||||
l = "\n%s[" % ("\t" * i) + l
|
||||
else:
|
||||
break
|
||||
s += l
|
||||
v = self.get(index)
|
||||
s += f"{v:3.2f}"
|
||||
l = ""
|
||||
for i in range(len(index) - 1, -1, -1):
|
||||
if index[i] == self.shape[i] - 1:
|
||||
l += "]"
|
||||
else:
|
||||
break
|
||||
if l:
|
||||
s += l
|
||||
else:
|
||||
s += " "
|
||||
return s
|
||||
@@ -0,0 +1,410 @@
|
||||
"""
|
||||
Implementation of the autodifferentiation Functions for Tensor.
|
||||
"""
|
||||
|
||||
|
||||
from .autodiff import FunctionBase
|
||||
from .tensor_ops import TensorOps
|
||||
import numpy as np
|
||||
from . import operators
|
||||
from .tensor import Tensor
|
||||
import random
|
||||
|
||||
|
||||
# Constructors
|
||||
class Function(FunctionBase):
|
||||
data_type = Tensor
|
||||
|
||||
@staticmethod
|
||||
def variable(data, back):
|
||||
return Tensor(data[0], back, backend=data[1])
|
||||
|
||||
@staticmethod
|
||||
def data(a):
|
||||
return (a._tensor, a.backend)
|
||||
|
||||
|
||||
def make_tensor_backend(tensor_ops, is_cuda=False):
|
||||
"""
|
||||
Dynamically construct a tensor backend based on a `tensor_ops` object
|
||||
that implements map, zip, and reduce higher-order functions.
|
||||
|
||||
Args:
|
||||
tensor_ops (:class:`TensorOps`) : tensor operations object see `tensor_ops.py`
|
||||
is_cuda (bool) : is the operations object CUDA / GPU based
|
||||
|
||||
Returns :
|
||||
backend : a collection of tensor functions
|
||||
|
||||
"""
|
||||
# Maps
|
||||
neg_map = tensor_ops.map(operators.neg)
|
||||
sigmoid_map = tensor_ops.map(operators.sigmoid)
|
||||
relu_map = tensor_ops.map(operators.relu)
|
||||
log_map = tensor_ops.map(operators.log)
|
||||
exp_map = tensor_ops.map(operators.exp)
|
||||
id_map = tensor_ops.map(operators.id)
|
||||
inv_map = tensor_ops.map(operators.inv)
|
||||
|
||||
# Zips
|
||||
add_zip = tensor_ops.zip(operators.add)
|
||||
mul_zip = tensor_ops.zip(operators.mul)
|
||||
lt_zip = tensor_ops.zip(operators.lt)
|
||||
eq_zip = tensor_ops.zip(operators.eq)
|
||||
is_close_zip = tensor_ops.zip(operators.is_close)
|
||||
relu_back_zip = tensor_ops.zip(operators.relu_back)
|
||||
log_back_zip = tensor_ops.zip(operators.log_back)
|
||||
inv_back_zip = tensor_ops.zip(operators.inv_back)
|
||||
|
||||
# Reduce
|
||||
add_reduce = tensor_ops.reduce(operators.add, 0.0)
|
||||
mul_reduce = tensor_ops.reduce(operators.mul, 1.0)
|
||||
|
||||
class Backend:
|
||||
cuda = is_cuda
|
||||
_id_map = id_map
|
||||
_add_reduce = add_reduce
|
||||
|
||||
class Neg(Function):
|
||||
@staticmethod
|
||||
def forward(ctx, t1):
|
||||
return neg_map(t1)
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, grad_output):
|
||||
return neg_map(grad_output)
|
||||
|
||||
class Inv(Function):
|
||||
@staticmethod
|
||||
def forward(ctx, t1):
|
||||
ctx.save_for_backward(t1)
|
||||
return inv_map(t1)
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, grad_output):
|
||||
t1 = ctx.saved_values
|
||||
return inv_back_zip(t1, grad_output)
|
||||
|
||||
class Add(Function):
|
||||
@staticmethod
|
||||
def forward(ctx, t1, t2):
|
||||
return add_zip(t1, t2)
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, grad_output):
|
||||
return grad_output, grad_output
|
||||
|
||||
class Mul(Function):
|
||||
@staticmethod
|
||||
def forward(ctx, a, b):
|
||||
ctx.save_for_backward(a, b)
|
||||
return mul_zip(a, b)
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, grad_output):
|
||||
a, b = ctx.saved_values
|
||||
return b * grad_output, a * grad_output
|
||||
|
||||
class Sigmoid(Function):
|
||||
@staticmethod
|
||||
def forward(ctx, a):
|
||||
ctx.save_for_backward(a)
|
||||
return sigmoid_map(a)
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, grad_output):
|
||||
a = ctx.saved_values
|
||||
return add_zip(
|
||||
mul_zip(grad_output, sigmoid_map(a)),
|
||||
neg_map(
|
||||
mul_zip(grad_output, mul_zip(sigmoid_map(a), sigmoid_map(a)))
|
||||
),
|
||||
)
|
||||
|
||||
class ReLU(Function):
|
||||
@staticmethod
|
||||
def forward(ctx, a):
|
||||
ctx.save_for_backward(a)
|
||||
return relu_map(a)
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, grad_output):
|
||||
a = ctx.saved_values
|
||||
return relu_back_zip(a, grad_output)
|
||||
|
||||
class Log(Function):
|
||||
@staticmethod
|
||||
def forward(ctx, a):
|
||||
ctx.save_for_backward(a)
|
||||
return log_map(a)
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, grad_output):
|
||||
a = ctx.saved_values
|
||||
return log_back_zip(a, grad_output)
|
||||
|
||||
class Exp(Function):
|
||||
@staticmethod
|
||||
def forward(ctx, a):
|
||||
ctx.save_for_backward(a)
|
||||
return exp_map(a)
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, grad_output):
|
||||
a = ctx.saved_values
|
||||
return mul_zip(exp_map(a), grad_output)
|
||||
|
||||
class Sum(Function):
|
||||
@staticmethod
|
||||
def forward(ctx, a, dim):
|
||||
ctx.save_for_backward(a.shape, dim)
|
||||
if dim is not None:
|
||||
return add_reduce(a, dim)
|
||||
else:
|
||||
return add_reduce(
|
||||
a.contiguous().view(int(operators.prod(a.shape))), 0
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, grad_output):
|
||||
a_shape, dim = ctx.saved_values
|
||||
if dim is None:
|
||||
out = grad_output.zeros(a_shape)
|
||||
out._tensor._storage[:] = grad_output[0]
|
||||
return out
|
||||
else:
|
||||
return grad_output
|
||||
|
||||
class All(Function):
|
||||
@staticmethod
|
||||
def forward(ctx, a, dim):
|
||||
if dim is not None:
|
||||
return mul_reduce(a, dim)
|
||||
else:
|
||||
return mul_reduce(
|
||||
a.contiguous().view(int(operators.prod(a.shape))), 0
|
||||
)
|
||||
|
||||
class LT(Function):
|
||||
@staticmethod
|
||||
def forward(ctx, a, b):
|
||||
return lt_zip(a, b)
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, grad_output):
|
||||
return (
|
||||
grad_output.zeros(grad_output.shape),
|
||||
grad_output.zeros(grad_output.shape),
|
||||
)
|
||||
|
||||
class EQ(Function):
|
||||
@staticmethod
|
||||
def forward(ctx, a, b):
|
||||
return eq_zip(a, b)
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, grad_output):
|
||||
return (
|
||||
grad_output.zeros(grad_output.shape),
|
||||
grad_output.zeros(grad_output.shape),
|
||||
)
|
||||
|
||||
class IsClose(Function):
|
||||
@staticmethod
|
||||
def forward(ctx, a, b):
|
||||
return is_close_zip(a, b)
|
||||
|
||||
class Permute(Function):
|
||||
@staticmethod
|
||||
def forward(ctx, a, order):
|
||||
ctx.save_for_backward(a, order)
|
||||
tens_store = a._tensor.permute(*order)
|
||||
return Tensor.make(
|
||||
tens_store._storage,
|
||||
tens_store.shape,
|
||||
tens_store.strides,
|
||||
backend=a.backend,
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, grad_output):
|
||||
a, order = ctx.saved_values
|
||||
return Tensor.make(
|
||||
grad_output._tensor._storage,
|
||||
a._tensor.shape,
|
||||
a._tensor.strides,
|
||||
backend=grad_output.backend,
|
||||
)
|
||||
|
||||
class View(Function):
|
||||
@staticmethod
|
||||
def forward(ctx, a, shape):
|
||||
ctx.save_for_backward(a.shape)
|
||||
assert a._tensor.is_contiguous(), "Must be contiguous to view"
|
||||
return Tensor.make(a._tensor._storage, shape, backend=a.backend)
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, grad_output):
|
||||
original = ctx.saved_values
|
||||
return Tensor.make(
|
||||
grad_output._tensor._storage, original, backend=grad_output.backend
|
||||
)
|
||||
|
||||
class Copy(Function):
|
||||
@staticmethod
|
||||
def forward(ctx, a):
|
||||
return id_map(a)
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, grad_output):
|
||||
return grad_output
|
||||
|
||||
class MatMul(Function):
|
||||
@staticmethod
|
||||
def forward(ctx, t1, t2):
|
||||
ctx.save_for_backward(t1, t2)
|
||||
return tensor_ops.matrix_multiply(t1, t2)
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, grad_output):
|
||||
t1, t2 = ctx.saved_values
|
||||
|
||||
def transpose(a):
|
||||
order = list(range(a.dims))
|
||||
order[-2], order[-1] = order[-1], order[-2]
|
||||
return a._new(a._tensor.permute(*order))
|
||||
|
||||
return (
|
||||
tensor_ops.matrix_multiply(grad_output, transpose(t2)),
|
||||
tensor_ops.matrix_multiply(transpose(t1), grad_output),
|
||||
)
|
||||
|
||||
return Backend
|
||||
|
||||
|
||||
TensorFunctions = make_tensor_backend(TensorOps)
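# Hedged sketch (not part of the original file): a GPU backend would be built
# the same way from a CUDA ops object, e.g.
#
#   # CudaTensorFunctions = make_tensor_backend(CudaOps, is_cuda=True)
#   # (CudaOps is assumed to come from cuda_ops.py)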
|
||||
|
||||
|
||||
# Helpers for Constructing tensors
|
||||
def zeros(shape, backend=TensorFunctions):
|
||||
"""
|
||||
Produce a zero tensor of size `shape`.
|
||||
|
||||
Args:
|
||||
shape (tuple): shape of tensor
|
||||
backend (:class:`Backend`): tensor backend
|
||||
|
||||
Returns:
|
||||
:class:`Tensor` : new tensor
|
||||
"""
|
||||
return Tensor.make([0] * int(operators.prod(shape)), shape, backend=backend)
|
||||
|
||||
|
||||
def rand(shape, backend=TensorFunctions, requires_grad=False):
|
||||
"""
|
||||
Produce a random tensor of size `shape`.
|
||||
|
||||
Args:
|
||||
shape (tuple): shape of tensor
|
||||
backend (:class:`Backend`): tensor backend
|
||||
requires_grad (bool): turn on autodifferentiation
|
||||
|
||||
Returns:
|
||||
:class:`Tensor` : new tensor
|
||||
"""
|
||||
vals = [random.random() for _ in range(int(operators.prod(shape)))]
|
||||
tensor = Tensor.make(vals, shape, backend=backend)
|
||||
tensor.requires_grad_(requires_grad)
|
||||
return tensor
|
||||
|
||||
|
||||
def _tensor(ls, shape=None, backend=TensorFunctions, requires_grad=False):
|
||||
"""
|
||||
Produce a tensor with data ls and shape `shape`.
|
||||
|
||||
Args:
|
||||
ls (list): data for tensor
|
||||
shape (tuple): shape of tensor
|
||||
backend (:class:`Backend`): tensor backend
|
||||
requires_grad (bool): turn on autodifferentiation
|
||||
|
||||
Returns:
|
||||
:class:`Tensor` : new tensor
|
||||
"""
|
||||
tensor = Tensor.make(ls, shape, backend=backend)
|
||||
tensor.requires_grad_(requires_grad)
|
||||
return tensor
|
||||
|
||||
|
||||
def tensor(ls, backend=TensorFunctions, requires_grad=False):
|
||||
"""
|
||||
Produce a tensor with data and shape from ls
|
||||
|
||||
Args:
|
||||
ls (list): data for tensor
|
||||
backend (:class:`Backend`): tensor backend
|
||||
requires_grad (bool): turn on autodifferentiation
|
||||
|
||||
Returns:
|
||||
:class:`Tensor` : new tensor
|
||||
"""
|
||||
|
||||
def shape(ls):
|
||||
if isinstance(ls, (list, tuple)):
|
||||
return [len(ls)] + shape(ls[0])
|
||||
else:
|
||||
return []
|
||||
|
||||
def flatten(ls):
|
||||
if isinstance(ls, (list, tuple)):
|
||||
return [y for x in ls for y in flatten(x)]
|
||||
else:
|
||||
return [ls]
|
||||
|
||||
cur = flatten(ls)
|
||||
shape = shape(ls)
|
||||
return _tensor(cur, tuple(shape), backend=backend, requires_grad=requires_grad)
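# Usage sketch (not part of the original file): `tensor` infers the shape from
# nested lists and flattens the data.
#
#   t = tensor([[1, 2, 3], [4, 5, 6]])
#   assert t.shape == (2, 3)
#   assert t[1, 2] == 6.0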
|
||||
|
||||
|
||||
# Gradient check for tensors
|
||||
|
||||
|
||||
def grad_central_difference(f, *vals, arg=0, epsilon=1e-6, ind=None):
|
||||
x = vals[arg]
|
||||
up = zeros(x.shape)
|
||||
up[ind] = epsilon
|
||||
vals1 = [x if j != arg else x + up for j, x in enumerate(vals)]
|
||||
vals2 = [x if j != arg else x - up for j, x in enumerate(vals)]
|
||||
delta = f(*vals1).sum() - f(*vals2).sum()
|
||||
|
||||
return delta[0] / (2.0 * epsilon)
|
||||
|
||||
|
||||
def grad_check(f, *vals):
|
||||
for x in vals:
|
||||
x.requires_grad_(True)
|
||||
x.zero_grad_()
|
||||
random.seed(10)
|
||||
out = f(*vals)
|
||||
out.sum().backward()
|
||||
err_msg = """
|
||||
|
||||
Gradient check error for function %s.
|
||||
|
||||
Input %s
|
||||
|
||||
Received derivative %f for argument %d and index %s,
|
||||
but was expecting derivative %f from central difference.
|
||||
|
||||
"""
|
||||
|
||||
for i, x in enumerate(vals):
|
||||
ind = x._tensor.sample()
|
||||
check = grad_central_difference(f, *vals, arg=i, ind=ind)
|
||||
np.testing.assert_allclose(
|
||||
x.grad[ind],
|
||||
check,
|
||||
1e-2,
|
||||
1e-2,
|
||||
err_msg=err_msg % (f, vals, x.grad[ind], i, ind, check),
|
||||
)
|
||||
@@ -0,0 +1,284 @@
|
||||
import numpy as np
|
||||
from .tensor_data import (
|
||||
to_index,
|
||||
index_to_position,
|
||||
broadcast_index,
|
||||
shape_broadcast,
|
||||
MAX_DIMS,
|
||||
)
|
||||
|
||||
|
||||
def tensor_map(fn):
|
||||
"""
|
||||
Low-level implementation of tensor map between
|
||||
tensors with *possibly different strides*.
|
||||
|
||||
Simple version:
|
||||
|
||||
* Fill in the `out` array by applying `fn` to each
|
||||
value of `in_storage` assuming `out_shape` and `in_shape`
|
||||
are the same size.
|
||||
|
||||
Broadcasted version:
|
||||
|
||||
* Fill in the `out` array by applying `fn` to each
|
||||
value of `in_storage` assuming `out_shape` and `in_shape`
|
||||
broadcast. (`in_shape` must be smaller than `out_shape`).
|
||||
|
||||
Args:
|
||||
fn: function from float-to-float to apply
|
||||
out (array): storage for out tensor
|
||||
out_shape (array): shape for out tensor
|
||||
out_strides (array): strides for out tensor
|
||||
in_storage (array): storage for in tensor
|
||||
in_shape (array): shape for in tensor
|
||||
in_strides (array): strides for in tensor
|
||||
|
||||
Returns:
|
||||
None : Fills in `out`
|
||||
"""
|
||||
|
||||
def _map(out, out_shape, out_strides, in_storage, in_shape, in_strides):
|
||||
in_index = np.zeros(len(in_shape), dtype=int)
|
||||
out_index = np.zeros(len(out_shape), dtype=int)
|
||||
|
||||
for i in range(len(out)):
|
||||
to_index(i, out_shape, out_index)
|
||||
broadcast_index(out_index, out_shape, in_shape, in_index)
|
||||
|
||||
x = in_storage[index_to_position(in_index, in_strides)]
|
||||
index = index_to_position(out_index, out_strides)
|
||||
out[index] = fn(x)
|
||||
|
||||
return _map
|
||||
|
||||
|
||||
def map(fn):
|
||||
"""
|
||||
Higher-order tensor map function ::
|
||||
|
||||
fn_map = map(fn)
|
||||
fn_map(a, out)
|
||||
out
|
||||
|
||||
Simple version::
|
||||
|
||||
for i:
|
||||
for j:
|
||||
out[i, j] = fn(a[i, j])
|
||||
|
||||
Broadcasted version (`a` might be smaller than `out`) ::
|
||||
|
||||
for i:
|
||||
for j:
|
||||
out[i, j] = fn(a[i, 0])
|
||||
|
||||
Args:
|
||||
fn: function from float-to-float to apply.
|
||||
a (:class:`TensorData`): tensor to map over
|
||||
out (:class:`TensorData`): optional, tensor data to fill in,
|
||||
should broadcast with `a`
|
||||
|
||||
Returns:
|
||||
:class:`TensorData` : new tensor data
|
||||
"""
|
||||
|
||||
f = tensor_map(fn)
|
||||
|
||||
def ret(a, out=None):
|
||||
if out is None:
|
||||
out = a.zeros(a.shape)
|
||||
f(*out.tuple(), *a.tuple())
|
||||
return out
|
||||
|
||||
return ret
|
||||
|
||||
|
||||
def tensor_zip(fn):
|
||||
"""
|
||||
Low-level implementation of tensor zip between
|
||||
tensors with *possibly different strides*.
|
||||
|
||||
Simple version:
|
||||
|
||||
* Fill in the `out` array by applying `fn` to each
|
||||
value of `a_storage` and `b_storage` assuming `out_shape`
|
||||
and `a_shape` are the same size.
|
||||
|
||||
Broadcasted version:
|
||||
|
||||
* Fill in the `out` array by applying `fn` to each
|
||||
value of `a_storage` and `b_storage` assuming `a_shape`
|
||||
and `b_shape` broadcast to `out_shape`.
|
||||
|
||||
Args:
|
||||
fn: function mapping two floats to float to apply
|
||||
out (array): storage for `out` tensor
|
||||
out_shape (array): shape for `out` tensor
|
||||
out_strides (array): strides for `out` tensor
|
||||
a_storage (array): storage for `a` tensor
|
||||
a_shape (array): shape for `a` tensor
|
||||
a_strides (array): strides for `a` tensor
|
||||
b_storage (array): storage for `b` tensor
|
||||
b_shape (array): shape for `b` tensor
|
||||
b_strides (array): strides for `b` tensor
|
||||
|
||||
Returns:
|
||||
None : Fills in `out`
|
||||
"""
|
||||
|
||||
def _zip(
|
||||
out,
|
||||
out_shape,
|
||||
out_strides,
|
||||
a_storage,
|
||||
a_shape,
|
||||
a_strides,
|
||||
b_storage,
|
||||
b_shape,
|
||||
b_strides,
|
||||
):
|
||||
out_index = np.zeros(len(out_shape), dtype=int)
|
||||
a_in = np.zeros(len(a_shape), dtype=int)
|
||||
b_in = np.zeros(len(b_shape), dtype=int)
|
||||
|
||||
for i in range(len(out)):
|
||||
to_index(i, out_shape, out_index)
|
||||
index = index_to_position(out_index, out_strides)
|
||||
|
||||
broadcast_index(out_index, out_shape, a_shape, a_in)
|
||||
a = a_storage[index_to_position(a_in, a_strides)]
|
||||
|
||||
broadcast_index(out_index, out_shape, b_shape, b_in)
|
||||
b = b_storage[index_to_position(b_in, b_strides)]
|
||||
|
||||
out[index] = fn(a, b)
|
||||
|
||||
return _zip
|
||||
|
||||
|
||||
def zip(fn):
|
||||
"""
|
||||
Higher-order tensor zip function ::
|
||||
|
||||
fn_zip = zip(fn)
|
||||
out = fn_zip(a, b)
|
||||
|
||||
Simple version ::
|
||||
|
||||
for i:
|
||||
for j:
|
||||
out[i, j] = fn(a[i, j], b[i, j])
|
||||
|
||||
Broadcasted version (`a` and `b` might be smaller than `out`) ::
|
||||
|
||||
for i:
|
||||
for j:
|
||||
out[i, j] = fn(a[i, 0], b[0, j])
|
||||
|
||||
|
||||
Args:
|
||||
fn: function from two floats-to-float to apply
|
||||
a (:class:`TensorData`): tensor to zip over
|
||||
b (:class:`TensorData`): tensor to zip over
|
||||
|
||||
Returns:
|
||||
:class:`TensorData` : new tensor data
|
||||
"""
|
||||
|
||||
f = tensor_zip(fn)
|
||||
|
||||
def ret(a, b):
|
||||
if a.shape != b.shape:
|
||||
c_shape = shape_broadcast(a.shape, b.shape)
|
||||
else:
|
||||
c_shape = a.shape
|
||||
out = a.zeros(c_shape)
|
||||
f(*out.tuple(), *a.tuple(), *b.tuple())
|
||||
return out
|
||||
|
||||
return ret
|
||||
|
||||
|
||||
def tensor_reduce(fn):
|
||||
"""
|
||||
Low-level implementation of tensor reduce.
|
||||
|
||||
* `out_shape` will be the same as `a_shape`
|
||||
except with `reduce_dim` turned to size `1`
|
||||
|
||||
Args:
|
||||
fn: reduction function mapping two floats to float
|
||||
out (array): storage for `out` tensor
|
||||
out_shape (array): shape for `out` tensor
|
||||
out_strides (array): strides for `out` tensor
|
||||
a_storage (array): storage for `a` tensor
|
||||
a_shape (array): shape for `a` tensor
|
||||
a_strides (array): strides for `a` tensor
|
||||
reduce_dim (int): dimension to reduce out
|
||||
|
||||
Returns:
|
||||
None : Fills in `out`
|
||||
"""
|
||||
|
||||
def _reduce(out, out_shape, out_strides, a_storage, a_shape, a_strides, reduce_dim):
|
||||
out_index = np.zeros(len(a_shape), dtype=int)
|
||||
for i in range(len(out)):
|
||||
to_index(i, out_shape, out_index)
|
||||
index = index_to_position(out_index, out_strides)
|
||||
|
||||
for j in range(a_shape[reduce_dim]):
|
||||
a_index = out_index.copy()
|
||||
a_index[reduce_dim] = j
|
||||
|
||||
out[index] = fn(
|
||||
a_storage[index_to_position(a_index, a_strides)], out[index]
|
||||
)
|
||||
|
||||
return _reduce
|
||||
|
||||
|
||||
def reduce(fn, start=0.0):
|
||||
"""
|
||||
Higher-order tensor reduce function. ::
|
||||
|
||||
fn_reduce = reduce(fn)
|
||||
out = fn_reduce(a, dim)
|
||||
|
||||
Simple version ::
|
||||
|
||||
for j:
|
||||
out[1, j] = start
|
||||
for i:
|
||||
out[1, j] = fn(out[1, j], a[i, j])
|
||||
|
||||
|
||||
Args:
|
||||
fn: function from two floats-to-float to apply
|
||||
a (:class:`TensorData`): tensor to reduce over
|
||||
dim (int): int of dim to reduce
|
||||
|
||||
Returns:
|
||||
:class:`TensorData` : new tensor
|
||||
"""
|
||||
|
||||
f = tensor_reduce(fn)
|
||||
|
||||
def ret(a, dim):
|
||||
out_shape = list(a.shape)
|
||||
out_shape[dim] = 1
|
||||
|
||||
# Other values when not sum.
|
||||
out = a.zeros(tuple(out_shape))
|
||||
out._tensor._storage[:] = start
|
||||
|
||||
f(*out.tuple(), *a.tuple(), dim)
|
||||
return out
|
||||
|
||||
return ret
|
||||
|
||||
|
||||
class TensorOps:
|
||||
map = map
|
||||
zip = zip
|
||||
reduce = reduce
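# Usage sketch (not part of the original file): the higher-order ops are
# curried, so a backend builds concrete tensor functions once and reuses them
# (assumes `from . import operators`):
#
#   neg_map = TensorOps.map(operators.neg)
#   add_reduce = TensorOps.reduce(operators.add, 0.0)
#   # neg_map(t) negates `t` elementwise; add_reduce(t, 0) sums out dimension 0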
|
||||
@@ -0,0 +1,192 @@
|
||||
import minitorch.operators as operators
|
||||
|
||||
|
||||
class MathTest:
|
||||
@staticmethod
|
||||
def neg(a):
|
||||
"Negate the argument"
|
||||
return -a
|
||||
|
||||
@staticmethod
|
||||
def addConstant(a):
|
||||
"Add contant to the argument"
|
||||
return 5 + a
|
||||
|
||||
@staticmethod
|
||||
def square(a):
|
||||
"Manual square"
|
||||
return a * a
|
||||
|
||||
@staticmethod
|
||||
def cube(a):
|
||||
"Manual cube"
|
||||
return a * a * a
|
||||
|
||||
@staticmethod
|
||||
def subConstant(a):
|
||||
"Subtract a constant from the argument"
|
||||
return a - 5
|
||||
|
||||
@staticmethod
|
||||
def multConstant(a):
|
||||
"Multiply a constant to the argument"
|
||||
return 5 * a
|
||||
|
||||
@staticmethod
|
||||
def div(a):
|
||||
"Divide by a constant"
|
||||
return a / 5
|
||||
|
||||
@staticmethod
|
||||
def inv(a):
|
||||
"Invert after adding"
|
||||
return operators.inv(a + 3.5)
|
||||
|
||||
@staticmethod
|
||||
def sig(a):
|
||||
"Apply sigmoid"
|
||||
return operators.sigmoid(a)
|
||||
|
||||
@staticmethod
|
||||
def log(a):
|
||||
"Apply log to a large value"
|
||||
return operators.log(a + 100000)
|
||||
|
||||
@staticmethod
|
||||
def relu(a):
|
||||
"Apply relu"
|
||||
return operators.relu(a + 5.5)
|
||||
|
||||
@staticmethod
|
||||
def exp(a):
|
||||
"Apply exp to a smaller value"
|
||||
return operators.exp(a - 200)
|
||||
|
||||
@staticmethod
|
||||
def explog(a):
|
||||
return operators.log(a + 100000) + operators.exp(a - 200)
|
||||
|
||||
@staticmethod
|
||||
def add2(a, b):
|
||||
"Add two arguments"
|
||||
return a + b
|
||||
|
||||
@staticmethod
|
||||
def mul2(a, b):
|
||||
"Mul two arguments"
|
||||
return a * b
|
||||
|
||||
@staticmethod
|
||||
def div2(a, b):
|
||||
"Divide two arguments"
|
||||
return a / (b + 5.5)
|
||||
|
||||
@staticmethod
|
||||
def gt2(a, b):
|
||||
return operators.lt(b, a + 1.2)
|
||||
|
||||
@staticmethod
|
||||
def lt2(a, b):
|
||||
return operators.lt(a + 1.2, b)
|
||||
|
||||
@staticmethod
|
||||
def eq2(a, b):
|
||||
return operators.eq(a, (b + 5.5))
|
||||
|
||||
@staticmethod
|
||||
def sum_red(a):
|
||||
return operators.sum(a)
|
||||
|
||||
@staticmethod
|
||||
def mean_red(a):
|
||||
return operators.sum(a) / float(len(a))
|
||||
|
||||
@staticmethod
|
||||
def mean_full_red(a):
|
||||
return operators.sum(a) / float(len(a))
|
||||
|
||||
@staticmethod
|
||||
def complex(a):
|
||||
return (
|
||||
operators.log(
|
||||
operators.sigmoid(
|
||||
operators.relu(operators.relu(a * 10 + 7) * 6 + 5) * 10
|
||||
)
|
||||
)
|
||||
/ 50
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def _tests(cls):
|
||||
"""
|
||||
Returns a list of all the math tests.
|
||||
"""
|
||||
one_arg = []
|
||||
two_arg = []
|
||||
red_arg = []
|
||||
for k in dir(MathTest):
|
||||
if callable(getattr(MathTest, k)) and not k.startswith("_"):
|
||||
base_fn = getattr(MathTest, k)
|
||||
scalar_fn = getattr(cls, k)
|
||||
tup = (k, base_fn, scalar_fn)
|
||||
if k.endswith("2"):
|
||||
two_arg.append(tup)
|
||||
elif k.endswith("red"):
|
||||
red_arg.append(tup)
|
||||
else:
|
||||
one_arg.append(tup)
|
||||
return one_arg, two_arg, red_arg
|
||||
|
||||
|
||||
class MathTestVariable(MathTest):
|
||||
@staticmethod
|
||||
def inv(a):
|
||||
return 1.0 / (a + 3.5)
|
||||
|
||||
@staticmethod
|
||||
def sig(x):
|
||||
return x.sigmoid()
|
||||
|
||||
@staticmethod
|
||||
def log(x):
|
||||
return (x + 100000).log()
|
||||
|
||||
@staticmethod
|
||||
def relu(x):
|
||||
return (x + 5.5).relu()
|
||||
|
||||
@staticmethod
|
||||
def exp(a):
|
||||
return (a - 200).exp()
|
||||
|
||||
@staticmethod
|
||||
def explog(a):
|
||||
return (a + 100000).log() + (a - 200).exp()
|
||||
|
||||
@staticmethod
|
||||
def sum_red(a):
|
||||
return a.sum(0)
|
||||
|
||||
@staticmethod
|
||||
def mean_red(a):
|
||||
return a.mean(0)
|
||||
|
||||
@staticmethod
|
||||
def mean_full_red(a):
|
||||
return a.mean()
|
||||
|
||||
@staticmethod
|
||||
def eq2(a, b):
|
||||
return a == (b + 5.5)
|
||||
|
||||
@staticmethod
|
||||
def gt2(a, b):
|
||||
return a + 1.2 > b
|
||||
|
||||
@staticmethod
|
||||
def lt2(a, b):
|
||||
return a + 1.2 < b
|
||||
|
||||
@staticmethod
|
||||
def complex(a):
|
||||
return (((a * 10 + 7).relu() * 6 + 5).relu() * 10).sigmoid().log() / 50