@@ -0,0 +1,28 @@
from .datasets import datasets  # noqa: F401,F403
from .testing import MathTest, MathTestVariable  # noqa: F401,F403

# MODULE 0
from .module import *  # noqa: F401,F403

# MODULE 1
from .scalar import *  # noqa: F401,F403
from .autodiff import *  # noqa: F401,F403

# MODULE 2
from .tensor import *  # noqa: F401,F403
from .tensor_data import *  # noqa: F401,F403
from .tensor_functions import *  # noqa: F401,F403
from .tensor_ops import *  # noqa: F401,F403

# MODULE 3
from .fast_ops import *  # noqa: F401,F403
from .cuda_ops import *  # noqa: F401,F403

# MODULE 4
from .nn import *  # noqa: F401,F403
from .fast_conv import *  # noqa: F401,F403

from .optim import *  # noqa: F401,F403

version = "0.2"
@@ -0,0 +1,346 @@
|
||||
from collections import defaultdict
|
||||
|
||||
variable_count = 1
|
||||
|
||||
|
||||
# ## Module 1
|
||||
|
||||
# Variable is the main class for autodifferentiation logic for scalars
|
||||
# and tensors.
|
||||
|
||||
|
||||
class Variable:
|
||||
"""
|
||||
Attributes:
|
||||
history (:class:`History` or None) : the Function calls that created this variable or None if constant
|
||||
derivative (variable type): the derivative with respect to this variable
|
||||
grad (variable type) : alias for derivative, used for tensors
|
||||
name (string) : a globally unique name of the variable
|
||||
"""
|
||||
|
||||
def __init__(self, history, name=None):
|
||||
global variable_count
|
||||
assert history is None or isinstance(history, History), history
|
||||
|
||||
self.history = history
|
||||
self._derivative = None
|
||||
|
||||
        # This is a bit simplistic, but it makes things easier.
|
||||
variable_count += 1
|
||||
self.unique_id = "Variable" + str(variable_count)
|
||||
|
||||
# For debugging can have a name.
|
||||
if name is not None:
|
||||
self.name = name
|
||||
else:
|
||||
self.name = self.unique_id
|
||||
self.used = 0
|
||||
|
||||
def requires_grad_(self, val):
|
||||
"""
|
||||
Set the requires_grad flag to `val` on variable.
|
||||
|
||||
Ensures that operations on this variable will trigger
|
||||
backpropagation.
|
||||
|
||||
Args:
|
||||
val (bool): whether to require grad
|
||||
"""
|
||||
self.history = History()
|
||||
|
||||
def backward(self, d_output=None):
|
||||
"""
|
||||
Calls autodiff to fill in the derivatives for the history of this object.
|
||||
|
||||
Args:
|
||||
d_output (number, opt): starting derivative to backpropagate through the model
|
||||
(typically left out, and assumed to be 1.0).
|
||||
"""
|
||||
if d_output is None:
|
||||
d_output = 1.0
|
||||
backpropagate(self, d_output)
|
||||
|
||||
@property
|
||||
def derivative(self):
|
||||
return self._derivative
|
||||
|
||||
def is_leaf(self):
|
||||
"True if this variable created by the user (no `last_fn`)"
|
||||
return self.history.last_fn is None
|
||||
|
||||
def accumulate_derivative(self, val):
|
||||
"""
|
||||
        Add `val` to the derivative accumulated on this variable.
|
||||
Should only be called during autodifferentiation on leaf variables.
|
||||
|
||||
Args:
|
||||
val (number): value to be accumulated
|
||||
"""
|
||||
assert self.is_leaf(), "Only leaf variables can have derivatives."
|
||||
if self._derivative is None:
|
||||
self._derivative = self.zeros()
|
||||
self._derivative += val
|
||||
|
||||
def zero_derivative_(self): # pragma: no cover
|
||||
"""
|
||||
Reset the derivative on this variable.
|
||||
"""
|
||||
self._derivative = self.zeros()
|
||||
|
||||
def zero_grad_(self): # pragma: no cover
|
||||
"""
|
||||
Reset the derivative on this variable.
|
||||
"""
|
||||
self.zero_derivative_()
|
||||
|
||||
def expand(self, x):
|
||||
"Placeholder for tensor variables"
|
||||
return x
|
||||
|
||||
# Helper functions for children classes.
|
||||
|
||||
def __radd__(self, b):
|
||||
return self + b
|
||||
|
||||
def __rmul__(self, b):
|
||||
return self * b
|
||||
|
||||
def zeros(self):
|
||||
return 0.0
|
||||
|
||||
|
||||
# Some helper functions for handling optional tuples.
|
||||
|
||||
|
||||
def wrap_tuple(x):
|
||||
"Turn a possible value into a tuple"
|
||||
if isinstance(x, tuple):
|
||||
return x
|
||||
return (x,)
|
||||
|
||||
|
||||
def unwrap_tuple(x):
|
||||
"Turn a singleton tuple into a value"
|
||||
if len(x) == 1:
|
||||
return x[0]
|
||||
return x
|
||||
|
||||
|
||||
# Classes for Functions.
|
||||
|
||||
|
||||
class Context:
|
||||
"""
|
||||
Context class is used by `Function` to store information during the forward pass.
|
||||
|
||||
Attributes:
|
||||
no_grad (bool) : do not save gradient information
|
||||
saved_values (tuple) : tuple of values saved for backward pass
|
||||
saved_tensors (tuple) : alias for saved_values
|
||||
"""
|
||||
|
||||
def __init__(self, no_grad=False):
|
||||
self._saved_values = None
|
||||
self.no_grad = no_grad
|
||||
|
||||
def save_for_backward(self, *values):
|
||||
"""
|
||||
Store the given `values` if they need to be used during backpropagation.
|
||||
|
||||
Args:
|
||||
values (list of values) : values to save for backward
|
||||
"""
|
||||
if self.no_grad:
|
||||
return
|
||||
self._saved_values = values
|
||||
|
||||
@property
|
||||
def saved_values(self):
|
||||
assert not self.no_grad, "Doesn't require grad"
|
||||
assert self._saved_values is not None, "Did you forget to save values?"
|
||||
return unwrap_tuple(self._saved_values)
|
||||
|
||||
@property
|
||||
def saved_tensors(self): # pragma: no cover
|
||||
return self.saved_values
|
||||
|
||||
|
||||
class History:
|
||||
"""
|
||||
`History` stores the history of `Function` operations that was
|
||||
used to construct the current Variable.
|
||||
|
||||
Attributes:
|
||||
last_fn (:class:`FunctionBase`) : The last Function that was called.
|
||||
ctx (:class:`Context`): The context for that Function.
|
||||
inputs (list of inputs) : The inputs that were given when `last_fn.forward` was called.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, last_fn=None, ctx=None, inputs=None):
|
||||
self.last_fn = last_fn
|
||||
self.ctx = ctx
|
||||
self.inputs = inputs
|
||||
|
||||
def backprop_step(self, d_output):
|
||||
"""
|
||||
Run one step of backpropagation by calling chain rule.
|
||||
|
||||
Args:
|
||||
d_output : a derivative with respect to this variable
|
||||
|
||||
Returns:
|
||||
list of numbers : a derivative with respect to `inputs`
|
||||
"""
|
||||
|
||||
return self.last_fn.chain_rule(self.ctx, self.inputs, d_output)
|
||||
|
||||
|
||||
class FunctionBase:
|
||||
"""
|
||||
A function that can act on :class:`Variable` arguments to
|
||||
produce a :class:`Variable` output, while tracking the internal history.
|
||||
|
||||
    Called by :func:`FunctionBase.apply`.
|
||||
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
def variable(raw, history):
|
||||
# Implement by children class.
|
||||
raise NotImplementedError()
|
||||
|
||||
@classmethod
|
||||
def apply(cls, *vals):
|
||||
"""
|
||||
Apply is called by the user to run the Function.
|
||||
Internally it does three things:
|
||||
|
||||
a) Creates a Context for the function call.
|
||||
b) Calls forward to run the function.
|
||||
c) Attaches the Context to the History of the new variable.
|
||||
|
||||
There is a bit of internal complexity in our implementation
|
||||
to handle both scalars and tensors.
|
||||
|
||||
Args:
|
||||
vals (list of Variables or constants) : The arguments to forward
|
||||
|
||||
Returns:
|
||||
`Variable` : The new variable produced
|
||||
|
||||
"""
|
||||
# Go through the variables to see if any needs grad.
|
||||
raw_vals = []
|
||||
need_grad = False
|
||||
for v in vals:
|
||||
if isinstance(v, Variable):
|
||||
if v.history is not None:
|
||||
need_grad = True
|
||||
v.used += 1
|
||||
raw_vals.append(v.get_data())
|
||||
else:
|
||||
raw_vals.append(v)
|
||||
|
||||
# Create the context.
|
||||
ctx = Context(not need_grad)
|
||||
|
||||
# Call forward with the variables.
|
||||
c = cls.forward(ctx, *raw_vals)
|
||||
        assert isinstance(c, cls.data_type), "Expected return type %s, got %s" % (
|
||||
cls.data_type,
|
||||
type(c),
|
||||
)
|
||||
|
||||
# Create a new variable from the result with a new history.
|
||||
back = None
|
||||
if need_grad:
|
||||
back = History(cls, ctx, vals)
|
||||
return cls.variable(cls.data(c), back)
|
||||
|
||||
@classmethod
|
||||
def chain_rule(cls, ctx, inputs, d_output):
|
||||
"""
|
||||
Implement the derivative chain-rule.
|
||||
|
||||
Args:
|
||||
ctx (:class:`Context`) : The context from running forward
|
||||
inputs (list of args) : The args that were passed to :func:`FunctionBase.apply` (e.g. :math:`x, y`)
|
||||
d_output (number) : The `d_output` value in the chain rule.
|
||||
|
||||
Returns:
|
||||
list of (`Variable`, number) : A list of non-constant variables with their derivatives
|
||||
(see `is_constant` to remove unneeded variables)
|
||||
|
||||
"""
|
||||
# Tip: Note when implementing this function that
|
||||
# cls.backward may return either a value or a tuple.
|
||||
|
||||
d = cls.backward(ctx, d_output)
|
||||
for i, input in enumerate(inputs):
|
||||
if not is_constant(input):
|
||||
yield (input, d[i] if isinstance(d, tuple) else d)
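
# Illustrative sketch (not part of this file): how a concrete Function is
# typically built on top of FunctionBase. `ScalarFunction` and the scalar
# wrapper it produces are assumed to live elsewhere in the package; the names
# here are for illustration only.
#
#   class Mul(ScalarFunction):
#       @staticmethod
#       def forward(ctx, a, b):
#           ctx.save_for_backward(a, b)
#           return a * b
#
#       @staticmethod
#       def backward(ctx, d_output):
#           a, b = ctx.saved_values
#           return d_output * b, d_output * a
#
#   # Mul.apply(x, y) creates a Context, runs forward, and attaches a History,
#   # so chain_rule can later route d_output back to x and y.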
|
||||
|
||||
|
||||
# Algorithms for backpropagation
|
||||
|
||||
|
||||
def is_constant(val):
|
||||
return not isinstance(val, Variable) or val.history is None
|
||||
|
||||
|
||||
def topological_sort(variable):
|
||||
"""
|
||||
Computes the topological order of the computation graph.
|
||||
|
||||
Args:
|
||||
variable (:class:`Variable`): The right-most variable
|
||||
|
||||
Returns:
|
||||
list of Variables : Non-constant Variables in topological order
|
||||
starting from the right.
|
||||
"""
|
||||
|
||||
sorted = []
|
||||
visited = set()
|
||||
|
||||
def visit(var):
|
||||
if var.unique_id in visited:
|
||||
return
|
||||
if not var.is_leaf():
|
||||
for input in var.history.inputs:
|
||||
if not is_constant(input):
|
||||
visit(input)
|
||||
visited.add(var.unique_id)
|
||||
sorted.insert(0, var)
|
||||
|
||||
visit(variable)
|
||||
return sorted
|
||||
|
||||
|
||||
def backpropagate(variable, deriv):
|
||||
"""
|
||||
Runs backpropagation on the computation graph in order to
|
||||
    compute derivatives for the leaf nodes.
|
||||
|
||||
See :doc:`backpropagate` for details on the algorithm.
|
||||
|
||||
Args:
|
||||
variable (:class:`Variable`): The right-most variable
|
||||
deriv (number) : Its derivative that we want to propagate backward to the leaves.
|
||||
|
||||
    No return. Should write its results to the derivative values of each leaf through `accumulate_derivative`.
|
||||
"""
|
||||
|
||||
sorted = topological_sort(variable)
|
||||
|
||||
d_dict = defaultdict(float)
|
||||
d_dict[variable.unique_id] = deriv
|
||||
for var in sorted:
|
||||
d = d_dict[var.unique_id]
|
||||
if not var.is_leaf():
|
||||
for v, d_part in var.history.backprop_step(d):
|
||||
d_dict[v.unique_id] += d_part
|
||||
else:
|
||||
var.accumulate_derivative(d)
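
# Illustrative sketch of driving backpropagation (Scalar is assumed to be the
# scalar Variable subclass defined in minitorch.scalar):
#
#   x = Scalar(2.0)
#   y = Scalar(3.0)
#   out = x * y + x
#   out.backward()                # calls backpropagate(out, 1.0)
#   assert x.derivative == 4.0    # d(out)/dx = y + 1
#   assert y.derivative == 2.0    # d(out)/dy = x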
|
||||
|
||||
@@ -0,0 +1,512 @@
|
||||
# Credit (not tested): https://github.com/ethanglaser/minitorch/blob/8bfdac956d/module4/minitorch/cuda_ops.py
|
||||
|
||||
from numba import cuda
|
||||
import numba
|
||||
from .tensor_data import (
|
||||
to_index,
|
||||
index_to_position,
|
||||
TensorData,
|
||||
broadcast_index,
|
||||
shape_broadcast,
|
||||
MAX_DIMS,
|
||||
)
|
||||
|
||||
# This code will CUDA compile fast versions of your tensor_data functions.
|
||||
# If you get an error, read the docs for NUMBA as to what is allowed
|
||||
# in these functions.
|
||||
|
||||
to_index = cuda.jit(device=True)(to_index)
|
||||
index_to_position = cuda.jit(device=True)(index_to_position)
|
||||
broadcast_index = cuda.jit(device=True)(broadcast_index)
|
||||
|
||||
THREADS_PER_BLOCK = 32
|
||||
|
||||
|
||||
def tensor_map(fn):
|
||||
"""
|
||||
CUDA higher-order tensor map function. ::
|
||||
|
||||
fn_map = tensor_map(fn)
|
||||
fn_map(out, ... )
|
||||
|
||||
Args:
|
||||
fn: function mappings floats-to-floats to apply.
|
||||
out (array): storage for out tensor.
|
||||
out_shape (array): shape for out tensor.
|
||||
out_strides (array): strides for out tensor.
|
||||
        out_size (int): size for out tensor.
|
||||
in_storage (array): storage for in tensor.
|
||||
in_shape (array): shape for in tensor.
|
||||
in_strides (array): strides for in tensor.
|
||||
|
||||
Returns:
|
||||
None : Fills in `out`
|
||||
"""
|
||||
|
||||
def _map(out, out_shape, out_strides, out_size, in_storage, in_shape, in_strides):
|
||||
out_i = numba.cuda.blockIdx.x * THREADS_PER_BLOCK + numba.cuda.threadIdx.x
|
||||
if out_i < out.size:
|
||||
out_index = numba.cuda.local.array(MAX_DIMS, numba.int32)
|
||||
in_index = numba.cuda.local.array(MAX_DIMS, numba.int32)
|
||||
to_index(out_i, out_shape, out_index)
|
||||
broadcast_index(out_index, out_shape, in_shape, in_index)
|
||||
in_position = index_to_position(in_index, in_strides)
|
||||
out_position = index_to_position(out_index, out_strides)
|
||||
out[out_position] = fn(in_storage[in_position])
|
||||
|
||||
return cuda.jit()(_map)
|
||||
|
||||
|
||||
def map(fn):
|
||||
# CUDA compile your kernel
|
||||
f = tensor_map(cuda.jit(device=True)(fn))
|
||||
|
||||
def ret(a, out=None):
|
||||
if out is None:
|
||||
out = a.zeros(a.shape)
|
||||
|
||||
# Instantiate and run the cuda kernel.
|
||||
threadsperblock = THREADS_PER_BLOCK
|
||||
blockspergrid = (out.size + THREADS_PER_BLOCK - 1) // THREADS_PER_BLOCK
|
||||
f[blockspergrid, threadsperblock](*out.tuple(), out.size, *a.tuple())
|
||||
return out
|
||||
|
||||
return ret
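
# Illustrative usage sketch (assumes a CUDA device and the rest of the
# package; `operators.neg` and a CUDA-backed tensor `a` are assumed):
#
#   neg_map = CudaOps.map(operators.neg)
#   out = neg_map(a)
#
# Each CUDA thread handles one output position, so the kernel is launched with
# ceil(out.size / THREADS_PER_BLOCK) blocks of THREADS_PER_BLOCK threads.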
|
||||
|
||||
|
||||
def tensor_zip(fn):
|
||||
"""
|
||||
CUDA higher-order tensor zipWith (or map2) function ::
|
||||
|
||||
fn_zip = tensor_zip(fn)
|
||||
fn_zip(out, ...)
|
||||
|
||||
Args:
|
||||
fn: function mappings two floats to float to apply.
|
||||
out (array): storage for `out` tensor.
|
||||
out_shape (array): shape for `out` tensor.
|
||||
out_strides (array): strides for `out` tensor.
|
||||
        out_size (int): size for `out` tensor.
|
||||
a_storage (array): storage for `a` tensor.
|
||||
a_shape (array): shape for `a` tensor.
|
||||
a_strides (array): strides for `a` tensor.
|
||||
b_storage (array): storage for `b` tensor.
|
||||
b_shape (array): shape for `b` tensor.
|
||||
b_strides (array): strides for `b` tensor.
|
||||
|
||||
Returns:
|
||||
None : Fills in `out`
|
||||
"""
|
||||
|
||||
def _zip(
|
||||
out,
|
||||
out_shape,
|
||||
out_strides,
|
||||
out_size,
|
||||
a_storage,
|
||||
a_shape,
|
||||
a_strides,
|
||||
b_storage,
|
||||
b_shape,
|
||||
b_strides,
|
||||
):
|
||||
|
||||
out_i = numba.cuda.blockIdx.x * THREADS_PER_BLOCK + numba.cuda.threadIdx.x
|
||||
if out_i < out.size:
|
||||
out_index = numba.cuda.local.array(MAX_DIMS, numba.int32)
|
||||
a_index = numba.cuda.local.array(MAX_DIMS, numba.int32)
|
||||
b_index = numba.cuda.local.array(MAX_DIMS, numba.int32)
|
||||
|
||||
to_index(out_i, out_shape, out_index)
|
||||
broadcast_index(out_index, out_shape, a_shape, a_index)
|
||||
broadcast_index(out_index, out_shape, b_shape, b_index)
|
||||
a_position, b_position = (
|
||||
index_to_position(a_index, a_strides),
|
||||
index_to_position(b_index, b_strides),
|
||||
)
|
||||
out_position = index_to_position(out_index, out_strides)
|
||||
out[out_position] = fn(a_storage[a_position], b_storage[b_position])
|
||||
|
||||
return cuda.jit()(_zip)
|
||||
|
||||
|
||||
def zip(fn):
|
||||
f = tensor_zip(cuda.jit(device=True)(fn))
|
||||
|
||||
def ret(a, b):
|
||||
c_shape = shape_broadcast(a.shape, b.shape)
|
||||
out = a.zeros(c_shape)
|
||||
threadsperblock = THREADS_PER_BLOCK
|
||||
blockspergrid = (out.size + (threadsperblock - 1)) // threadsperblock
|
||||
f[blockspergrid, threadsperblock](
|
||||
*out.tuple(), out.size, *a.tuple(), *b.tuple()
|
||||
)
|
||||
return out
|
||||
|
||||
return ret
|
||||
|
||||
|
||||
def _sum_practice(out, a, size):
|
||||
"""
|
||||
This is a practice sum kernel to prepare for reduce.
|
||||
|
||||
Given an array of length :math:`n` and out of size :math:`n // blockDIM`
|
||||
it should sum up each blockDim values into an out cell.
|
||||
|
||||
[a_1, a_2, ..., a_100]
|
||||
|
||||
|
|
||||
|
||||
[a_1 +...+ a_32, a_32 + ... + a_64, ... ,]
|
||||
|
||||
Note: Each block must do the sum using shared memory!
|
||||
|
||||
Args:
|
||||
out (array): storage for `out` tensor.
|
||||
a (array): storage for `a` tensor.
|
||||
size (int): length of a.
|
||||
|
||||
"""
|
||||
BLOCK_DIM = 32
|
||||
local_idx = numba.cuda.threadIdx.x
|
||||
block_idx = numba.cuda.blockIdx.x
|
||||
shared_block = numba.cuda.shared.array(BLOCK_DIM, numba.float64)
|
||||
offset = 1
|
||||
|
||||
if block_idx * THREADS_PER_BLOCK + local_idx < size:
|
||||
shared_block[local_idx] = a[block_idx * THREADS_PER_BLOCK + local_idx]
|
||||
else:
|
||||
shared_block[local_idx] = 0
|
||||
|
||||
while offset < BLOCK_DIM:
|
||||
numba.cuda.syncthreads()
|
||||
if local_idx % (offset * 2) == 0:
|
||||
shared_block[local_idx] += shared_block[local_idx + offset]
|
||||
offset *= 2
|
||||
|
||||
    numba.cuda.syncthreads()
    if local_idx == 0:
        out[block_idx] = shared_block[0]
|
||||
|
||||
|
||||
jit_sum_practice = cuda.jit()(_sum_practice)
|
||||
|
||||
|
||||
def sum_practice(a):
|
||||
(size,) = a.shape
|
||||
threadsperblock = THREADS_PER_BLOCK
|
||||
blockspergrid = (size // THREADS_PER_BLOCK) + 1
|
||||
out = TensorData([0.0 for i in range(2)], (2,))
|
||||
out.to_cuda_()
|
||||
jit_sum_practice[blockspergrid, threadsperblock](
|
||||
out.tuple()[0], a._tensor._storage, size
|
||||
)
|
||||
return out
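
# Illustrative sketch (hypothetical sizes): with THREADS_PER_BLOCK = 32 and a
# 1-D CUDA tensor `a` of length 50,
#
#   out = sum_practice(a)
#
# launches 2 blocks; block 0 reduces a[0:32] in shared memory and block 1
# reduces a[32:50] (padding with zeros), so the returned TensorData of shape
# (2,) holds two partial sums whose total equals a.sum().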
|
||||
|
||||
|
||||
def tensor_reduce(fn):
|
||||
"""
|
||||
CUDA higher-order tensor reduce function.
|
||||
|
||||
Args:
|
||||
fn: reduction function maps two floats to float.
|
||||
out (array): storage for `out` tensor.
|
||||
out_shape (array): shape for `out` tensor.
|
||||
out_strides (array): strides for `out` tensor.
|
||||
        out_size (int): size for `out` tensor.
|
||||
a_storage (array): storage for `a` tensor.
|
||||
a_shape (array): shape for `a` tensor.
|
||||
a_strides (array): strides for `a` tensor.
|
||||
reduce_dim (int): dimension to reduce out
|
||||
|
||||
Returns:
|
||||
None : Fills in `out`
|
||||
"""
|
||||
|
||||
def _reduce(
|
||||
out,
|
||||
out_shape,
|
||||
out_strides,
|
||||
out_size,
|
||||
a_storage,
|
||||
a_shape,
|
||||
a_strides,
|
||||
reduce_dim,
|
||||
reduce_value,
|
||||
):
|
||||
BLOCK_DIM = 1024
|
||||
reduce_size = a_shape[reduce_dim]
|
||||
local_idx = numba.cuda.threadIdx.x
|
||||
block_idx = numba.cuda.blockIdx.x
|
||||
shared_block = numba.cuda.shared.array(BLOCK_DIM, numba.float64)
|
||||
offset = 1
|
||||
|
||||
out_index = numba.cuda.local.array(MAX_DIMS, numba.int32)
|
||||
to_index(block_idx, out_shape, out_index)
|
||||
out_position = index_to_position(out_index, out_strides)
|
||||
|
||||
if local_idx < reduce_size:
|
||||
out_index[reduce_dim] = local_idx
|
||||
shared_block[local_idx] = a_storage[index_to_position(out_index, a_strides)]
|
||||
else:
|
||||
shared_block[local_idx] = reduce_value
|
||||
|
||||
while offset < BLOCK_DIM:
|
||||
numba.cuda.syncthreads()
|
||||
if local_idx % (offset * 2) == 0:
|
||||
shared_block[local_idx] = fn(
|
||||
shared_block[local_idx], shared_block[local_idx + offset]
|
||||
)
|
||||
offset *= 2
|
||||
|
||||
numba.cuda.syncthreads()
|
||||
if local_idx == 0:
|
||||
out[out_position] = shared_block[local_idx]
|
||||
|
||||
return cuda.jit()(_reduce)
|
||||
|
||||
|
||||
def reduce(fn, start=0.0):
|
||||
"""
|
||||
Higher-order tensor reduce function. ::
|
||||
|
||||
fn_reduce = reduce(fn)
|
||||
out = fn_reduce(a, dim)
|
||||
|
||||
Simple version ::
|
||||
|
||||
for j:
|
||||
out[1, j] = start
|
||||
for i:
|
||||
out[1, j] = fn(out[1, j], a[i, j])
|
||||
|
||||
|
||||
Args:
|
||||
fn: function from two floats-to-float to apply
|
||||
a (:class:`Tensor`): tensor to reduce over
|
||||
dim (int): int of dim to reduce
|
||||
|
||||
Returns:
|
||||
:class:`Tensor` : new tensor
|
||||
"""
|
||||
f = tensor_reduce(cuda.jit(device=True)(fn))
|
||||
|
||||
def ret(a, dim):
|
||||
out_shape = list(a.shape)
|
||||
out_shape[dim] = (a.shape[dim] - 1) // 1024 + 1
|
||||
out_a = a.zeros(tuple(out_shape))
|
||||
|
||||
threadsperblock = 1024
|
||||
blockspergrid = out_a.size
|
||||
f[blockspergrid, threadsperblock](
|
||||
*out_a.tuple(), out_a.size, *a.tuple(), dim, start
|
||||
)
|
||||
|
||||
return out_a
|
||||
|
||||
return ret
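
# Illustrative usage sketch (names assumed from the surrounding package):
#
#   sum_reduce = CudaOps.reduce(operators.add, 0.0)
#   out = sum_reduce(a, 0)    # reduce dimension 0 of a CUDA-backed tensor
#
# One block of 1024 threads is launched per output cell, so a dimension longer
# than 1024 is only partially reduced by a single call (out_shape[dim] keeps
# one cell per 1024-element chunk).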
|
||||
|
||||
|
||||
def _mm_practice(out, a, b, size):
|
||||
"""
|
||||
This is a practice square MM kernel to prepare for matmul.
|
||||
|
||||
Given a storage `out` and two storage `a` and `b`. Where we know
|
||||
both are shape [size, size] with strides [size, 1].
|
||||
|
||||
Size is always < 32.
|
||||
|
||||
Requirements:
|
||||
|
||||
* All data must be first moved to shared memory.
|
||||
* Only read each cell in `a` and `b` once.
|
||||
* Only write to global memory once per kernel.
|
||||
|
||||
Compute ::
|
||||
|
||||
for i:
|
||||
for j:
|
||||
for k:
|
||||
out[i, j] += a[i, k] * b[k, j]
|
||||
|
||||
Args:
|
||||
out (array): storage for `out` tensor.
|
||||
a (array): storage for `a` tensor.
|
||||
b (array): storage for `a` tensor.
|
||||
size (int): size of the square
|
||||
|
||||
"""
|
||||
BLOCK_DIM = 32
|
||||
shared_a = numba.cuda.shared.array((BLOCK_DIM, BLOCK_DIM), numba.float64)
|
||||
shared_b = numba.cuda.shared.array((BLOCK_DIM, BLOCK_DIM), numba.float64)
|
||||
|
||||
y = numba.cuda.threadIdx.y
|
||||
x = numba.cuda.threadIdx.x
|
||||
if x < size and y < size:
|
||||
shared_a[y, x] = a[y * size + x]
|
||||
shared_b[y, x] = b[y * size + x]
|
||||
else:
|
||||
shared_a[y, x] = 0
|
||||
shared_b[y, x] = 0
|
||||
numba.cuda.syncthreads()
|
||||
|
||||
if y < size and x < size:
|
||||
temp = 0
|
||||
for val in range(size):
|
||||
temp += shared_a[y, val] * shared_b[val, x]
|
||||
out[y * size + x] = temp
|
||||
|
||||
|
||||
jit_mm_practice = cuda.jit()(_mm_practice)
|
||||
|
||||
|
||||
def mm_practice(a, b):
|
||||
|
||||
(size, _) = a.shape
|
||||
threadsperblock = (THREADS_PER_BLOCK, THREADS_PER_BLOCK)
|
||||
blockspergrid = 1
|
||||
out = TensorData([0.0 for i in range(size * size)], (size, size))
|
||||
out.to_cuda_()
|
||||
jit_mm_practice[blockspergrid, threadsperblock](
|
||||
out.tuple()[0], a._tensor._storage, b._tensor._storage, size
|
||||
)
|
||||
return out
|
||||
|
||||
|
||||
@cuda.jit()
|
||||
def tensor_matrix_multiply(
|
||||
out,
|
||||
out_shape,
|
||||
out_strides,
|
||||
out_size,
|
||||
a_storage,
|
||||
a_shape,
|
||||
a_strides,
|
||||
b_storage,
|
||||
b_shape,
|
||||
b_strides,
|
||||
):
|
||||
"""
|
||||
CUDA tensor matrix multiply function.
|
||||
|
||||
Requirements:
|
||||
|
||||
* All data must be first moved to shared memory.
|
||||
* Only read each cell in `a` and `b` once.
|
||||
* Only write to global memory once per kernel.
|
||||
|
||||
Should work for any tensor shapes that broadcast as long as ::
|
||||
|
||||
assert a_shape[-1] == b_shape[-2]
|
||||
|
||||
Args:
|
||||
out (array): storage for `out` tensor
|
||||
out_shape (array): shape for `out` tensor
|
||||
out_strides (array): strides for `out` tensor
|
||||
        out_size (int): size for `out` tensor.
|
||||
a_storage (array): storage for `a` tensor
|
||||
a_shape (array): shape for `a` tensor
|
||||
a_strides (array): strides for `a` tensor
|
||||
b_storage (array): storage for `b` tensor
|
||||
b_shape (array): shape for `b` tensor
|
||||
b_strides (array): strides for `b` tensor
|
||||
|
||||
Returns:
|
||||
None : Fills in `out`
|
||||
"""
|
||||
a_batch_stride = a_strides[0] if a_shape[0] > 1 else 0
|
||||
b_batch_stride = b_strides[0] if b_shape[0] > 1 else 0
|
||||
BLOCK_DIM = 32
|
||||
shared_a = numba.cuda.shared.array((BLOCK_DIM, BLOCK_DIM), numba.float64)
|
||||
shared_b = numba.cuda.shared.array((BLOCK_DIM, BLOCK_DIM), numba.float64)
|
||||
|
||||
y = numba.cuda.threadIdx.y
|
||||
x = numba.cuda.threadIdx.x
|
||||
block_x = numba.cuda.blockIdx.x * BLOCK_DIM
|
||||
block_y = numba.cuda.blockIdx.y * BLOCK_DIM
|
||||
z = numba.cuda.blockIdx.z
|
||||
|
||||
temp = 0
|
||||
for block_index in range((a_shape[-1] + (BLOCK_DIM - 1)) // BLOCK_DIM):
|
||||
block_mid = block_index * BLOCK_DIM
|
||||
if (block_mid + x) < a_shape[-1] and (block_y + y) < a_shape[-2]:
|
||||
shared_a[y, x] = a_storage[
|
||||
z * a_batch_stride
|
||||
+ (block_mid + x) * a_strides[-1]
|
||||
+ (block_y + y) * a_strides[-2]
|
||||
]
|
||||
else:
|
||||
shared_a[y, x] = 0
|
||||
if (block_x + x) < b_shape[-1] and (block_mid + y) < b_shape[-2]:
|
||||
shared_b[y, x] = b_storage[
|
||||
z * b_batch_stride
|
||||
+ (block_x + x) * b_strides[-1]
|
||||
+ (block_mid + y) * b_strides[-2]
|
||||
]
|
||||
else:
|
||||
shared_b[y, x] = 0
|
||||
numba.cuda.syncthreads()
|
||||
|
||||
        for val in range(BLOCK_DIM):
            temp += shared_a[y, val] * shared_b[val, x]
        # Make sure all threads finish with this tile before the next loop
        # iteration overwrites shared_a and shared_b.
        numba.cuda.syncthreads()
|
||||
|
||||
if (block_y + y) < out_shape[-2] and (block_x + x) < out_shape[-1]:
|
||||
out[
|
||||
z * out_strides[0]
|
||||
+ (block_y + y) * out_strides[-2]
|
||||
+ (block_x + x) * out_strides[-1]
|
||||
] = temp
|
||||
|
||||
|
||||
def matrix_multiply(a, b):
|
||||
"""
|
||||
Tensor matrix multiply
|
||||
Should work for any tensor shapes that broadcast in the first n-2 dims and
|
||||
have ::
|
||||
assert a.shape[-1] == b.shape[-2]
|
||||
|
||||
Args:
|
||||
a (:class:`Tensor`): tensor a
|
||||
b (:class:`Tensor`): tensor b
|
||||
|
||||
Returns:
|
||||
:class:`Tensor` : new tensor
|
||||
"""
|
||||
|
||||
# Make these always be a 3 dimensional multiply
|
||||
both_2d = 0
|
||||
if len(a.shape) == 2:
|
||||
a = a.contiguous().view(1, a.shape[0], a.shape[1])
|
||||
both_2d += 1
|
||||
if len(b.shape) == 2:
|
||||
b = b.contiguous().view(1, b.shape[0], b.shape[1])
|
||||
both_2d += 1
|
||||
both_2d = both_2d == 2
|
||||
|
||||
ls = list(shape_broadcast(a.shape[:-2], b.shape[:-2]))
|
||||
ls.append(a.shape[-2])
|
||||
ls.append(b.shape[-1])
|
||||
assert a.shape[-1] == b.shape[-2]
|
||||
out = a.zeros(tuple(ls))
|
||||
|
||||
# One block per batch, extra rows, extra col
|
||||
blockspergrid = (
|
||||
(out.shape[1] + (THREADS_PER_BLOCK - 1)) // THREADS_PER_BLOCK,
|
||||
(out.shape[2] + (THREADS_PER_BLOCK - 1)) // THREADS_PER_BLOCK,
|
||||
out.shape[0],
|
||||
)
|
||||
threadsperblock = (THREADS_PER_BLOCK, THREADS_PER_BLOCK, 1)
|
||||
|
||||
tensor_matrix_multiply[blockspergrid, threadsperblock](
|
||||
*out.tuple(), out.size, *a.tuple(), *b.tuple()
|
||||
)
|
||||
|
||||
# Undo 3d if we added it.
|
||||
if both_2d:
|
||||
out = out.view(out.shape[1], out.shape[2])
|
||||
return out
|
||||
|
||||
|
||||
class CudaOps:
|
||||
map = map
|
||||
zip = zip
|
||||
reduce = reduce
|
||||
matrix_multiply = matrix_multiply
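
# Illustrative launch-geometry sketch for matrix_multiply above: with
# THREADS_PER_BLOCK = 32 and an output of shape (batch, m, n) = (4, 100, 80),
# blockspergrid = (ceil(100 / 32), ceil(80 / 32), 4) = (4, 3, 4) and
# threadsperblock = (32, 32, 1), so each thread block computes one 32 x 32
# tile of one batch member's output matrix.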
|
||||
@@ -0,0 +1,94 @@
|
||||
from dataclasses import dataclass
import random
import math


def make_pts(N):
    X = []
    for i in range(N):
        x_1 = random.random()
        x_2 = random.random()
        X.append((x_1, x_2))
    return X


@dataclass
class Graph:
    N: int
    X: list
    y: list


def simple(N):
    X = make_pts(N)
    y = []
    for x_1, x_2 in X:
        y1 = 1 if x_1 < 0.5 else 0
        y.append(y1)
    return Graph(N, X, y)


def diag(N):
    X = make_pts(N)
    y = []
    for x_1, x_2 in X:
        y1 = 1 if x_1 + x_2 < 0.5 else 0
        y.append(y1)
    return Graph(N, X, y)


def split(N):
    X = make_pts(N)
    y = []
    for x_1, x_2 in X:
        y1 = 1 if x_1 < 0.2 or x_1 > 0.8 else 0
        y.append(y1)
    return Graph(N, X, y)


def xor(N):
    X = make_pts(N)
    y = []
    for x_1, x_2 in X:
        y1 = 1 if ((x_1 < 0.5 and x_2 > 0.5) or (x_1 > 0.5 and x_2 < 0.5)) else 0
        y.append(y1)
    return Graph(N, X, y)


def circle(N):
    X = make_pts(N)
    y = []
    for x_1, x_2 in X:
        x1, x2 = (x_1 - 0.5, x_2 - 0.5)
        y1 = 1 if x1 * x1 + x2 * x2 > 0.1 else 0
        y.append(y1)
    return Graph(N, X, y)


def spiral(N):
    def x(t):
        return t * math.cos(t) / 20.0

    def y(t):
        return t * math.sin(t) / 20.0

    X = [
        (x(10.0 * (float(i) / (N // 2))) + 0.5, y(10.0 * (float(i) / (N // 2))) + 0.5)
        for i in range(5 + 0, 5 + N // 2)
    ]
    X = X + [
        (y(-10.0 * (float(i) / (N // 2))) + 0.5, x(-10.0 * (float(i) / (N // 2))) + 0.5)
        for i in range(5 + 0, 5 + N // 2)
    ]
    y = [0] * (N // 2) + [1] * (N // 2)
    return Graph(N, X, y)


datasets = {
    "Simple": simple,
    "Diag": diag,
    "Split": split,
    "Xor": xor,
    "Circle": circle,
    "Spiral": spiral,
}
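
# Illustrative usage sketch:
#
#   graph = datasets["Xor"](50)
#   # graph.X: 50 points (x_1, x_2) drawn uniformly from [0, 1)^2
#   # graph.y: the matching 0/1 labels for training the example classifiers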
|
||||
@@ -0,0 +1,333 @@
|
||||
import numpy as np
|
||||
from .tensor_data import (
|
||||
to_index,
|
||||
index_to_position,
|
||||
broadcast_index,
|
||||
MAX_DIMS,
|
||||
)
|
||||
from .tensor_functions import Function
|
||||
from numba import njit, prange
|
||||
|
||||
|
||||
# This code will JIT compile fast versions of your tensor_data functions.
|
||||
# If you get an error, read the docs for NUMBA as to what is allowed
|
||||
# in these functions.
|
||||
to_index = njit(inline="always")(to_index)
|
||||
index_to_position = njit(inline="always")(index_to_position)
|
||||
broadcast_index = njit(inline="always")(broadcast_index)
|
||||
|
||||
|
||||
@njit(parallel=True)
|
||||
def tensor_conv1d(
|
||||
out,
|
||||
out_shape,
|
||||
out_strides,
|
||||
out_size,
|
||||
input,
|
||||
input_shape,
|
||||
input_strides,
|
||||
weight,
|
||||
weight_shape,
|
||||
weight_strides,
|
||||
reverse,
|
||||
):
|
||||
"""
|
||||
1D Convolution implementation.
|
||||
|
||||
Given input tensor of
|
||||
|
||||
`batch, in_channels, width`
|
||||
|
||||
and weight tensor
|
||||
|
||||
`out_channels, in_channels, k_width`
|
||||
|
||||
Computes padded output of
|
||||
|
||||
`batch, out_channels, width`
|
||||
|
||||
`reverse` decides if weight is anchored left (False) or right.
|
||||
(See diagrams)
|
||||
|
||||
Args:
|
||||
out (array): storage for `out` tensor.
|
||||
out_shape (array): shape for `out` tensor.
|
||||
out_strides (array): strides for `out` tensor.
|
||||
out_size (int): size of the `out` tensor.
|
||||
input (array): storage for `input` tensor.
|
||||
input_shape (array): shape for `input` tensor.
|
||||
input_strides (array): strides for `input` tensor.
|
||||
        weight (array): storage for `weight` tensor.
        weight_shape (array): shape for `weight` tensor.
        weight_strides (array): strides for `weight` tensor.
|
||||
reverse (bool): anchor weight at left or right
|
||||
"""
|
||||
batch_, out_channels, out_width = out_shape
|
||||
batch, in_channels, width = input_shape
|
||||
out_channels_, in_channels_, kw = weight_shape
|
||||
|
||||
assert (
|
||||
batch == batch_
|
||||
and in_channels == in_channels_
|
||||
and out_channels == out_channels_
|
||||
)
|
||||
s1 = input_strides
|
||||
s2 = weight_strides
|
||||
for out_i in prange(out_size):
|
||||
out_index = np.zeros(3, dtype=np.int32)
|
||||
to_index(out_i, out_shape, out_index)
|
||||
current_batch, current_out_channels, current_width = out_index
|
||||
val = 0
|
||||
for current_in_channels in prange(in_channels):
|
||||
for current_kw in range(kw):
|
||||
i = current_kw
|
||||
if reverse:
|
||||
i = kw - current_kw - 1
|
||||
w = weight[
|
||||
s2[0] * current_out_channels
|
||||
+ s2[1] * current_in_channels
|
||||
+ s2[2] * i
|
||||
]
|
||||
inc = 0
|
||||
if reverse:
|
||||
if current_width - i >= 0:
|
||||
inc = input[
|
||||
current_batch * s1[0]
|
||||
+ current_in_channels * s1[1]
|
||||
+ (current_width - i) * s1[2]
|
||||
]
|
||||
else:
|
||||
if i + current_width < width:
|
||||
inc = input[
|
||||
current_batch * s1[0]
|
||||
+ current_in_channels * s1[1]
|
||||
+ (i + current_width) * s1[2]
|
||||
]
|
||||
val += w * inc
|
||||
out[
|
||||
current_batch * out_strides[0]
|
||||
+ current_out_channels * out_strides[1]
|
||||
+ current_width * out_strides[2]
|
||||
] = val
|
||||
|
||||
|
||||
class Conv1dFun(Function):
|
||||
@staticmethod
|
||||
def forward(ctx, input, weight):
|
||||
"""
|
||||
Compute a 1D Convolution
|
||||
|
||||
Args:
|
||||
ctx : Context
|
||||
            input (:class:`Tensor`) : batch x in_channel x w
            weight (:class:`Tensor`) : out_channel x in_channel x kw

        Returns:
            (:class:`Tensor`) : batch x out_channel x w
|
||||
"""
|
||||
ctx.save_for_backward(input, weight)
|
||||
batch, in_channels, w = input.shape
|
||||
out_channels, in_channels2, kw = weight.shape
|
||||
assert in_channels == in_channels2
|
||||
|
||||
# Run convolution
|
||||
output = input.zeros((batch, out_channels, w))
|
||||
tensor_conv1d(
|
||||
*output.tuple(), output.size, *input.tuple(), *weight.tuple(), False
|
||||
)
|
||||
return output
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, grad_output):
|
||||
input, weight = ctx.saved_values
|
||||
batch, in_channels, w = input.shape
|
||||
out_channels, in_channels, kw = weight.shape
|
||||
grad_weight = grad_output.zeros((in_channels, out_channels, kw))
|
||||
new_input = input.permute(1, 0, 2)
|
||||
new_grad_output = grad_output.permute(1, 0, 2)
|
||||
tensor_conv1d(
|
||||
*grad_weight.tuple(),
|
||||
grad_weight.size,
|
||||
*new_input.tuple(),
|
||||
*new_grad_output.tuple(),
|
||||
False,
|
||||
)
|
||||
grad_weight = grad_weight.permute(1, 0, 2)
|
||||
|
||||
grad_input = input.zeros((batch, in_channels, w))
|
||||
new_weight = weight.permute(1, 0, 2)
|
||||
tensor_conv1d(
|
||||
*grad_input.tuple(),
|
||||
grad_input.size,
|
||||
*grad_output.tuple(),
|
||||
*new_weight.tuple(),
|
||||
True,
|
||||
)
|
||||
return grad_input, grad_weight
|
||||
|
||||
|
||||
conv1d = Conv1dFun.apply
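
# Illustrative usage sketch (the rand constructor is assumed from the
# surrounding package; shapes are hypothetical):
#
#   x = minitorch.rand((batch, in_channels, width))
#   w = minitorch.rand((out_channels, in_channels, k_width))
#   out = conv1d(x, w)    # shape: batch x out_channels x width
#
# conv2d below follows the same pattern with h x w inputs and kh x kw kernels.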
|
||||
|
||||
|
||||
@njit(parallel=True, fastmath=True)
|
||||
def tensor_conv2d(
|
||||
out,
|
||||
out_shape,
|
||||
out_strides,
|
||||
out_size,
|
||||
input,
|
||||
input_shape,
|
||||
input_strides,
|
||||
weight,
|
||||
weight_shape,
|
||||
weight_strides,
|
||||
reverse,
|
||||
):
|
||||
"""
|
||||
2D Convolution implementation.
|
||||
|
||||
Given input tensor of
|
||||
|
||||
`batch, in_channels, height, width`
|
||||
|
||||
and weight tensor
|
||||
|
||||
`out_channels, in_channels, k_height, k_width`
|
||||
|
||||
Computes padded output of
|
||||
|
||||
`batch, out_channels, height, width`
|
||||
|
||||
`Reverse` decides if weight is anchored top-left (False) or bottom-right.
|
||||
(See diagrams)
|
||||
|
||||
|
||||
Args:
|
||||
out (array): storage for `out` tensor.
|
||||
out_shape (array): shape for `out` tensor.
|
||||
out_strides (array): strides for `out` tensor.
|
||||
out_size (int): size of the `out` tensor.
|
||||
input (array): storage for `input` tensor.
|
||||
input_shape (array): shape for `input` tensor.
|
||||
input_strides (array): strides for `input` tensor.
|
||||
        weight (array): storage for `weight` tensor.
        weight_shape (array): shape for `weight` tensor.
        weight_strides (array): strides for `weight` tensor.
|
||||
reverse (bool): anchor weight at top-left or bottom-right
|
||||
"""
|
||||
batch_, out_channels, _, _ = out_shape
|
||||
batch, in_channels, height, width = input_shape
|
||||
out_channels_, in_channels_, kh, kw = weight_shape
|
||||
|
||||
assert (
|
||||
batch == batch_
|
||||
and in_channels == in_channels_
|
||||
and out_channels == out_channels_
|
||||
)
|
||||
|
||||
s1 = input_strides
|
||||
s2 = weight_strides
|
||||
# inners
|
||||
s10, s11, s12, s13 = s1[0], s1[1], s1[2], s1[3]
|
||||
s20, s21, s22, s23 = s2[0], s2[1], s2[2], s2[3]
|
||||
|
||||
for out_i in prange(out_size):
|
||||
out_index = np.zeros(4, dtype=np.int32)
|
||||
to_index(out_i, out_shape, out_index)
|
||||
current_batch, current_out_channels, current_height, current_width = out_index
|
||||
val = 0
|
||||
for current_in_channels in prange(in_channels):
|
||||
for current_kh in range(kh):
|
||||
for current_kw in range(kw):
|
||||
i = current_kh
|
||||
j = current_kw
|
||||
if reverse:
|
||||
j = kw - current_kw - 1
|
||||
i = kh - current_kh - 1
|
||||
w = weight[
|
||||
s20 * current_out_channels
|
||||
+ s21 * current_in_channels
|
||||
+ s22 * i
|
||||
+ s23 * j
|
||||
]
|
||||
inc = 0
|
||||
if reverse:
|
||||
if current_height - i >= 0 and current_width - j >= 0:
|
||||
inc = input[
|
||||
current_batch * s10
|
||||
+ current_in_channels * s11
|
||||
+ (current_height - i) * s12
|
||||
+ (current_width - j) * s13
|
||||
]
|
||||
else:
|
||||
if i + current_height < height and j + current_width < width:
|
||||
inc = input[
|
||||
current_batch * s10
|
||||
+ current_in_channels * s11
|
||||
+ (i + current_height) * s12
|
||||
+ (j + current_width) * s13
|
||||
]
|
||||
val += w * inc
|
||||
out[
|
||||
current_batch * out_strides[0]
|
||||
+ current_out_channels * out_strides[1]
|
||||
+ current_height * out_strides[2]
|
||||
+ current_width * out_strides[3]
|
||||
] = val
|
||||
|
||||
|
||||
class Conv2dFun(Function):
|
||||
@staticmethod
|
||||
def forward(ctx, input, weight):
|
||||
"""
|
||||
Compute a 2D Convolution
|
||||
|
||||
Args:
|
||||
ctx : Context
|
||||
input (:class:`Tensor`) : batch x in_channel x h x w
|
||||
weight (:class:`Tensor`) : out_channel x in_channel x kh x kw
|
||||
|
||||
Returns:
|
||||
(:class:`Tensor`) : batch x out_channel x h x w
|
||||
"""
|
||||
ctx.save_for_backward(input, weight)
|
||||
batch, in_channels, h, w = input.shape
|
||||
out_channels, in_channels2, kh, kw = weight.shape
|
||||
assert in_channels == in_channels2
|
||||
output = input.zeros((batch, out_channels, h, w))
|
||||
tensor_conv2d(
|
||||
*output.tuple(), output.size, *input.tuple(), *weight.tuple(), False
|
||||
)
|
||||
return output
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, grad_output):
|
||||
input, weight = ctx.saved_values
|
||||
batch, in_channels, h, w = input.shape
|
||||
out_channels, in_channels, kh, kw = weight.shape
|
||||
|
||||
grad_weight = grad_output.zeros((in_channels, out_channels, kh, kw))
|
||||
new_input = input.permute(1, 0, 2, 3)
|
||||
new_grad_output = grad_output.permute(1, 0, 2, 3)
|
||||
tensor_conv2d(
|
||||
*grad_weight.tuple(),
|
||||
grad_weight.size,
|
||||
*new_input.tuple(),
|
||||
*new_grad_output.tuple(),
|
||||
False,
|
||||
)
|
||||
grad_weight = grad_weight.permute(1, 0, 2, 3)
|
||||
|
||||
grad_input = input.zeros((batch, in_channels, h, w))
|
||||
new_weight = weight.permute(1, 0, 2, 3)
|
||||
tensor_conv2d(
|
||||
*grad_input.tuple(),
|
||||
grad_input.size,
|
||||
*grad_output.tuple(),
|
||||
*new_weight.tuple(),
|
||||
True,
|
||||
)
|
||||
return grad_input, grad_weight
|
||||
|
||||
|
||||
conv2d = Conv2dFun.apply
|
||||
@@ -0,0 +1,364 @@
|
||||
import numpy as np
|
||||
from numba import njit, prange
|
||||
|
||||
from .tensor_data import (MAX_DIMS, broadcast_index, index_to_position,
|
||||
shape_broadcast, to_index)
|
||||
|
||||
# TIP: Use `NUMBA_DISABLE_JIT=1 pytest tests/ -m task3_1` to run these tests without JIT.
|
||||
|
||||
# This code will JIT compile fast versions of your tensor_data functions.
|
||||
# If you get an error, read the docs for NUMBA as to what is allowed
|
||||
# in these functions.
|
||||
to_index = njit(inline="always")(to_index)
|
||||
index_to_position = njit(inline="always")(index_to_position)
|
||||
broadcast_index = njit(inline="always")(broadcast_index)
|
||||
|
||||
|
||||
def tensor_map(fn):
|
||||
"""
|
||||
NUMBA low_level tensor_map function. See `tensor_ops.py` for description.
|
||||
|
||||
Optimizations:
|
||||
|
||||
* Main loop in parallel
|
||||
* All indices use numpy buffers
|
||||
* When `out` and `in` are stride-aligned, avoid indexing
|
||||
|
||||
Args:
|
||||
fn: function mappings floats-to-floats to apply.
|
||||
out (array): storage for out tensor.
|
||||
out_shape (array): shape for out tensor.
|
||||
out_strides (array): strides for out tensor.
|
||||
in_storage (array): storage for in tensor.
|
||||
in_shape (array): shape for in tensor.
|
||||
in_strides (array): strides for in tensor.
|
||||
|
||||
Returns:
|
||||
None : Fills in `out`
|
||||
"""
|
||||
|
||||
def _map(out, out_shape, out_strides, in_storage, in_shape, in_strides):
|
||||
        for i in prange(len(out)):
            # Allocate index buffers per iteration so parallel iterations do
            # not race on shared state.
            in_index = np.zeros(len(in_shape), dtype=int)
            out_index = np.zeros(len(out_shape), dtype=int)
            to_index(i, out_shape, out_index)
|
||||
broadcast_index(out_index, out_shape, in_shape, in_index)
|
||||
|
||||
x = in_storage[index_to_position(in_index, in_strides)]
|
||||
index = index_to_position(out_index, out_strides)
|
||||
out[index] = fn(x)
|
||||
|
||||
return njit(parallel=True)(_map)
|
||||
|
||||
|
||||
def map(fn):
|
||||
"""
|
||||
Higher-order tensor map function ::
|
||||
|
||||
fn_map = map(fn)
|
||||
fn_map(a, out)
|
||||
out
|
||||
|
||||
Args:
|
||||
fn: function from float-to-float to apply.
|
||||
a (:class:`Tensor`): tensor to map over
|
||||
out (:class:`Tensor`): optional, tensor data to fill in,
|
||||
should broadcast with `a`
|
||||
|
||||
Returns:
|
||||
:class:`Tensor` : new tensor
|
||||
"""
|
||||
|
||||
# This line JIT compiles your tensor_map
|
||||
f = tensor_map(njit()(fn))
|
||||
|
||||
def ret(a, out=None):
|
||||
if out is None:
|
||||
out = a.zeros(a.shape)
|
||||
f(*out.tuple(), *a.tuple())
|
||||
return out
|
||||
|
||||
return ret
|
||||
|
||||
|
||||
def tensor_zip(fn):
|
||||
"""
|
||||
NUMBA higher-order tensor zip function. See `tensor_ops.py` for description.
|
||||
|
||||
|
||||
Optimizations:
|
||||
|
||||
* Main loop in parallel
|
||||
* All indices use numpy buffers
|
||||
* When `out`, `a`, `b` are stride-aligned, avoid indexing
|
||||
|
||||
Args:
|
||||
fn: function maps two floats to float to apply.
|
||||
out (array): storage for `out` tensor.
|
||||
out_shape (array): shape for `out` tensor.
|
||||
out_strides (array): strides for `out` tensor.
|
||||
a_storage (array): storage for `a` tensor.
|
||||
a_shape (array): shape for `a` tensor.
|
||||
a_strides (array): strides for `a` tensor.
|
||||
b_storage (array): storage for `b` tensor.
|
||||
b_shape (array): shape for `b` tensor.
|
||||
b_strides (array): strides for `b` tensor.
|
||||
|
||||
Returns:
|
||||
None : Fills in `out`
|
||||
"""
|
||||
|
||||
def _zip(
|
||||
out,
|
||||
out_shape,
|
||||
out_strides,
|
||||
a_storage,
|
||||
a_shape,
|
||||
a_strides,
|
||||
b_storage,
|
||||
b_shape,
|
||||
b_strides,
|
||||
):
|
||||
        for i in prange(len(out)):
            # Per-iteration index buffers to avoid races across parallel iterations.
            out_index = np.zeros(len(out_shape), dtype=int)
            a_in = np.zeros(len(a_shape), dtype=int)
            b_in = np.zeros(len(b_shape), dtype=int)
            to_index(i, out_shape, out_index)
|
||||
index = index_to_position(out_index, out_strides)
|
||||
|
||||
broadcast_index(out_index, out_shape, a_shape, a_in)
|
||||
a = a_storage[index_to_position(a_in, a_strides)]
|
||||
|
||||
broadcast_index(out_index, out_shape, b_shape, b_in)
|
||||
b = b_storage[index_to_position(b_in, b_strides)]
|
||||
|
||||
out[index] = fn(a, b)
|
||||
|
||||
return njit(parallel=True)(_zip)
|
||||
|
||||
|
||||
def zip(fn):
|
||||
"""
|
||||
Higher-order tensor zip function.
|
||||
|
||||
fn_zip = zip(fn)
|
||||
c = fn_zip(a, b)
|
||||
|
||||
Args:
|
||||
fn: function from two floats-to-float to apply
|
||||
a (:class:`Tensor`): tensor to zip over
|
||||
b (:class:`Tensor`): tensor to zip over
|
||||
|
||||
Returns:
|
||||
:class:`Tensor` : new tensor data
|
||||
"""
|
||||
f = tensor_zip(njit()(fn))
|
||||
|
||||
def ret(a, b):
|
||||
c_shape = shape_broadcast(a.shape, b.shape)
|
||||
out = a.zeros(c_shape)
|
||||
f(*out.tuple(), *a.tuple(), *b.tuple())
|
||||
return out
|
||||
|
||||
return ret
|
||||
|
||||
|
||||
def tensor_reduce(fn):
|
||||
"""
|
||||
NUMBA higher-order tensor reduce function. See `tensor_ops.py` for description.
|
||||
|
||||
Optimizations:
|
||||
|
||||
* Main loop in parallel
|
||||
* All indices use numpy buffers
|
||||
* Inner-loop should not call any functions or write non-local variables
|
||||
|
||||
Args:
|
||||
fn: reduction function mapping two floats to float.
|
||||
out (array): storage for `out` tensor.
|
||||
out_shape (array): shape for `out` tensor.
|
||||
out_strides (array): strides for `out` tensor.
|
||||
a_storage (array): storage for `a` tensor.
|
||||
a_shape (array): shape for `a` tensor.
|
||||
a_strides (array): strides for `a` tensor.
|
||||
reduce_dim (int): dimension to reduce out
|
||||
|
||||
Returns:
|
||||
None : Fills in `out`
|
||||
|
||||
"""
|
||||
|
||||
def _reduce(out, out_shape, out_strides, a_storage, a_shape, a_strides, reduce_dim):
|
||||
        for i in prange(len(out)):
            # Per-iteration index buffer to avoid races across parallel iterations.
            out_index = np.zeros(len(a_shape), dtype=int)
            to_index(i, out_shape, out_index)
|
||||
index = index_to_position(out_index, out_strides)
|
||||
|
||||
for j in range(a_shape[reduce_dim]):
|
||||
a_index = out_index.copy()
|
||||
a_index[reduce_dim] = j
|
||||
|
||||
out[index] = fn(
|
||||
a_storage[index_to_position(a_index, a_strides)], out[index]
|
||||
)
|
||||
|
||||
return njit(parallel=True)(_reduce)
|
||||
|
||||
|
||||
def reduce(fn, start=0.0):
|
||||
"""
|
||||
Higher-order tensor reduce function. ::
|
||||
|
||||
fn_reduce = reduce(fn)
|
||||
out = fn_reduce(a, dim)
|
||||
|
||||
|
||||
Args:
|
||||
fn: function from two floats-to-float to apply
|
||||
a (:class:`Tensor`): tensor to reduce over
|
||||
dim (int): int of dim to reduce
|
||||
|
||||
Returns:
|
||||
:class:`Tensor` : new tensor
|
||||
"""
|
||||
|
||||
f = tensor_reduce(njit()(fn))
|
||||
|
||||
def ret(a, dim):
|
||||
out_shape = list(a.shape)
|
||||
out_shape[dim] = 1
|
||||
|
||||
# Other values when not sum.
|
||||
out = a.zeros(tuple(out_shape))
|
||||
out._tensor._storage[:] = start
|
||||
|
||||
f(*out.tuple(), *a.tuple(), dim)
|
||||
return out
|
||||
|
||||
return ret
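
# Illustrative usage sketch (operators.add is assumed from the surrounding
# package):
#
#   add_reduce = FastOps.reduce(operators.add, 0.0)
#   out = add_reduce(a, 1)    # sums over dim 1; out.shape[1] == 1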
|
||||
|
||||
|
||||
@njit(parallel=True, fastmath=True)
|
||||
def tensor_matrix_multiply(
|
||||
out,
|
||||
out_shape,
|
||||
out_strides,
|
||||
a_storage,
|
||||
a_shape,
|
||||
a_strides,
|
||||
b_storage,
|
||||
b_shape,
|
||||
b_strides,
|
||||
):
|
||||
"""
|
||||
NUMBA tensor matrix multiply function.
|
||||
|
||||
Should work for any tensor shapes that broadcast as long as ::
|
||||
|
||||
assert a_shape[-1] == b_shape[-2]
|
||||
|
||||
Optimizations:
|
||||
|
||||
* Outer loop in parallel
|
||||
* No index buffers or function calls
|
||||
* Inner loop should have no global writes, 1 multiply.
|
||||
|
||||
|
||||
Args:
|
||||
out (array): storage for `out` tensor
|
||||
out_shape (array): shape for `out` tensor
|
||||
out_strides (array): strides for `out` tensor
|
||||
a_storage (array): storage for `a` tensor
|
||||
a_shape (array): shape for `a` tensor
|
||||
a_strides (array): strides for `a` tensor
|
||||
b_storage (array): storage for `b` tensor
|
||||
b_shape (array): shape for `b` tensor
|
||||
b_strides (array): strides for `b` tensor
|
||||
|
||||
Returns:
|
||||
None : Fills in `out`
|
||||
"""
|
||||
a_batch_stride = a_strides[0] if a_shape[0] > 1 else 0
|
||||
b_batch_stride = b_strides[0] if b_shape[0] > 1 else 0
|
||||
|
||||
assert a_shape[-1] == b_shape[-2]
|
||||
|
||||
for i in prange(len(out)):
|
||||
out_0 = i // (out_shape[-1] * out_shape[-2])
|
||||
out_1 = (i % (out_shape[-1] * out_shape[-2])) // out_shape[-1]
|
||||
out_2 = i % out_shape[-1]
|
||||
|
||||
out_i = (
|
||||
out_0 * out_strides[0] +
|
||||
out_1 * out_strides[1] +
|
||||
out_2 * out_strides[2]
|
||||
)
|
||||
|
||||
a_start = out_0 * a_batch_stride + out_1 * a_strides[1]
|
||||
        b_start = out_0 * b_batch_stride + out_2 * b_strides[2]
|
||||
|
||||
temp = 0
|
||||
for position in range(a_shape[-1]):
|
||||
temp += (
|
||||
a_storage[a_start + position * a_strides[2]] *
|
||||
b_storage[b_start + position * b_strides[1]]
|
||||
)
|
||||
out[out_i] = temp
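
# Worked example of the flat-index decomposition above (illustrative): for
# out_shape = (2, 3, 4) and i = 17,
#   out_0 = 17 // (4 * 3)  = 1   (batch)
#   out_1 = (17 % 12) // 4 = 1   (row)
#   out_2 = 17 % 4         = 1   (column)
# so flat position 17 corresponds to out[1, 1, 1].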
|
||||
|
||||
|
||||
|
||||
def matrix_multiply(a, b):
|
||||
"""
|
||||
Batched tensor matrix multiply ::
|
||||
|
||||
for n:
|
||||
for i:
|
||||
for j:
|
||||
for k:
|
||||
out[n, i, j] += a[n, i, k] * b[n, k, j]
|
||||
|
||||
Where n indicates an optional broadcasted batched dimension.
|
||||
|
||||
Should work for tensor shapes of 3 dims ::
|
||||
|
||||
assert a.shape[-1] == b.shape[-2]
|
||||
|
||||
Args:
|
||||
a (:class:`Tensor`): tensor data a
|
||||
b (:class:`Tensor`): tensor data b
|
||||
|
||||
Returns:
|
||||
:class:`Tensor` : new tensor data
|
||||
"""
|
||||
|
||||
# Make these always be a 3 dimensional multiply
|
||||
both_2d = 0
|
||||
if len(a.shape) == 2:
|
||||
a = a.contiguous().view(1, a.shape[0], a.shape[1])
|
||||
both_2d += 1
|
||||
if len(b.shape) == 2:
|
||||
b = b.contiguous().view(1, b.shape[0], b.shape[1])
|
||||
both_2d += 1
|
||||
both_2d = both_2d == 2
|
||||
|
||||
ls = list(shape_broadcast(a.shape[:-2], b.shape[:-2]))
|
||||
ls.append(a.shape[-2])
|
||||
ls.append(b.shape[-1])
|
||||
assert a.shape[-1] == b.shape[-2]
|
||||
out = a.zeros(tuple(ls))
|
||||
|
||||
tensor_matrix_multiply(*out.tuple(), *a.tuple(), *b.tuple())
|
||||
|
||||
# Undo 3d if we added it.
|
||||
if both_2d:
|
||||
out = out.view(out.shape[1], out.shape[2])
|
||||
return out
|
||||
|
||||
|
||||
class FastOps:
|
||||
map = map
|
||||
zip = zip
|
||||
reduce = reduce
|
||||
matrix_multiply = matrix_multiply
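
# Illustrative sketch (assumes the tensor-backend plumbing from the rest of
# minitorch; the helper name may differ in this repo):
#
#   FastBackend = minitorch.make_tensor_backend(FastOps)
#   x = minitorch.rand((8, 4), backend=FastBackend)
#   y = minitorch.rand((4, 5), backend=FastBackend)
#   z = x @ y    # dispatches to tensor_matrix_multiply above, shape (8, 5)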
|
||||
@@ -0,0 +1,162 @@
|
||||
class Module:
|
||||
"""
|
||||
Modules form a tree that store parameters and other
|
||||
submodules. They make up the basis of neural network stacks.
|
||||
|
||||
Attributes:
|
||||
_modules (dict of name x :class:`Module`): Storage of the child modules
|
||||
_parameters (dict of name x :class:`Parameter`): Storage of the module's parameters
|
||||
training (bool): Whether the module is in training mode or evaluation mode
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
self._modules = {}
|
||||
self._parameters = {}
|
||||
self.training = True
|
||||
|
||||
def modules(self):
|
||||
"Return the direct child modules of this module."
|
||||
return self.__dict__["_modules"].values()
|
||||
|
||||
def train(self):
|
||||
"Set the mode of this module and all descendent modules to `train`."
|
||||
|
||||
def _train(module):
|
||||
module.training = True
|
||||
for m in module.modules():
|
||||
_train(m)
|
||||
|
||||
_train(self)
|
||||
|
||||
def eval(self):
|
||||
"Set the mode of this module and all descendent modules to `eval`."
|
||||
|
||||
def _eval(module):
|
||||
module.training = False
|
||||
for m in module.modules():
|
||||
_eval(m)
|
||||
|
||||
_eval(self)
|
||||
|
||||
def named_parameters(self):
|
||||
"""
|
||||
Collect all the parameters of this module and its descendents.
|
||||
|
||||
|
||||
Returns:
|
||||
            list of pairs: Contains the name and :class:`Parameter` of each descendent parameter.
|
||||
"""
|
||||
|
||||
def _named_parameters(module, prefix=""):
|
||||
for name, param in module._parameters.items():
|
||||
yield prefix + name, param
|
||||
for name, module in module._modules.items():
|
||||
yield from _named_parameters(module, prefix + name + ".")
|
||||
|
||||
return list(_named_parameters(self))
|
||||
|
||||
def parameters(self):
|
||||
"Enumerate over all the parameters of this module and its descendents."
|
||||
|
||||
# def _parameters(module):
|
||||
# for param in module._parameters.values():
|
||||
# yield param
|
||||
# for module in module._modules.values():
|
||||
# yield from _parameters(module)
|
||||
|
||||
return [param for _, param in self.named_parameters()]
|
||||
|
||||
def add_parameter(self, k, v):
|
||||
"""
|
||||
Manually add a parameter. Useful helper for scalar parameters.
|
||||
|
||||
Args:
|
||||
k (str): Local name of the parameter.
|
||||
v (value): Value for the parameter.
|
||||
|
||||
Returns:
|
||||
Parameter: Newly created parameter.
|
||||
"""
|
||||
val = Parameter(v, k)
|
||||
self.__dict__["_parameters"][k] = val
|
||||
return val
|
||||
|
||||
def __setattr__(self, key, val):
|
||||
if isinstance(val, Parameter):
|
||||
self.__dict__["_parameters"][key] = val
|
||||
elif isinstance(val, Module):
|
||||
self.__dict__["_modules"][key] = val
|
||||
else:
|
||||
super().__setattr__(key, val)
|
||||
|
||||
def __getattr__(self, key):
|
||||
if key in self.__dict__["_parameters"]:
|
||||
return self.__dict__["_parameters"][key]
|
||||
|
||||
if key in self.__dict__["_modules"]:
|
||||
return self.__dict__["_modules"][key]
|
||||
|
||||
def __call__(self, *args, **kwargs):
|
||||
return self.forward(*args, **kwargs)
|
||||
|
||||
def forward(self):
|
||||
assert False, "Not Implemented"
|
||||
|
||||
def __repr__(self):
|
||||
def _addindent(s_, numSpaces):
|
||||
s = s_.split("\n")
|
||||
if len(s) == 1:
|
||||
return s_
|
||||
first = s.pop(0)
|
||||
s = [(numSpaces * " ") + line for line in s]
|
||||
s = "\n".join(s)
|
||||
s = first + "\n" + s
|
||||
return s
|
||||
|
||||
child_lines = []
|
||||
|
||||
for key, module in self._modules.items():
|
||||
mod_str = repr(module)
|
||||
mod_str = _addindent(mod_str, 2)
|
||||
child_lines.append("(" + key + "): " + mod_str)
|
||||
lines = child_lines
|
||||
|
||||
main_str = self.__class__.__name__ + "("
|
||||
if lines:
|
||||
# simple one-liner info, which most builtin Modules will use
|
||||
main_str += "\n " + "\n ".join(lines) + "\n"
|
||||
|
||||
main_str += ")"
|
||||
return main_str
|
||||
|
||||
|
||||
class Parameter:
|
||||
"""
|
||||
A Parameter is a special container stored in a :class:`Module`.
|
||||
|
||||
It is designed to hold a :class:`Variable`, but we allow it to hold
|
||||
any value for testing.
|
||||
"""
|
||||
|
||||
def __init__(self, x=None, name=None):
|
||||
self.value = x
|
||||
self.name = name
|
||||
if hasattr(x, "requires_grad_"):
|
||||
self.value.requires_grad_(True)
|
||||
if self.name:
|
||||
self.value.name = self.name
|
||||
|
||||
def update(self, x):
|
||||
"Update the parameter value."
|
||||
self.value = x
|
||||
if hasattr(x, "requires_grad_"):
|
||||
self.value.requires_grad_(True)
|
||||
if self.name:
|
||||
self.value.name = self.name
|
||||
|
||||
def __repr__(self):
|
||||
return repr(self.value)
|
||||
|
||||
def __str__(self):
|
||||
return str(self.value)
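
# Illustrative usage sketch (Scalar is assumed from minitorch.scalar, and
# OtherModule is a hypothetical submodule):
#
#   class MyModule(Module):
#       def __init__(self):
#           super().__init__()
#           self.weight = Parameter(Scalar(1.0))
#           self.child = OtherModule()
#
#   m = MyModule()
#   m.named_parameters()    # [("weight", ...), ("child.<param name>", ...)]
#   m.eval()                # sets training = False on m and all descendents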
|
||||
@@ -0,0 +1,60 @@
|
||||
# from .tensor import rand
|
||||
# from .functions import matmul, conv2d
|
||||
# from .module import Module, Parameter
|
||||
|
||||
|
||||
# class tLinear(Module):
|
||||
# def __init__(self, in_size, out_size):
|
||||
# super().__init__()
|
||||
# self.weights = Parameter(rand((in_size, out_size)))
|
||||
# self.bias = Parameter(rand((out_size,)))
|
||||
# self.out_size = out_size
|
||||
|
||||
# def forward(self, x):
|
||||
# batch, in_size = x.shape
|
||||
# return (
|
||||
# self.weights.value.view(1, in_size, self.out_size)
|
||||
# * x.view(batch, in_size, 1)
|
||||
# ).sum(1).view(batch, self.out_size) + self.bias.value.view(1, self.out_size)
|
||||
|
||||
|
||||
# class tLinear2(Module):
|
||||
# def __init__(self, in_size, out_size):
|
||||
# super().__init__()
|
||||
# self.weights = Parameter(rand((in_size, out_size)))
|
||||
# self.bias = Parameter(rand((out_size,)))
|
||||
# self.out_size = out_size
|
||||
|
||||
# def forward(self, x):
|
||||
# batch, in_size = x.shape
|
||||
# return matmul(x, self.weights.value) + self.bias.value.view(1, self.out_size)
|
||||
|
||||
|
||||
# class Dropout(Module):
|
||||
# def __init__(self, rate):
|
||||
# super().__init__()
|
||||
# self.rate = rate
|
||||
|
||||
# def forward(self, x):
|
||||
# return (rand(x.shape) / 2 + 0.5 < self.rate) * x
|
||||
|
||||
|
||||
# class Conv2d(Module):
|
||||
# def __init__(self, in_features, out_features, size):
|
||||
# super().__init__()
|
||||
# size1 = [size[0], size[1], in_features, out_features]
|
||||
# size2 = [size[0], size[1], out_features]
|
||||
# self.weights = Parameter(rand(size1))
|
||||
# self.bias = Parameter(rand(size2))
|
||||
|
||||
# def forward(self, x):
|
||||
# return conv2d(x, self.weights.value, self.bias.value)
|
||||
|
||||
|
||||
# # class MaxPool2d(Module):
|
||||
# # def __init__(self, in_features, out_features, size):
|
||||
# # super().__init__()
|
||||
|
||||
|
||||
# # def forward(self, x):
|
||||
# # return conv2d(x, self.weights.value, self.bias.value)
|
||||
@@ -0,0 +1,175 @@
|
||||
from .fast_ops import FastOps
|
||||
from .tensor_functions import rand, Function
|
||||
from . import operators
|
||||
|
||||
|
||||
def tile(input, kernel):
|
||||
"""
|
||||
Reshape an image tensor for 2D pooling
|
||||
|
||||
Args:
|
||||
input (:class:`Tensor`): batch x channel x height x width
|
||||
kernel ( pair of ints ): height x width of pooling
|
||||
|
||||
Returns:
|
||||
(:class:`Tensor`, int, int) : Tensor of size batch x channel x new_height x new_width x (kernel_height * kernel_width) as well as the new_height and new_width value.
|
||||
"""
|
||||
|
||||
batch, channel, height, width = input.shape
|
||||
kh, kw = kernel
|
||||
assert height % kh == 0
|
||||
assert width % kw == 0
|
||||
|
||||
new_height = int(height / kh)
|
||||
new_width = int(width / kw)
|
||||
|
||||
input = (
    input.contiguous()
    .view(batch, channel, new_height, kh, new_width, kw)
    .permute(0, 1, 2, 4, 3, 5)
    .contiguous()
    .view(batch, channel, new_height, new_width, kh * kw)
)
|
||||
|
||||
return (input, new_height, new_width)
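# Shape sketch (not part of the original file), assuming the `rand`
# constructor from tensor_functions.py: a 1 x 1 x 4 x 6 image tiled with a
# (2, 2) kernel becomes a 1 x 1 x 2 x 3 x 4 tensor.
#
#   t = rand((1, 1, 4, 6))
#   tiled, new_height, new_width = tile(t, (2, 2))
#   assert tiled.shape == (1, 1, 2, 3, 4)
#   assert (new_height, new_width) == (2, 3)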
|
||||
|
||||
|
||||
def avgpool2d(input, kernel):
|
||||
"""
|
||||
Tiled average pooling 2D
|
||||
|
||||
Args:
|
||||
input (:class:`Tensor`): batch x channel x height x width
|
||||
kernel ( pair of ints ): height x width of pooling
|
||||
|
||||
Returns:
|
||||
:class:`Tensor` : pooled tensor
|
||||
"""
|
||||
batch, channel, height, width = input.shape
input, new_height, new_width = tile(input, kernel)
return input.mean(4).view(batch, channel, new_height, new_width)
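# Usage sketch (not part of the original file): pooling a 1 x 1 x 4 x 4 tensor
# with a (2, 2) kernel halves each spatial dimension.
#
#   t = rand((1, 1, 4, 4))
#   assert avgpool2d(t, (2, 2)).shape == (1, 1, 2, 2)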
|
||||
|
||||
|
||||
max_reduce = FastOps.reduce(operators.max, -1e9)
|
||||
|
||||
|
||||
def argmax(input, dim):
|
||||
"""
|
||||
Compute the argmax as a 1-hot tensor.
|
||||
|
||||
Args:
|
||||
input (:class:`Tensor`): input tensor
|
||||
dim (int): dimension to apply argmax
|
||||
|
||||
|
||||
Returns:
|
||||
:class:`Tensor` : tensor with 1 on highest cell in dim, 0 otherwise
|
||||
|
||||
"""
|
||||
out = max_reduce(input, [dim])
|
||||
return out == input
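# Illustrative sketch (not part of the original file), assuming the `tensor`
# constructor from tensor_functions.py:
#
#   t = tensor([[1.0, 5.0, 2.0]])
#   hot = argmax(t, 1)
#   # hot is [[0.0, 1.0, 0.0]]; ties would mark every maximal cell with 1.0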
|
||||
|
||||
|
||||
class Max(Function):
|
||||
@staticmethod
|
||||
def forward(ctx, input, dim):
|
||||
"Forward of max should be max reduction"
|
||||
res = max_reduce(input, [dim])
|
||||
ctx.save_for_backward(input, dim)
|
||||
return res
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, grad_output):
|
||||
"Backward of max should be argmax (see above)"
|
||||
input, dim = ctx.saved_values
|
||||
res = argmax(input, dim)
|
||||
return grad_output * res
|
||||
|
||||
|
||||
max = Max.apply
|
||||
|
||||
|
||||
def softmax(input, dim):
|
||||
r"""
|
||||
Compute the softmax as a tensor.
|
||||
|
||||
.. math::
|
||||
|
||||
z_i = \frac{e^{x_i}}{\sum_i e^{x_i}}
|
||||
|
||||
Args:
|
||||
input (:class:`Tensor`): input tensor
|
||||
dim (int): dimension to apply softmax
|
||||
|
||||
Returns:
|
||||
:class:`Tensor` : softmax tensor
|
||||
"""
|
||||
|
||||
input = input.exp()
|
||||
t = input.sum(dim)
|
||||
input = input / t
|
||||
return input
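# Quick check (not part of the original file): softmax values along `dim` are
# positive and sum to 1.
#
#   t = tensor([[1.0, 2.0, 3.0]])
#   s = softmax(t, 1)
#   # s.sum(1) is approximately [[1.0]]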
|
||||
|
||||
|
||||
def logsoftmax(input, dim):
|
||||
r"""
|
||||
Compute the log of the softmax as a tensor.
|
||||
|
||||
.. math::
|
||||
|
||||
z_i = x_i - \log \sum_i e^{x_i}
|
||||
|
||||
See https://en.wikipedia.org/wiki/LogSumExp#log-sum-exp_trick_for_log-domain_calculations
|
||||
|
||||
Args:
|
||||
input (:class:`Tensor`): input tensor
|
||||
dim (int): dimension to apply log-softmax
|
||||
|
||||
Returns:
|
||||
:class:`Tensor` : log of softmax tensor
|
||||
"""
|
||||
|
||||
t = input.exp()
|
||||
t = t.sum(dim)
|
||||
t = t.log()
|
||||
return input - t
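# Relation sketch (not part of the original file): logsoftmax(t, dim) matches
# softmax(t, dim).log() up to floating point error, while staying in log space.
#
#   t = tensor([[1.0, 2.0, 3.0]])
#   # logsoftmax(t, 1) is close to softmax(t, 1).log()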
|
||||
|
||||
|
||||
def maxpool2d(input, kernel):
|
||||
"""
|
||||
Tiled max pooling 2D
|
||||
|
||||
Args:
|
||||
input (:class:`Tensor`): batch x channel x height x width
|
||||
kernel ( pair of ints ): height x width of pooling
|
||||
|
||||
Returns:
|
||||
:class:`Tensor` : pooled tensor
|
||||
"""
|
||||
batch, channel, height, width = input.shape
input, new_height, new_width = tile(input, kernel)
input = max(input, 4)
input = input.view(batch, channel, new_height, new_width)
return input
|
||||
|
||||
|
||||
def dropout(input, rate, ignore=False):
|
||||
"""
|
||||
Dropout positions based on random noise.
|
||||
|
||||
Args:
|
||||
input (:class:`Tensor`): input tensor
|
||||
rate (float): probability [0, 1) of dropping out each position
|
||||
ignore (bool): skip dropout, i.e. do nothing at all
|
||||
|
||||
Returns:
|
||||
:class:`Tensor` : tensor with random positions dropped out
|
||||
"""
|
||||
if not ignore:
|
||||
rand_tensor = rand(input.shape)
|
||||
rand_drop = rand_tensor > rate
|
||||
return input * rand_drop
|
||||
else:
|
||||
return input
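# Usage sketch (not part of the original file):
#
#   t = rand((2, 3))
#   kept = dropout(t, 0.0)               # rate 0.0 keeps every position
#   same = dropout(t, 0.9, ignore=True)  # ignore=True returns `t` untouched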
|
||||
@@ -0,0 +1,212 @@
|
||||
"""
|
||||
Collection of the core mathematical operators used throughout the code base.
|
||||
"""
|
||||
|
||||
|
||||
import math
|
||||
|
||||
# ## Task 0.1
|
||||
|
||||
# Implementation of a prelude of elementary functions.
|
||||
|
||||
|
||||
def mul(x, y):
|
||||
":math:`f(x, y) = x * y`"
|
||||
return x * y
|
||||
|
||||
|
||||
def id(x):
|
||||
":math:`f(x) = x`"
|
||||
return x
|
||||
|
||||
|
||||
def add(x, y):
|
||||
":math:`f(x, y) = x + y`"
|
||||
return x + y
|
||||
|
||||
|
||||
def neg(x):
|
||||
":math:`f(x) = -x`"
|
||||
return float(-x)
|
||||
|
||||
|
||||
def lt(x, y):
|
||||
":math:`f(x) =` 1.0 if x is less than y else 0.0"
|
||||
return 1.0 if x < y else 0.0
|
||||
|
||||
|
||||
def eq(x, y):
|
||||
":math:`f(x) =` 1.0 if x is equal to y else 0.0"
|
||||
return 1.0 if x == y else 0.0
|
||||
|
||||
|
||||
def max(x, y):
|
||||
":math:`f(x) =` x if x is greater than y else y"
|
||||
return x if x > y else y
|
||||
|
||||
|
||||
def is_close(x, y):
|
||||
":math:`f(x) = |x - y| < 1e-2`"
|
||||
return abs(x - y) < 1e-2
|
||||
|
||||
|
||||
def sigmoid(x):
|
||||
r"""
|
||||
:math:`f(x) = \frac{1.0}{(1.0 + e^{-x})}`
|
||||
|
||||
(See `<https://en.wikipedia.org/wiki/Sigmoid_function>`_ .)
|
||||
|
||||
Calculate as
|
||||
|
||||
:math:`f(x) = \frac{1.0}{(1.0 + e^{-x})}` if x >=0 else :math:`\frac{e^x}{(1.0 + e^{x})}`
|
||||
|
||||
for stability.
|
||||
|
||||
Args:
|
||||
x (float): input
|
||||
|
||||
Returns:
|
||||
float : sigmoid value
|
||||
"""
|
||||
if x >= 0:
    return 1.0 / (1.0 + math.exp(-x))
return math.exp(x) / (1.0 + math.exp(x))
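# Sanity sketch (not part of the original file): the two branches keep
# math.exp from overflowing for large magnitude inputs.
#
#   assert sigmoid(0.0) == 0.5
#   assert abs(sigmoid(100.0) - 1.0) < 1e-6
#   assert sigmoid(-100.0) < 1e-6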
|
||||
|
||||
|
||||
def relu(x):
|
||||
"""
|
||||
:math:`f(x) =` x if x is greater than 0, else 0
|
||||
|
||||
(See `<https://en.wikipedia.org/wiki/Rectifier_(neural_networks)>`_ .)
|
||||
|
||||
Args:
|
||||
x (float): input
|
||||
|
||||
Returns:
|
||||
float : relu value
|
||||
"""
|
||||
return (x > 0) * x
|
||||
|
||||
|
||||
EPS = 1e-6
|
||||
|
||||
|
||||
def log(x):
|
||||
":math:`f(x) = log(x)`"
|
||||
return math.log(x + EPS)
|
||||
|
||||
|
||||
def exp(x):
|
||||
":math:`f(x) = e^{x}`"
|
||||
return math.exp(x)
|
||||
|
||||
|
||||
def log_back(x, d):
|
||||
r"If :math:`f = log` as above, compute d :math:`d \times f'(x)`"
|
||||
return d / x
|
||||
|
||||
|
||||
def inv(x):
|
||||
":math:`f(x) = 1/x`"
|
||||
return 1 / x
|
||||
|
||||
|
||||
def inv_back(x, d):
|
||||
r"If :math:`f(x) = 1/x` compute d :math:`d \times f'(x)`"
|
||||
return -d / x ** 2
|
||||
|
||||
|
||||
def relu_back(x, d):
|
||||
r"If :math:`f = relu` compute d :math:`d \times f'(x)`"
|
||||
return d if x > 0 else 0.0
|
||||
|
||||
|
||||
def sigmoid_back(x, d):
    r"If :math:`f = sigmoid` as above, compute :math:`d \times f'(x)`"
    return d * sigmoid(x) * (1.0 - sigmoid(x))
|
||||
|
||||
|
||||
# ## Task 0.3
|
||||
|
||||
# Small library of elementary higher-order functions for practice.
|
||||
|
||||
|
||||
def map(fn):
|
||||
"""
|
||||
Higher-order map.
|
||||
|
||||
.. image:: figs/Ops/maplist.png
|
||||
|
||||
|
||||
See `<https://en.wikipedia.org/wiki/Map_(higher-order_function)>`_
|
||||
|
||||
Args:
|
||||
fn (one-arg function): Function from one value to one value.
|
||||
|
||||
Returns:
|
||||
function : A function that takes a list, applies `fn` to each element, and returns a
|
||||
new list
|
||||
"""
|
||||
return lambda list: [fn(x) for x in list]
|
||||
|
||||
|
||||
def negList(ls):
|
||||
"Use :func:`map` and :func:`neg` to negate each element in `ls`"
|
||||
return map(neg)(ls)
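# Example sketch (not part of the original file): `map` builds a list
# transformer out of a one-argument function.
#
#   assert map(neg)([1.0, -2.0]) == [-1.0, 2.0]
#   assert negList([1.0, -2.0]) == [-1.0, 2.0]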
|
||||
|
||||
|
||||
def zipWith(fn):
|
||||
"""
|
||||
Higher-order zipwith (or map2).
|
||||
|
||||
.. image:: figs/Ops/ziplist.png
|
||||
|
||||
See `<https://en.wikipedia.org/wiki/Map_(higher-order_function)>`_
|
||||
|
||||
Args:
|
||||
fn (two-arg function): combine two values
|
||||
|
||||
Returns:
|
||||
function : takes two equally sized lists `ls1` and `ls2` and produces a new list by
applying fn(x, y) to each pair of elements.
|
||||
|
||||
"""
|
||||
return lambda ls1, ls2: [fn(x, y) for x, y in zip(ls1, ls2)]
|
||||
|
||||
|
||||
def addLists(ls1, ls2):
|
||||
"Add the elements of `ls1` and `ls2` using :func:`zipWith` and :func:`add`"
|
||||
return zipWith(add)(ls1, ls2)
|
||||
|
||||
|
||||
def reduce(fn, start):
|
||||
r"""
|
||||
Higher-order reduce.
|
||||
|
||||
.. image:: figs/Ops/reducelist.png
|
||||
|
||||
|
||||
Args:
|
||||
fn (two-arg function): combine two values
|
||||
start (float): start value :math:`x_0`
|
||||
|
||||
Returns:
|
||||
function : function that takes a list `ls` of elements
|
||||
:math:`x_1 \ldots x_n` and computes the reduction :math:`fn(x_3, fn(x_2,
|
||||
fn(x_1, x_0)))`
|
||||
"""
|
||||
|
||||
def _reduce(ls, fn, start):
|
||||
iterator = iter(ls)
|
||||
for i in iterator:
|
||||
start = fn(start, i)
|
||||
return start
|
||||
|
||||
return lambda ls: _reduce(ls, fn, start)
|
||||
|
||||
|
||||
def sum(ls):
|
||||
"Sum up a list using :func:`reduce` and :func:`add`."
|
||||
return reduce(add, 0)(ls)
|
||||
|
||||
|
||||
def prod(ls):
|
||||
"Product of a list using :func:`reduce` and :func:`mul`."
|
||||
return reduce(mul, 1)(ls)
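# Example sketch (not part of the original file): `reduce` folds a list with a
# two-argument function and a start value.
#
#   assert reduce(add, 0)([1.0, 2.0, 3.0]) == 6.0
#   assert sum([1.0, 2.0, 3.0]) == 6.0
#   assert prod([1.0, 2.0, 3.0]) == 6.0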
|
||||
@@ -0,0 +1,19 @@
|
||||
class Optimizer:
|
||||
def __init__(self, parameters):
|
||||
self.parameters = parameters
|
||||
|
||||
|
||||
class SGD(Optimizer):
|
||||
def __init__(self, parameters, lr=1.0):
|
||||
super().__init__(parameters)
|
||||
self.lr = lr
|
||||
|
||||
def zero_grad(self):
|
||||
for p in self.parameters:
|
||||
if p.value.derivative is not None:
|
||||
p.value._derivative = None
|
||||
|
||||
def step(self):
|
||||
for p in self.parameters:
|
||||
if p.value.derivative is not None:
|
||||
p.update(p.value - self.lr * p.value.derivative)
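# Hedged usage sketch (not part of the original file); assumes a Module
# subclass instance `net` and a hypothetical loss function:
#
#   optim = SGD(net.parameters(), lr=0.1)
#   optim.zero_grad()
#   loss = compute_loss(net)   # hypothetical
#   loss.backward()
#   optim.step()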
|
||||
@@ -0,0 +1,314 @@
|
||||
from .autodiff import FunctionBase, Variable, History
|
||||
from . import operators
|
||||
import numpy as np
|
||||
|
||||
|
||||
# ## Task 1.1
|
||||
# Central Difference calculation
|
||||
|
||||
|
||||
def central_difference(f, *vals, arg=0, epsilon=1e-6):
|
||||
r"""
|
||||
Computes an approximation to the derivative of `f` with respect to one arg.
|
||||
|
||||
See :doc:`derivative` or https://en.wikipedia.org/wiki/Finite_difference for more details.
|
||||
|
||||
Args:
|
||||
f : arbitrary function from n-scalar args to one value
|
||||
*vals (list of floats): n-float values :math:`x_0 \ldots x_{n-1}`
|
||||
arg (int): the number :math:`i` of the arg to compute the derivative
|
||||
epsilon (float): a small constant
|
||||
|
||||
Returns:
|
||||
float : An approximation of :math:`f'_i(x_0, \ldots, x_{n-1})`
|
||||
"""
|
||||
|
||||
def _modify_arg(e):
|
||||
return [x + e if i == arg else x for i, x in enumerate(vals)]
|
||||
|
||||
return (f(*_modify_arg(epsilon)) - f(*_modify_arg(-epsilon))) / (2 * epsilon)
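# Numerical sketch (not part of the original file): for f(x) = x * x the
# central difference at x = 2 is ((2 + eps)^2 - (2 - eps)^2) / (2 * eps) = 4.
#
#   d = central_difference(lambda x: x * x, 2.0)
#   assert abs(d - 4.0) < 1e-4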
|
||||
|
||||
|
||||
# ## Task 1.2 and 1.4
|
||||
# Scalar Forward and Backward
|
||||
|
||||
|
||||
class Scalar(Variable):
|
||||
"""
|
||||
A reimplementation of scalar values for autodifferentiation
|
||||
tracking. Scalar Variables behave as close as possible to standard
|
||||
Python numbers while also tracking the operations that led to the
|
||||
number's creation. They can only be manipulated by
|
||||
:class:`ScalarFunction`.
|
||||
|
||||
Attributes:
|
||||
data (float): The wrapped scalar value.
|
||||
"""
|
||||
|
||||
def __init__(self, v, back=History(), name=None):
|
||||
super().__init__(back, name=name)
|
||||
self.data = float(v)
|
||||
|
||||
def __repr__(self):
|
||||
return "Scalar(%f)" % self.data
|
||||
|
||||
def __mul__(self, b):
|
||||
return Mul.apply(self, b)
|
||||
|
||||
def __truediv__(self, b):
|
||||
return Mul.apply(self, Inv.apply(b))
|
||||
|
||||
def __rtruediv__(self, b):
|
||||
return Mul.apply(b, Inv.apply(self))
|
||||
|
||||
def __add__(self, b):
|
||||
return Add.apply(self, b)
|
||||
|
||||
def __bool__(self):
|
||||
return bool(self.data)
|
||||
|
||||
def __lt__(self, b):
|
||||
return LT.apply(self, b)
|
||||
|
||||
def __gt__(self, b):
|
||||
return LT.apply(b, self)
|
||||
|
||||
def __eq__(self, b):
|
||||
return EQ.apply(self, b)
|
||||
|
||||
def __sub__(self, b):
|
||||
return Add.apply(self, Neg.apply(b))
|
||||
|
||||
def __neg__(self):
|
||||
return Neg.apply(self)
|
||||
|
||||
def log(self):
|
||||
return Log.apply(self)
|
||||
|
||||
def exp(self):
|
||||
return Exp.apply(self)
|
||||
|
||||
def sigmoid(self):
|
||||
return Sigmoid.apply(self)
|
||||
|
||||
def relu(self):
|
||||
return ReLU.apply(self)
|
||||
|
||||
def get_data(self):
|
||||
"Returns the raw float value"
|
||||
return self.data
|
||||
|
||||
|
||||
class ScalarFunction(FunctionBase):
|
||||
"""
|
||||
A wrapper for a mathematical function that processes and produces
|
||||
Scalar variables.
|
||||
|
||||
This is a static class and is never instantiated. We use `class`
|
||||
here to group together the `forward` and `backward` code.
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
def forward(ctx, *inputs):
|
||||
r"""
|
||||
Forward call, compute :math:`f(x_0 \ldots x_{n-1})`.
|
||||
|
||||
Args:
|
||||
ctx (:class:`Context`): A container object to save
|
||||
any information that may be needed
|
||||
for the call to backward.
|
||||
*inputs (list of floats): n-float values :math:`x_0 \ldots x_{n-1}`.
|
||||
|
||||
Should return a float, the computation of the function :math:`f`.
|
||||
"""
|
||||
pass # pragma: no cover
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, d_out):
|
||||
r"""
|
||||
Backward call, computes :math:`f'_{x_i}(x_0 \ldots x_{n-1}) \times d_{out}`.
|
||||
|
||||
Args:
|
||||
ctx (Context): A container object holding any information saved during the corresponding `forward` call.
|
||||
d_out (float): :math:`d_{out}` term in the chain rule.
|
||||
|
||||
Should return the computation of the derivative function
|
||||
:math:`f'_{x_i}` for each input :math:`x_i` times `d_out`.
|
||||
|
||||
"""
|
||||
pass # pragma: no cover
|
||||
|
||||
# Checks.
|
||||
variable = Scalar
|
||||
data_type = float
|
||||
|
||||
@staticmethod
|
||||
def data(a):
|
||||
return a
|
||||
|
||||
|
||||
# Examples
|
||||
class Add(ScalarFunction):
|
||||
"Addition function :math:`f(x, y) = x + y`"
|
||||
|
||||
@staticmethod
|
||||
def forward(ctx, a, b):
|
||||
return a + b
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, d_output):
|
||||
return d_output, d_output
|
||||
|
||||
|
||||
class Log(ScalarFunction):
|
||||
"Log function :math:`f(x) = log(x)`"
|
||||
|
||||
@staticmethod
|
||||
def forward(ctx, a):
|
||||
ctx.save_for_backward(a)
|
||||
return operators.log(a)
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, d_output):
|
||||
a = ctx.saved_values
|
||||
return operators.log_back(a, d_output)
|
||||
|
||||
|
||||
# To implement.
|
||||
|
||||
|
||||
class Mul(ScalarFunction):
|
||||
"Multiplication function"
|
||||
|
||||
@staticmethod
|
||||
def forward(ctx, a, b):
|
||||
ctx.save_for_backward(a, b)
|
||||
return a * b
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, d_output):
|
||||
a, b = ctx.saved_values
|
||||
return b * d_output, a * d_output
|
||||
|
||||
|
||||
class Inv(ScalarFunction):
|
||||
"Inverse function"
|
||||
|
||||
@staticmethod
|
||||
def forward(ctx, a):
|
||||
ctx.save_for_backward(a)
|
||||
return operators.inv(a)
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, d_output):
|
||||
a = ctx.saved_values
|
||||
return operators.inv_back(a, d_output)
|
||||
|
||||
|
||||
class Neg(ScalarFunction):
|
||||
"Negation function"
|
||||
|
||||
@staticmethod
|
||||
def forward(ctx, a):
|
||||
return operators.neg(a)
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, d_output):
|
||||
return operators.neg(d_output)
|
||||
|
||||
|
||||
class Sigmoid(ScalarFunction):
|
||||
"Sigmoid function"
|
||||
|
||||
@staticmethod
|
||||
def forward(ctx, a):
|
||||
ctx.save_for_backward(a)
|
||||
return operators.sigmoid(a)
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, d_output):
|
||||
a = ctx.saved_values
|
||||
return operators.sigmoid_back(a, d_output)
|
||||
|
||||
|
||||
class ReLU(ScalarFunction):
|
||||
"ReLU function"
|
||||
|
||||
@staticmethod
|
||||
def forward(ctx, a):
|
||||
ctx.save_for_backward(a)
|
||||
return operators.relu(a)
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, d_output):
|
||||
a = ctx.saved_values
|
||||
return operators.relu_back(a, d_output)
|
||||
|
||||
|
||||
class Exp(ScalarFunction):
|
||||
"Exp function"
|
||||
|
||||
@staticmethod
|
||||
def forward(ctx, a):
|
||||
ctx.save_for_backward(a)
|
||||
return operators.exp(a)
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, d_output):
|
||||
a = ctx.saved_values
|
||||
return operators.exp(a) * d_output
|
||||
|
||||
|
||||
class LT(ScalarFunction):
|
||||
"Less-than function :math:`f(x) =` 1.0 if x is less than y else 0.0"
|
||||
|
||||
@staticmethod
|
||||
def forward(ctx, a, b):
|
||||
return operators.lt(a, b)
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, d_output):
|
||||
return 0.0, 0.0
|
||||
|
||||
|
||||
class EQ(ScalarFunction):
|
||||
"Equal function :math:`f(x) =` 1.0 if x is equal to y else 0.0"
|
||||
|
||||
@staticmethod
|
||||
def forward(ctx, a, b):
|
||||
return operators.eq(a, b)
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, d_output):
|
||||
return 0.0, 0.0
|
||||
|
||||
|
||||
def derivative_check(f, *scalars):
|
||||
"""
|
||||
Checks that autodiff works on a python function.
|
||||
Asserts False if derivative is incorrect.
|
||||
|
||||
Parameters:
|
||||
f (function) : function from n-scalars to 1-scalar.
|
||||
*scalars (list of :class:`Scalar`) : n input scalar values.
|
||||
"""
|
||||
for x in scalars:
|
||||
x.requires_grad_(True)
|
||||
out = f(*scalars)
|
||||
out.backward()
|
||||
|
||||
vals = [v for v in scalars]
|
||||
err_msg = """
|
||||
Derivative check at arguments f(%s) and received derivative f'=%f for argument %d,
|
||||
but was expecting derivative f'=%f from central difference."""
|
||||
for i, x in enumerate(scalars):
|
||||
check = central_difference(f, *vals, arg=i)
|
||||
print(str([x.data for x in scalars]), x.derivative, i, check)
|
||||
np.testing.assert_allclose(
|
||||
x.derivative,
|
||||
check.data,
|
||||
1e-2,
|
||||
1e-2,
|
||||
err_msg=err_msg
|
||||
% (str([x.data for x in scalars]), x.derivative, i, check.data),
|
||||
)
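# Hedged usage sketch (not part of the original file): verify autodiff for a
# small composite of Scalars against the central difference.
#
#   derivative_check(lambda x, y: x * y + x, Scalar(2.0), Scalar(3.0))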
|
||||
@@ -0,0 +1,239 @@
|
||||
"""
|
||||
Implementation of the core Tensor object for autodifferentiation.
|
||||
"""
|
||||
|
||||
from .autodiff import Variable
|
||||
from .tensor_data import TensorData
|
||||
from . import operators
|
||||
|
||||
|
||||
# This class is very similar to Scalar so we implemented it for you.
|
||||
|
||||
|
||||
class Tensor(Variable):
|
||||
"""
|
||||
Tensor is a generalization of Scalar in that it is a Variable that
|
||||
handles multidimensional arrays.
|
||||
|
||||
Attributes:
|
||||
|
||||
_tensor (:class:`TensorData`) : the tensor data storage
|
||||
backend : backend object used to implement tensor math (see `tensor_functions.py`)
|
||||
"""
|
||||
|
||||
def __init__(self, v, back=None, name=None, backend=None):
|
||||
assert isinstance(v, TensorData)
|
||||
assert backend is not None
|
||||
super().__init__(back, name=name)
|
||||
self._tensor = v
|
||||
self.backend = backend
|
||||
|
||||
def to_numpy(self):
|
||||
"""
|
||||
Returns:
|
||||
ndarray : converted to numpy array
|
||||
"""
|
||||
return self.contiguous()._tensor._storage.reshape(self.shape)
|
||||
|
||||
# Properties
|
||||
@property
|
||||
def shape(self):
|
||||
"""
|
||||
Returns:
|
||||
tuple : shape of the tensor
|
||||
"""
|
||||
return self._tensor.shape
|
||||
|
||||
@property
|
||||
def size(self):
|
||||
"""
|
||||
Returns:
|
||||
int : size of the tensor
|
||||
"""
|
||||
return self._tensor.size
|
||||
|
||||
@property
|
||||
def dims(self):
|
||||
"""
|
||||
Returns:
|
||||
int : dimensionality of the tensor
|
||||
"""
|
||||
return self._tensor.dims
|
||||
|
||||
def _ensure_tensor(self, b):
|
||||
"Turns a python number into a tensor with the same backend."
|
||||
if isinstance(b, (int, float)):
|
||||
b = Tensor.make([b], (1,), backend=self.backend)
|
||||
else:
|
||||
b._type_(self.backend)
|
||||
return b
|
||||
|
||||
# Functions
|
||||
def __add__(self, b):
|
||||
return self.backend.Add.apply(self, self._ensure_tensor(b))
|
||||
|
||||
def __sub__(self, b):
|
||||
return self.backend.Add.apply(self, -self._ensure_tensor(b))
|
||||
|
||||
def __mul__(self, b):
|
||||
return self.backend.Mul.apply(self, self._ensure_tensor(b))
|
||||
|
||||
def __truediv__(self, b):
|
||||
return self.backend.Mul.apply(
|
||||
self, self.backend.Inv.apply(self._ensure_tensor(b))
|
||||
)
|
||||
|
||||
def __rtruediv__(self, b):
|
||||
return self.backend.Mul.apply(
|
||||
self._ensure_tensor(b), self.backend.Inv.apply(self)
|
||||
)
|
||||
|
||||
def __matmul__(self, b):
|
||||
"Not used until Module 3"
|
||||
return self.backend.MatMul.apply(self, b)
|
||||
|
||||
def __lt__(self, b):
|
||||
return self.backend.LT.apply(self, self._ensure_tensor(b))
|
||||
|
||||
def __eq__(self, b):
|
||||
return self.backend.EQ.apply(self, self._ensure_tensor(b))
|
||||
|
||||
def __gt__(self, b):
|
||||
return self.backend.LT.apply(self._ensure_tensor(b), self)
|
||||
|
||||
def __neg__(self):
|
||||
return self.backend.Neg.apply(self)
|
||||
|
||||
def all(self, dim=None):
|
||||
return self.backend.All.apply(self, dim)
|
||||
|
||||
def is_close(self, y):
|
||||
return self.backend.IsClose.apply(self, y)
|
||||
|
||||
def sigmoid(self):
|
||||
return self.backend.Sigmoid.apply(self)
|
||||
|
||||
def relu(self):
|
||||
return self.backend.ReLU.apply(self)
|
||||
|
||||
def log(self):
|
||||
return self.backend.Log.apply(self)
|
||||
|
||||
def item(self):
|
||||
assert self.size == 1
|
||||
return self[0]
|
||||
|
||||
def exp(self):
|
||||
return self.backend.Exp.apply(self)
|
||||
|
||||
def sum(self, dim=None):
|
||||
"Compute the sum over dimension `dim`"
|
||||
return self.backend.Sum.apply(self, dim)
|
||||
|
||||
def mean(self, dim=None):
|
||||
"Compute the mean over dimension `dim`"
|
||||
if dim is not None:
|
||||
return self.sum(dim) / self.shape[dim]
|
||||
else:
|
||||
return self.sum() / self.size
|
||||
|
||||
def permute(self, *order):
|
||||
"Permute tensor dimensions to *order"
|
||||
return self.backend.Permute.apply(self, order)
|
||||
|
||||
def view(self, *shape):
|
||||
"Change the shape of the tensor to a new shape with the same size"
|
||||
return self.backend.View.apply(self, shape)
|
||||
|
||||
def contiguous(self):
|
||||
"Return a contiguous tensor with the same data"
|
||||
return self.backend.Copy.apply(self)
|
||||
|
||||
def __repr__(self):
|
||||
return self._tensor.to_string()
|
||||
|
||||
def __getitem__(self, key):
|
||||
return self._tensor.get(key)
|
||||
|
||||
def __setitem__(self, key, val):
|
||||
self._tensor.set(key, val)
|
||||
|
||||
@property
|
||||
def grad(self):
|
||||
return self.derivative
|
||||
|
||||
# Internal methods used for autodiff.
|
||||
def _type_(self, backend):
|
||||
self.backend = backend
|
||||
if backend.cuda: # pragma: no cover
|
||||
self._tensor.to_cuda_()
|
||||
|
||||
def _new(self, tensor_data):
|
||||
return Tensor(tensor_data, backend=self.backend)
|
||||
|
||||
@staticmethod
|
||||
def make(storage, shape, strides=None, backend=None):
|
||||
"Create a new tensor from data"
|
||||
return Tensor(TensorData(storage, shape, strides), backend=backend)
|
||||
|
||||
def expand(self, other):
|
||||
"""
|
||||
Method used to allow for backprop over broadcasting.
|
||||
This method is called when the output of `backward`
|
||||
is a different size than the input of `forward`.
|
||||
|
||||
|
||||
Parameters:
|
||||
other (:class:`Tensor`): backward tensor (must broadcast with self)
|
||||
|
||||
Returns:
|
||||
Expanded version of `other` with the right derivatives
|
||||
|
||||
"""
|
||||
|
||||
# Case 1: Both the same shape.
|
||||
if self.shape == other.shape:
|
||||
return other
|
||||
|
||||
# Case 2: Backward is smaller than self. Broadcast up.
|
||||
true_shape = TensorData.shape_broadcast(self.shape, other.shape)
|
||||
buf = self.zeros(true_shape)
|
||||
self.backend._id_map(other, out=buf)
|
||||
if self.shape == true_shape:
|
||||
return buf
|
||||
|
||||
# Case 3: Still different, reduce extra dims.
|
||||
out = buf
|
||||
orig_shape = [1] * (len(out.shape) - len(self.shape)) + list(self.shape)
|
||||
for dim, shape in enumerate(out.shape):
|
||||
if orig_shape[dim] == 1 and shape != 1:
|
||||
out = self.backend._add_reduce(out, dim)
|
||||
assert out.size == self.size, f"{out.shape} {self.shape}"
|
||||
# START CODE CHANGE (2021)
|
||||
return Tensor.make(out._tensor._storage, self.shape, backend=self.backend)
|
||||
# END CODE CHANGE (2021)
|
||||
|
||||
def zeros(self, shape=None):
|
||||
def zero(shape):
|
||||
return Tensor.make(
|
||||
[0] * int(operators.prod(shape)), shape, backend=self.backend
|
||||
)
|
||||
|
||||
if shape is None:
|
||||
out = zero(self.shape)
|
||||
else:
|
||||
out = zero(shape)
|
||||
out._type_(self.backend)
|
||||
return out
|
||||
|
||||
def tuple(self):
|
||||
return self._tensor.tuple()
|
||||
|
||||
def get_data(self):
|
||||
return Tensor(self._tensor, backend=self.backend)
|
||||
|
||||
def backward(self, grad_output=None):
|
||||
if grad_output is None:
|
||||
assert self.shape == (1,), "Must provide grad_output if non-scalar"
|
||||
grad_output = Tensor.make([1.0], (1,), backend=self.backend)
|
||||
super().backward(grad_output)
|
||||
@@ -0,0 +1,243 @@
|
||||
import random
|
||||
from .operators import prod
|
||||
from numpy import array, float64, ndarray
|
||||
import numba
|
||||
|
||||
MAX_DIMS = 32
|
||||
|
||||
|
||||
class IndexingError(RuntimeError):
|
||||
"Exception raised for indexing errors."
|
||||
pass
|
||||
|
||||
|
||||
def index_to_position(index, strides):
|
||||
"""
|
||||
Converts a multidimensional tensor `index` into a single-dimensional position in
|
||||
storage based on strides.
|
||||
|
||||
Args:
|
||||
index (array-like): index tuple of ints
|
||||
strides (array-like): tensor strides
|
||||
|
||||
Returns:
|
||||
int : position in storage
|
||||
"""
|
||||
|
||||
assert len(index) == len(strides)
|
||||
return sum([i * s for i, s in zip(index, strides)])
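# Worked example (not part of the original file): with strides (4, 1), the
# index (1, 2) of a 3 x 4 tensor sits at storage position 1 * 4 + 2 * 1 = 6.
#
#   assert index_to_position([1, 2], [4, 1]) == 6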
|
||||
|
||||
|
||||
def to_index(ordinal, shape, out_index):
|
||||
"""
|
||||
Convert an `ordinal` to an index in the `shape`.
|
||||
Should ensure that enumerating position 0 ... size of a
|
||||
tensor produces every index exactly once. It
|
||||
may not be the inverse of `index_to_position`.
|
||||
|
||||
Args:
|
||||
ordinal (int): ordinal position to convert.
|
||||
shape (tuple): tensor shape.
|
||||
out_index (array): the index corresponding to position.
|
||||
|
||||
Returns:
|
||||
None : Fills in `out_index`.
|
||||
|
||||
"""
|
||||
|
||||
for i, s in enumerate(shape):
    divisor = int(prod(shape[i:])) // int(s)
    index = int(ordinal // divisor)
    ordinal -= index * divisor
    out_index[i] = index
|
||||
|
||||
|
||||
def broadcast_index(big_index, big_shape, shape, out_index):
|
||||
"""
|
||||
Convert a `big_index` into `big_shape` to a smaller `out_index`
|
||||
into `shape` following broadcasting rules. In this case
|
||||
it may be larger or with more dimensions than the `shape`
|
||||
given. Additional dimensions may need to be mapped to 0 or
|
||||
removed.
|
||||
|
||||
Args:
|
||||
big_index (array-like): multidimensional index of bigger tensor
|
||||
big_shape (array-like): tensor shape of bigger tensor
|
||||
shape (array-like): tensor shape of smaller tensor
|
||||
out_index (array-like): multidimensional index of smaller tensor
|
||||
|
||||
Returns:
|
||||
None : Fills in `out_index`.
|
||||
"""
|
||||
|
||||
for i in range(len(shape)):
|
||||
out_index[i] = big_index[len(big_shape) - len(shape) + i] if shape[i] > 1 else 0
|
||||
|
||||
|
||||
def shape_broadcast(shape1, shape2):
|
||||
"""
|
||||
Broadcast two shapes to create a new union shape.
|
||||
|
||||
Args:
|
||||
shape1 (tuple) : first shape
|
||||
shape2 (tuple) : second shape
|
||||
|
||||
Returns:
|
||||
tuple : broadcasted shape
|
||||
|
||||
Raises:
|
||||
IndexingError : if cannot broadcast
|
||||
"""
|
||||
|
||||
out = ()
large, small = (shape1, shape2) if len(shape1) > len(shape2) else (shape2, shape1)
diff = len(large) - len(small)

for i in range(len(large)):
    l = large[i]
    s = l if i < diff else small[i - diff]

    if l != s and l != 1 and s != 1:
        raise IndexingError(f"Cannot broadcast dimensions {l} and {s} in {shape1} and {shape2}.")
    out += (max(l, s),)

return out
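# Broadcasting sketch (not part of the original file): shapes are aligned from
# the right and size-1 dimensions stretch to match.
#
#   assert shape_broadcast((2, 1, 3), (4, 1)) == (2, 4, 3)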
|
||||
|
||||
|
||||
def strides_from_shape(shape):
|
||||
layout = [1]
|
||||
offset = 1
|
||||
for s in reversed(shape):
|
||||
layout.append(s * offset)
|
||||
offset = s * offset
|
||||
return tuple(reversed(layout[:-1]))
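# Worked example (not part of the original file): a contiguous (2, 3, 4)
# tensor has strides (12, 4, 1), since one step along dimension 0 skips
# 3 * 4 = 12 storage positions.
#
#   assert strides_from_shape((2, 3, 4)) == (12, 4, 1)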
|
||||
|
||||
|
||||
class TensorData:
|
||||
def __init__(self, storage, shape, strides=None):
|
||||
if isinstance(storage, ndarray):
|
||||
self._storage = storage
|
||||
else:
|
||||
self._storage = array(storage, dtype=float64)
|
||||
|
||||
if strides is None:
|
||||
strides = strides_from_shape(shape)
|
||||
|
||||
assert isinstance(strides, tuple), "Strides must be tuple"
|
||||
assert isinstance(shape, tuple), "Shape must be tuple"
|
||||
if len(strides) != len(shape):
|
||||
raise IndexingError(f"Len of strides {strides} must match {shape}.")
|
||||
self._strides = array(strides)
|
||||
self._shape = array(shape)
|
||||
self.strides = strides
|
||||
self.dims = len(strides)
|
||||
self.size = int(prod(shape))
|
||||
self.shape = shape
|
||||
assert len(self._storage) == self.size
|
||||
|
||||
def to_cuda_(self): # pragma: no cover
|
||||
if not numba.cuda.is_cuda_array(self._storage):
|
||||
self._storage = numba.cuda.to_device(self._storage)
|
||||
|
||||
def is_contiguous(self):
|
||||
"""
|
||||
Check that the layout is contiguous, i.e. outer dimensions have bigger strides than inner dimensions.
|
||||
|
||||
Returns:
|
||||
bool : True if contiguous
|
||||
"""
|
||||
last = 1e9
|
||||
for stride in self._strides:
|
||||
if stride > last:
|
||||
return False
|
||||
last = stride
|
||||
return True
|
||||
|
||||
@staticmethod
|
||||
def shape_broadcast(shape_a, shape_b):
|
||||
return shape_broadcast(shape_a, shape_b)
|
||||
|
||||
def index(self, index):
|
||||
if isinstance(index, int):
|
||||
index = array([index])
|
||||
if isinstance(index, tuple):
|
||||
index = array(index)
|
||||
|
||||
# Check for errors
|
||||
if index.shape[0] != len(self.shape):
|
||||
raise IndexingError(f"Index {index} must be size of {self.shape}.")
|
||||
for i, ind in enumerate(index):
|
||||
if ind >= self.shape[i]:
|
||||
raise IndexingError(f"Index {index} out of range {self.shape}.")
|
||||
if ind < 0:
|
||||
raise IndexingError(f"Negative indexing for {index} not supported.")
|
||||
|
||||
# Call fast indexing.
|
||||
return index_to_position(array(index), self._strides)
|
||||
|
||||
def indices(self):
|
||||
lshape = array(self.shape)
|
||||
out_index = array(self.shape)
|
||||
for i in range(self.size):
|
||||
to_index(i, lshape, out_index)
|
||||
yield tuple(out_index)
|
||||
|
||||
def sample(self):
|
||||
return tuple((random.randint(0, s - 1) for s in self.shape))
|
||||
|
||||
def get(self, key):
|
||||
return self._storage[self.index(key)]
|
||||
|
||||
def set(self, key, val):
|
||||
self._storage[self.index(key)] = val
|
||||
|
||||
def tuple(self):
|
||||
return (self._storage, self._shape, self._strides)
|
||||
|
||||
def permute(self, *order):
|
||||
"""
|
||||
Permute the dimensions of the tensor.
|
||||
|
||||
Args:
|
||||
order (list): a permutation of the dimensions
|
||||
|
||||
Returns:
|
||||
:class:`TensorData`: a new TensorData with the same storage and a new dimension order.
|
||||
"""
|
||||
assert list(sorted(order)) == list(
|
||||
range(len(self.shape))
|
||||
), f"Must give a position to each dimension. Shape: {self.shape} Order: {order}"
|
||||
|
||||
return TensorData(
|
||||
self._storage,
|
||||
tuple(self.shape[x] for x in order),
|
||||
tuple(self.strides[x] for x in order),
|
||||
)
|
||||
|
||||
def to_string(self):
|
||||
s = ""
|
||||
for index in self.indices():
|
||||
l = ""
|
||||
for i in range(len(index) - 1, -1, -1):
|
||||
if index[i] == 0:
|
||||
l = "\n%s[" % ("\t" * i) + l
|
||||
else:
|
||||
break
|
||||
s += l
|
||||
v = self.get(index)
|
||||
s += f"{v:3.2f}"
|
||||
l = ""
|
||||
for i in range(len(index) - 1, -1, -1):
|
||||
if index[i] == self.shape[i] - 1:
|
||||
l += "]"
|
||||
else:
|
||||
break
|
||||
if l:
|
||||
s += l
|
||||
else:
|
||||
s += " "
|
||||
return s
|
||||
@@ -0,0 +1,410 @@
|
||||
"""
|
||||
Implementation of the autodifferentiation Functions for Tensor.
|
||||
"""
|
||||
|
||||
|
||||
from .autodiff import FunctionBase
|
||||
from .tensor_ops import TensorOps
|
||||
import numpy as np
|
||||
from . import operators
|
||||
from .tensor import Tensor
|
||||
import random
|
||||
|
||||
|
||||
# Constructors
|
||||
class Function(FunctionBase):
|
||||
data_type = Tensor
|
||||
|
||||
@staticmethod
|
||||
def variable(data, back):
|
||||
return Tensor(data[0], back, backend=data[1])
|
||||
|
||||
@staticmethod
|
||||
def data(a):
|
||||
return (a._tensor, a.backend)
|
||||
|
||||
|
||||
def make_tensor_backend(tensor_ops, is_cuda=False):
|
||||
"""
|
||||
Dynamically construct a tensor backend based on a `tensor_ops` object
|
||||
that implements map, zip, and reduce higher-order functions.
|
||||
|
||||
Args:
|
||||
tensor_ops (:class:`TensorOps`) : tensor operations object see `tensor_ops.py`
|
||||
is_cuda (bool) : is the operations object CUDA / GPU based
|
||||
|
||||
Returns :
|
||||
backend : a collection of tensor functions
|
||||
|
||||
"""
|
||||
# Maps
|
||||
neg_map = tensor_ops.map(operators.neg)
|
||||
sigmoid_map = tensor_ops.map(operators.sigmoid)
|
||||
relu_map = tensor_ops.map(operators.relu)
|
||||
log_map = tensor_ops.map(operators.log)
|
||||
exp_map = tensor_ops.map(operators.exp)
|
||||
id_map = tensor_ops.map(operators.id)
|
||||
inv_map = tensor_ops.map(operators.inv)
|
||||
|
||||
# Zips
|
||||
add_zip = tensor_ops.zip(operators.add)
|
||||
mul_zip = tensor_ops.zip(operators.mul)
|
||||
lt_zip = tensor_ops.zip(operators.lt)
|
||||
eq_zip = tensor_ops.zip(operators.eq)
|
||||
is_close_zip = tensor_ops.zip(operators.is_close)
|
||||
relu_back_zip = tensor_ops.zip(operators.relu_back)
|
||||
log_back_zip = tensor_ops.zip(operators.log_back)
|
||||
inv_back_zip = tensor_ops.zip(operators.inv_back)
|
||||
|
||||
# Reduce
|
||||
add_reduce = tensor_ops.reduce(operators.add, 0.0)
|
||||
mul_reduce = tensor_ops.reduce(operators.mul, 1.0)
|
||||
|
||||
class Backend:
|
||||
cuda = is_cuda
|
||||
_id_map = id_map
|
||||
_add_reduce = add_reduce
|
||||
|
||||
class Neg(Function):
|
||||
@staticmethod
|
||||
def forward(ctx, t1):
|
||||
return neg_map(t1)
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, grad_output):
|
||||
return neg_map(grad_output)
|
||||
|
||||
class Inv(Function):
|
||||
@staticmethod
|
||||
def forward(ctx, t1):
|
||||
ctx.save_for_backward(t1)
|
||||
return inv_map(t1)
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, grad_output):
|
||||
t1 = ctx.saved_values
|
||||
return inv_back_zip(t1, grad_output)
|
||||
|
||||
class Add(Function):
|
||||
@staticmethod
|
||||
def forward(ctx, t1, t2):
|
||||
return add_zip(t1, t2)
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, grad_output):
|
||||
return grad_output, grad_output
|
||||
|
||||
class Mul(Function):
|
||||
@staticmethod
|
||||
def forward(ctx, a, b):
|
||||
ctx.save_for_backward(a, b)
|
||||
return mul_zip(a, b)
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, grad_output):
|
||||
a, b = ctx.saved_values
|
||||
return b * grad_output, a * grad_output
|
||||
|
||||
class Sigmoid(Function):
|
||||
@staticmethod
|
||||
def forward(ctx, a):
|
||||
ctx.save_for_backward(a)
|
||||
return sigmoid_map(a)
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, grad_output):
|
||||
a = ctx.saved_values
|
||||
return add_zip(
|
||||
mul_zip(grad_output, sigmoid_map(a)),
|
||||
neg_map(
|
||||
mul_zip(grad_output, mul_zip(sigmoid_map(a), sigmoid_map(a)))
|
||||
),
|
||||
)
|
||||
|
||||
class ReLU(Function):
|
||||
@staticmethod
|
||||
def forward(ctx, a):
|
||||
ctx.save_for_backward(a)
|
||||
return relu_map(a)
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, grad_output):
|
||||
a = ctx.saved_values
|
||||
return relu_back_zip(a, grad_output)
|
||||
|
||||
class Log(Function):
|
||||
@staticmethod
|
||||
def forward(ctx, a):
|
||||
ctx.save_for_backward(a)
|
||||
return log_map(a)
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, grad_output):
|
||||
a = ctx.saved_values
|
||||
return log_back_zip(a, grad_output)
|
||||
|
||||
class Exp(Function):
|
||||
@staticmethod
|
||||
def forward(ctx, a):
|
||||
ctx.save_for_backward(a)
|
||||
return exp_map(a)
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, grad_output):
|
||||
a = ctx.saved_values
|
||||
return mul_zip(exp_map(a), grad_output)
|
||||
|
||||
class Sum(Function):
|
||||
@staticmethod
|
||||
def forward(ctx, a, dim):
|
||||
ctx.save_for_backward(a.shape, dim)
|
||||
if dim is not None:
|
||||
return add_reduce(a, dim)
|
||||
else:
|
||||
return add_reduce(
|
||||
a.contiguous().view(int(operators.prod(a.shape))), 0
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, grad_output):
|
||||
a_shape, dim = ctx.saved_values
|
||||
if dim is None:
|
||||
out = grad_output.zeros(a_shape)
|
||||
out._tensor._storage[:] = grad_output[0]
|
||||
return out
|
||||
else:
|
||||
return grad_output
|
||||
|
||||
class All(Function):
|
||||
@staticmethod
|
||||
def forward(ctx, a, dim):
|
||||
if dim is not None:
|
||||
return mul_reduce(a, dim)
|
||||
else:
|
||||
return mul_reduce(
|
||||
a.contiguous().view(int(operators.prod(a.shape))), 0
|
||||
)
|
||||
|
||||
class LT(Function):
|
||||
@staticmethod
|
||||
def forward(ctx, a, b):
|
||||
return lt_zip(a, b)
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, grad_output):
|
||||
return (
|
||||
grad_output.zeros(grad_output.shape),
|
||||
grad_output.zeros(grad_output.shape),
|
||||
)
|
||||
|
||||
class EQ(Function):
|
||||
@staticmethod
|
||||
def forward(ctx, a, b):
|
||||
return eq_zip(a, b)
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, grad_output):
|
||||
return (
|
||||
grad_output.zeros(grad_output.shape),
|
||||
grad_output.zeros(grad_output.shape),
|
||||
)
|
||||
|
||||
class IsClose(Function):
|
||||
@staticmethod
|
||||
def forward(ctx, a, b):
|
||||
return is_close_zip(a, b)
|
||||
|
||||
class Permute(Function):
|
||||
@staticmethod
|
||||
def forward(ctx, a, order):
|
||||
ctx.save_for_backward(a, order)
|
||||
tens_store = a._tensor.permute(*order)
|
||||
return Tensor.make(
|
||||
tens_store._storage,
|
||||
tens_store.shape,
|
||||
tens_store.strides,
|
||||
backend=a.backend,
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, grad_output):
|
||||
a, order = ctx.saved_values
|
||||
return Tensor.make(
|
||||
grad_output._tensor._storage,
|
||||
a._tensor.shape,
|
||||
a._tensor.strides,
|
||||
backend=grad_output.backend,
|
||||
)
|
||||
|
||||
class View(Function):
|
||||
@staticmethod
|
||||
def forward(ctx, a, shape):
|
||||
ctx.save_for_backward(a.shape)
|
||||
assert a._tensor.is_contiguous(), "Must be contiguous to view"
|
||||
return Tensor.make(a._tensor._storage, shape, backend=a.backend)
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, grad_output):
|
||||
original = ctx.saved_values
|
||||
return Tensor.make(
|
||||
grad_output._tensor._storage, original, backend=grad_output.backend
|
||||
)
|
||||
|
||||
class Copy(Function):
|
||||
@staticmethod
|
||||
def forward(ctx, a):
|
||||
return id_map(a)
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, grad_output):
|
||||
return grad_output
|
||||
|
||||
class MatMul(Function):
|
||||
@staticmethod
|
||||
def forward(ctx, t1, t2):
|
||||
ctx.save_for_backward(t1, t2)
|
||||
return tensor_ops.matrix_multiply(t1, t2)
|
||||
|
||||
@staticmethod
|
||||
def backward(ctx, grad_output):
|
||||
t1, t2 = ctx.saved_values
|
||||
|
||||
def transpose(a):
|
||||
order = list(range(a.dims))
|
||||
order[-2], order[-1] = order[-1], order[-2]
|
||||
return a._new(a._tensor.permute(*order))
|
||||
|
||||
return (
|
||||
tensor_ops.matrix_multiply(grad_output, transpose(t2)),
|
||||
tensor_ops.matrix_multiply(transpose(t1), grad_output),
|
||||
)
|
||||
|
||||
return Backend
|
||||
|
||||
|
||||
TensorFunctions = make_tensor_backend(TensorOps)
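# Hedged sketch (not part of the original file): a GPU backend would be built
# the same way from a CUDA ops object, e.g.
#
#   # CudaTensorFunctions = make_tensor_backend(CudaOps, is_cuda=True)
#   # (CudaOps is assumed to come from cuda_ops.py)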
|
||||
|
||||
|
||||
# Helpers for Constructing tensors
|
||||
def zeros(shape, backend=TensorFunctions):
|
||||
"""
|
||||
Produce a zero tensor of size `shape`.
|
||||
|
||||
Args:
|
||||
shape (tuple): shape of tensor
|
||||
backend (:class:`Backend`): tensor backend
|
||||
|
||||
Returns:
|
||||
:class:`Tensor` : new tensor
|
||||
"""
|
||||
return Tensor.make([0] * int(operators.prod(shape)), shape, backend=backend)
|
||||
|
||||
|
||||
def rand(shape, backend=TensorFunctions, requires_grad=False):
|
||||
"""
|
||||
Produce a random tensor of size `shape`.
|
||||
|
||||
Args:
|
||||
shape (tuple): shape of tensor
|
||||
backend (:class:`Backend`): tensor backend
|
||||
requires_grad (bool): turn on autodifferentiation
|
||||
|
||||
Returns:
|
||||
:class:`Tensor` : new tensor
|
||||
"""
|
||||
vals = [random.random() for _ in range(int(operators.prod(shape)))]
|
||||
tensor = Tensor.make(vals, shape, backend=backend)
|
||||
tensor.requires_grad_(requires_grad)
|
||||
return tensor
|
||||
|
||||
|
||||
def _tensor(ls, shape=None, backend=TensorFunctions, requires_grad=False):
|
||||
"""
|
||||
Produce a tensor with data ls and shape `shape`.
|
||||
|
||||
Args:
|
||||
ls (list): data for tensor
|
||||
shape (tuple): shape of tensor
|
||||
backend (:class:`Backend`): tensor backend
|
||||
requires_grad (bool): turn on autodifferentiation
|
||||
|
||||
Returns:
|
||||
:class:`Tensor` : new tensor
|
||||
"""
|
||||
tensor = Tensor.make(ls, shape, backend=backend)
|
||||
tensor.requires_grad_(requires_grad)
|
||||
return tensor
|
||||
|
||||
|
||||
def tensor(ls, backend=TensorFunctions, requires_grad=False):
|
||||
"""
|
||||
Produce a tensor with data and shape from ls
|
||||
|
||||
Args:
|
||||
ls (list): data for tensor
|
||||
backend (:class:`Backend`): tensor backend
|
||||
requires_grad (bool): turn on autodifferentiation
|
||||
|
||||
Returns:
|
||||
:class:`Tensor` : new tensor
|
||||
"""
|
||||
|
||||
def shape(ls):
|
||||
if isinstance(ls, (list, tuple)):
|
||||
return [len(ls)] + shape(ls[0])
|
||||
else:
|
||||
return []
|
||||
|
||||
def flatten(ls):
|
||||
if isinstance(ls, (list, tuple)):
|
||||
return [y for x in ls for y in flatten(x)]
|
||||
else:
|
||||
return [ls]
|
||||
|
||||
cur = flatten(ls)
|
||||
shape = shape(ls)
|
||||
return _tensor(cur, tuple(shape), backend=backend, requires_grad=requires_grad)
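# Usage sketch (not part of the original file): `tensor` infers the shape from
# nested lists and flattens the data.
#
#   t = tensor([[1, 2, 3], [4, 5, 6]])
#   assert t.shape == (2, 3)
#   assert t[1, 2] == 6.0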
|
||||
|
||||
|
||||
# Gradient check for tensors
|
||||
|
||||
|
||||
def grad_central_difference(f, *vals, arg=0, epsilon=1e-6, ind=None):
|
||||
x = vals[arg]
|
||||
up = zeros(x.shape)
|
||||
up[ind] = epsilon
|
||||
vals1 = [x if j != arg else x + up for j, x in enumerate(vals)]
|
||||
vals2 = [x if j != arg else x - up for j, x in enumerate(vals)]
|
||||
delta = f(*vals1).sum() - f(*vals2).sum()
|
||||
|
||||
return delta[0] / (2.0 * epsilon)
|
||||
|
||||
|
||||
def grad_check(f, *vals):
|
||||
for x in vals:
|
||||
x.requires_grad_(True)
|
||||
x.zero_grad_()
|
||||
random.seed(10)
|
||||
out = f(*vals)
|
||||
out.sum().backward()
|
||||
err_msg = """
|
||||
|
||||
Gradient check error for function %s.
|
||||
|
||||
Input %s
|
||||
|
||||
Received derivative %f for argument %d and index %s,
|
||||
but was expecting derivative %f from central difference.
|
||||
|
||||
"""
|
||||
|
||||
for i, x in enumerate(vals):
|
||||
ind = x._tensor.sample()
|
||||
check = grad_central_difference(f, *vals, arg=i, ind=ind)
|
||||
np.testing.assert_allclose(
|
||||
x.grad[ind],
|
||||
check,
|
||||
1e-2,
|
||||
1e-2,
|
||||
err_msg=err_msg % (f, vals, x.grad[ind], i, ind, check),
|
||||
)
|
||||
@@ -0,0 +1,284 @@
|
||||
import numpy as np
|
||||
from .tensor_data import (
|
||||
to_index,
|
||||
index_to_position,
|
||||
broadcast_index,
|
||||
shape_broadcast,
|
||||
MAX_DIMS,
|
||||
)
|
||||
|
||||
|
||||
def tensor_map(fn):
|
||||
"""
|
||||
Low-level implementation of tensor map between
|
||||
tensors with *possibly different strides*.
|
||||
|
||||
Simple version:
|
||||
|
||||
* Fill in the `out` array by applying `fn` to each
|
||||
value of `in_storage` assuming `out_shape` and `in_shape`
|
||||
are the same size.
|
||||
|
||||
Broadcasted version:
|
||||
|
||||
* Fill in the `out` array by applying `fn` to each
|
||||
value of `in_storage` assuming `out_shape` and `in_shape`
|
||||
broadcast. (`in_shape` must be smaller than `out_shape`).
|
||||
|
||||
Args:
|
||||
fn: function from float-to-float to apply
|
||||
out (array): storage for out tensor
|
||||
out_shape (array): shape for out tensor
|
||||
out_strides (array): strides for out tensor
|
||||
in_storage (array): storage for in tensor
|
||||
in_shape (array): shape for in tensor
|
||||
in_strides (array): strides for in tensor
|
||||
|
||||
Returns:
|
||||
None : Fills in `out`
|
||||
"""
|
||||
|
||||
def _map(out, out_shape, out_strides, in_storage, in_shape, in_strides):
|
||||
in_index = np.zeros(len(in_shape), dtype=int)
|
||||
out_index = np.zeros(len(out_shape), dtype=int)
|
||||
|
||||
for i in range(len(out)):
|
||||
to_index(i, out_shape, out_index)
|
||||
broadcast_index(out_index, out_shape, in_shape, in_index)
|
||||
|
||||
x = in_storage[index_to_position(in_index, in_strides)]
|
||||
index = index_to_position(out_index, out_strides)
|
||||
out[index] = fn(x)
|
||||
|
||||
return _map
|
||||
|
||||
|
||||
def map(fn):
|
||||
"""
|
||||
Higher-order tensor map function ::
|
||||
|
||||
fn_map = map(fn)
|
||||
fn_map(a, out)
|
||||
out
|
||||
|
||||
Simple version::
|
||||
|
||||
for i:
|
||||
for j:
|
||||
out[i, j] = fn(a[i, j])
|
||||
|
||||
Broadcasted version (`a` might be smaller than `out`) ::
|
||||
|
||||
for i:
|
||||
for j:
|
||||
out[i, j] = fn(a[i, 0])
|
||||
|
||||
Args:
|
||||
fn: function from float-to-float to apply.
|
||||
a (:class:`TensorData`): tensor to map over
|
||||
out (:class:`TensorData`): optional, tensor data to fill in,
|
||||
should broadcast with `a`
|
||||
|
||||
Returns:
|
||||
:class:`TensorData` : new tensor data
|
||||
"""
|
||||
|
||||
f = tensor_map(fn)
|
||||
|
||||
def ret(a, out=None):
|
||||
if out is None:
|
||||
out = a.zeros(a.shape)
|
||||
f(*out.tuple(), *a.tuple())
|
||||
return out
|
||||
|
||||
return ret
|
||||
|
||||
|
||||
def tensor_zip(fn):
|
||||
"""
|
||||
Low-level implementation of tensor zip between
|
||||
tensors with *possibly different strides*.
|
||||
|
||||
Simple version:
|
||||
|
||||
* Fill in the `out` array by applying `fn` to each
|
||||
value of `a_storage` and `b_storage` assuming `out_shape`
|
||||
and `a_shape` are the same size.
|
||||
|
||||
Broadcasted version:
|
||||
|
||||
* Fill in the `out` array by applying `fn` to each
|
||||
value of `a_storage` and `b_storage` assuming `a_shape`
|
||||
and `b_shape` broadcast to `out_shape`.
|
||||
|
||||
Args:
|
||||
fn: function mapping two floats to float to apply
|
||||
out (array): storage for `out` tensor
|
||||
out_shape (array): shape for `out` tensor
|
||||
out_strides (array): strides for `out` tensor
|
||||
a_storage (array): storage for `a` tensor
|
||||
a_shape (array): shape for `a` tensor
|
||||
a_strides (array): strides for `a` tensor
|
||||
b_storage (array): storage for `b` tensor
|
||||
b_shape (array): shape for `b` tensor
|
||||
b_strides (array): strides for `b` tensor
|
||||
|
||||
Returns:
|
||||
None : Fills in `out`
|
||||
"""
|
||||
|
||||
def _zip(
|
||||
out,
|
||||
out_shape,
|
||||
out_strides,
|
||||
a_storage,
|
||||
a_shape,
|
||||
a_strides,
|
||||
b_storage,
|
||||
b_shape,
|
||||
b_strides,
|
||||
):
|
||||
out_index = np.zeros(len(out_shape), dtype=int)
|
||||
a_in = np.zeros(len(a_shape), dtype=int)
|
||||
b_in = np.zeros(len(b_shape), dtype=int)
|
||||
|
||||
for i in range(len(out)):
|
||||
to_index(i, out_shape, out_index)
|
||||
index = index_to_position(out_index, out_strides)
|
||||
|
||||
broadcast_index(out_index, out_shape, a_shape, a_in)
|
||||
a = a_storage[index_to_position(a_in, a_strides)]
|
||||
|
||||
broadcast_index(out_index, out_shape, b_shape, b_in)
|
||||
b = b_storage[index_to_position(b_in, b_strides)]
|
||||
|
||||
out[index] = fn(a, b)
|
||||
|
||||
return _zip
|
||||
|
||||
|
||||
def zip(fn):
|
||||
"""
|
||||
Higher-order tensor zip function ::
|
||||
|
||||
fn_zip = zip(fn)
|
||||
out = fn_zip(a, b)
|
||||
|
||||
Simple version ::
|
||||
|
||||
for i:
|
||||
for j:
|
||||
out[i, j] = fn(a[i, j], b[i, j])
|
||||
|
||||
Broadcasted version (`a` and `b` might be smaller than `out`) ::
|
||||
|
||||
for i:
|
||||
for j:
|
||||
out[i, j] = fn(a[i, 0], b[0, j])
|
||||
|
||||
|
||||
Args:
|
||||
fn: function from two floats-to-float to apply
|
||||
a (:class:`TensorData`): tensor to zip over
|
||||
b (:class:`TensorData`): tensor to zip over
|
||||
|
||||
Returns:
|
||||
:class:`TensorData` : new tensor data
|
||||
"""
|
||||
|
||||
f = tensor_zip(fn)
|
||||
|
||||
def ret(a, b):
|
||||
if a.shape != b.shape:
|
||||
c_shape = shape_broadcast(a.shape, b.shape)
|
||||
else:
|
||||
c_shape = a.shape
|
||||
out = a.zeros(c_shape)
|
||||
f(*out.tuple(), *a.tuple(), *b.tuple())
|
||||
return out
|
||||
|
||||
return ret
|
||||
|
||||
|
||||
def tensor_reduce(fn):
|
||||
"""
|
||||
Low-level implementation of tensor reduce.
|
||||
|
||||
* `out_shape` will be the same as `a_shape`
|
||||
except with `reduce_dim` turned to size `1`
|
||||
|
||||
Args:
|
||||
fn: reduction function mapping two floats to float
|
||||
out (array): storage for `out` tensor
|
||||
out_shape (array): shape for `out` tensor
|
||||
out_strides (array): strides for `out` tensor
|
||||
a_storage (array): storage for `a` tensor
|
||||
a_shape (array): shape for `a` tensor
|
||||
a_strides (array): strides for `a` tensor
|
||||
reduce_dim (int): dimension to reduce out
|
||||
|
||||
Returns:
|
||||
None : Fills in `out`
|
||||
"""
|
||||
|
||||
def _reduce(out, out_shape, out_strides, a_storage, a_shape, a_strides, reduce_dim):
|
||||
out_index = np.zeros(len(a_shape), dtype=int)
|
||||
for i in range(len(out)):
|
||||
to_index(i, out_shape, out_index)
|
||||
index = index_to_position(out_index, out_strides)
|
||||
|
||||
for j in range(a_shape[reduce_dim]):
|
||||
a_index = out_index.copy()
|
||||
a_index[reduce_dim] = j
|
||||
|
||||
out[index] = fn(
|
||||
a_storage[index_to_position(a_index, a_strides)], out[index]
|
||||
)
|
||||
|
||||
return _reduce
|
||||
|
||||
|
||||
def reduce(fn, start=0.0):
|
||||
"""
|
||||
Higher-order tensor reduce function. ::
|
||||
|
||||
fn_reduce = reduce(fn)
|
||||
out = fn_reduce(a, dim)
|
||||
|
||||
Simple version ::
|
||||
|
||||
for j:
|
||||
out[1, j] = start
|
||||
for i:
|
||||
out[1, j] = fn(out[1, j], a[i, j])
|
||||
|
||||
|
||||
Args:
|
||||
fn: function from two floats-to-float to apply
|
||||
a (:class:`TensorData`): tensor to reduce over
|
||||
dim (int): int of dim to reduce
|
||||
|
||||
Returns:
|
||||
:class:`TensorData` : new tensor
|
||||
"""
|
||||
|
||||
f = tensor_reduce(fn)
|
||||
|
||||
def ret(a, dim):
|
||||
out_shape = list(a.shape)
|
||||
out_shape[dim] = 1
|
||||
|
||||
# Other values when not sum.
|
||||
out = a.zeros(tuple(out_shape))
|
||||
out._tensor._storage[:] = start
|
||||
|
||||
f(*out.tuple(), *a.tuple(), dim)
|
||||
return out
|
||||
|
||||
return ret
|
||||
|
||||
|
||||
class TensorOps:
|
||||
map = map
|
||||
zip = zip
|
||||
reduce = reduce
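# Usage sketch (not part of the original file): the higher-order ops are
# curried, so a backend builds concrete tensor functions once and reuses them
# (assumes `from . import operators`):
#
#   neg_map = TensorOps.map(operators.neg)
#   add_reduce = TensorOps.reduce(operators.add, 0.0)
#   # neg_map(t) negates `t` elementwise; add_reduce(t, 0) sums out dimension 0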
|
||||
@@ -0,0 +1,192 @@
|
||||
import minitorch.operators as operators
|
||||
|
||||
|
||||
class MathTest:
|
||||
@staticmethod
|
||||
def neg(a):
|
||||
"Negate the argument"
|
||||
return -a
|
||||
|
||||
@staticmethod
|
||||
def addConstant(a):
|
||||
"Add contant to the argument"
|
||||
return 5 + a
|
||||
|
||||
@staticmethod
|
||||
def square(a):
|
||||
"Manual square"
|
||||
return a * a
|
||||
|
||||
@staticmethod
|
||||
def cube(a):
|
||||
"Manual cube"
|
||||
return a * a * a
|
||||
|
||||
@staticmethod
|
||||
def subConstant(a):
|
||||
"Subtract a constant from the argument"
|
||||
return a - 5
|
||||
|
||||
@staticmethod
|
||||
def multConstant(a):
|
||||
"Multiply a constant to the argument"
|
||||
return 5 * a
|
||||
|
||||
@staticmethod
|
||||
def div(a):
|
||||
"Divide by a constant"
|
||||
return a / 5
|
||||
|
||||
@staticmethod
|
||||
def inv(a):
|
||||
"Invert after adding"
|
||||
return operators.inv(a + 3.5)
|
||||
|
||||
@staticmethod
|
||||
def sig(a):
|
||||
"Apply sigmoid"
|
||||
return operators.sigmoid(a)
|
||||
|
||||
@staticmethod
|
||||
def log(a):
|
||||
"Apply log to a large value"
|
||||
return operators.log(a + 100000)
|
||||
|
||||
@staticmethod
|
||||
def relu(a):
|
||||
"Apply relu"
|
||||
return operators.relu(a + 5.5)
|
||||
|
||||
@staticmethod
|
||||
def exp(a):
|
||||
"Apply exp to a smaller value"
|
||||
return operators.exp(a - 200)
|
||||
|
||||
@staticmethod
|
||||
def explog(a):
|
||||
return operators.log(a + 100000) + operators.exp(a - 200)
|
||||
|
||||
@staticmethod
|
||||
def add2(a, b):
|
||||
"Add two arguments"
|
||||
return a + b
|
||||
|
||||
@staticmethod
|
||||
def mul2(a, b):
|
||||
"Mul two arguments"
|
||||
return a * b
|
||||
|
||||
@staticmethod
|
||||
def div2(a, b):
|
||||
"Divide two arguments"
|
||||
return a / (b + 5.5)
|
||||
|
||||
@staticmethod
|
||||
def gt2(a, b):
|
||||
return operators.lt(b, a + 1.2)
|
||||
|
||||
@staticmethod
|
||||
def lt2(a, b):
|
||||
return operators.lt(a + 1.2, b)
|
||||
|
||||
@staticmethod
|
||||
def eq2(a, b):
|
||||
return operators.eq(a, (b + 5.5))
|
||||
|
||||
@staticmethod
|
||||
def sum_red(a):
|
||||
return operators.sum(a)
|
||||
|
||||
@staticmethod
|
||||
def mean_red(a):
|
||||
return operators.sum(a) / float(len(a))
|
||||
|
||||
@staticmethod
|
||||
def mean_full_red(a):
|
||||
return operators.sum(a) / float(len(a))
|
||||
|
||||
@staticmethod
|
||||
def complex(a):
|
||||
return (
|
||||
operators.log(
|
||||
operators.sigmoid(
|
||||
operators.relu(operators.relu(a * 10 + 7) * 6 + 5) * 10
|
||||
)
|
||||
)
|
||||
/ 50
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def _tests(cls):
|
||||
"""
|
||||
Returns a list of all the math tests.
|
||||
"""
|
||||
one_arg = []
|
||||
two_arg = []
|
||||
red_arg = []
|
||||
for k in dir(MathTest):
|
||||
if callable(getattr(MathTest, k)) and not k.startswith("_"):
|
||||
base_fn = getattr(MathTest, k)
|
||||
scalar_fn = getattr(cls, k)
|
||||
tup = (k, base_fn, scalar_fn)
|
||||
if k.endswith("2"):
|
||||
two_arg.append(tup)
|
||||
elif k.endswith("red"):
|
||||
red_arg.append(tup)
|
||||
else:
|
||||
one_arg.append(tup)
|
||||
return one_arg, two_arg, red_arg
|
||||
|
||||
|
||||
class MathTestVariable(MathTest):
|
||||
@staticmethod
|
||||
def inv(a):
|
||||
return 1.0 / (a + 3.5)
|
||||
|
||||
@staticmethod
|
||||
def sig(x):
|
||||
return x.sigmoid()
|
||||
|
||||
@staticmethod
|
||||
def log(x):
|
||||
return (x + 100000).log()
|
||||
|
||||
@staticmethod
|
||||
def relu(x):
|
||||
return (x + 5.5).relu()
|
||||
|
||||
@staticmethod
|
||||
def exp(a):
|
||||
return (a - 200).exp()
|
||||
|
||||
@staticmethod
|
||||
def explog(a):
|
||||
return (a + 100000).log() + (a - 200).exp()
|
||||
|
||||
@staticmethod
|
||||
def sum_red(a):
|
||||
return a.sum(0)
|
||||
|
||||
@staticmethod
|
||||
def mean_red(a):
|
||||
return a.mean(0)
|
||||
|
||||
@staticmethod
|
||||
def mean_full_red(a):
|
||||
return a.mean()
|
||||
|
||||
@staticmethod
|
||||
def eq2(a, b):
|
||||
return a == (b + 5.5)
|
||||
|
||||
@staticmethod
|
||||
def gt2(a, b):
|
||||
return a + 1.2 > b
|
||||
|
||||
@staticmethod
|
||||
def lt2(a, b):
|
||||
return a + 1.2 < b
|
||||
|
||||
@staticmethod
|
||||
def complex(a):
|
||||
return (((a * 10 + 7).relu() * 6 + 5).relu() * 10).sigmoid().log() / 50