import math
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
def f(x):
    return 3*x**2 - 4*x + 5
f(3.0)
20.0
f For Different Values Of x
xs = np.arange(-5, 5, 0.25)
ys = f(xs)
plt.plot(xs, ys)
[<matplotlib.lines.Line2D at 0x7f1647b75ac0>]
A derivative lets you figure out how fast something is changing by looking at how it changes over a really, really tiny interval. It's like taking a super close-up picture of a function and seeing how it changes between two nearby points.
from IPython.display import display, Math
derivative = r"\frac{d}{dx} f(x) = \lim_{h \to 0} \frac{f(x + h) - f(x)}{h}"
display(Math(derivative))
h = 0.000001
x = 2/3
(f(x + h) - f(x))/h
2.999378523327323e-06
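The slope comes out essentially zero because x = 2/3 is exactly where f'(x) = 6x - 4 vanishes. As a quick sanity check (an extra cell, not in the original notebook), we can compare the numerical estimate against the analytic derivative at a few points:
# sanity check: numerical estimate vs the analytic derivative f'(x) = 6x - 4
def f_prime(x):
    return 6*x - 4

for x in [-2.0, 0.0, 2/3, 3.0]:
    numerical = (f(x + h) - f(x)) / h
    print(f"x={x:+.4f}  numerical={numerical:+.6f}  analytic={f_prime(x):+.6f}")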
def d(a, b, c):
    return a*b + c
h = 0.0001
# inputs
a = 2.0
b = -3.0
c = 10.0
d1 = d(a, b, c)
# find the derivative of d function wrt c by increasing c by a tiny amount
c += h
d2 = d(a, b, c)
print('d1', d1)
print('d2', d2)
print('slope', (d2 - d1)/h)
d1 4.0
d2 4.0001
slope 0.9999999999976694
h = 0.0001
# inputs
a = 2.0
b = -3.0
c = 10.0
d1 = d(a, b, c)
# find the derivative of d function wrt b by increasing b by a tiny amount
b += h
d2 = d(a, b, c)
print('d1', d1)
print('d2', d2)
print('slope', (d2 - d1)/h)
d1 4.0
d2 4.0002
slope 2.0000000000042206
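By the same recipe (an extra cell, not in the original), nudging a should give a slope of b = -3.0, since d = a*b + c:
h = 0.0001
# inputs
a, b, c = 2.0, -3.0, 10.0
d1 = d(a, b, c)
# find the derivative of d wrt a by increasing a by a tiny amount
a += h
d2 = d(a, b, c)
print('slope', (d2 - d1)/h)  # expect b = -3.0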
class Value:

    def __init__(self, data, _children=(), _op='', label=''):
        self.data = data
        self.grad = 0.0  # derivative of the output (e.g. the loss L) with respect to this value
        self._backward = lambda: None
        self._prev = set(_children)
        self._op = _op
        self.label = label

    def __repr__(self):
        return f"Value(data={self.data})"

    def __add__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data + other.data, (self, other), '+')

        def _backward():
            self.grad += out.grad
            other.grad += out.grad
        out._backward = _backward

        return out

    def __mul__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data * other.data, (self, other), '*')

        def _backward():
            self.grad += other.data * out.grad
            other.grad += self.data * out.grad
        out._backward = _backward

        return out

    def __pow__(self, other):
        assert isinstance(other, (int, float)), "only supporting int/float powers for now"
        out = Value(self.data**other, (self,), f'**{other}')

        def _backward():
            self.grad += (other * self.data**(other-1)) * out.grad
        out._backward = _backward

        return out

    def tanh(self):
        x = self.data
        t = (math.exp(2 * x) - 1) / (math.exp(2 * x) + 1)
        out = Value(t, (self,), 'tanh')

        def _backward():
            self.grad += (1 - t ** 2) * out.grad
        out._backward = _backward

        return out

    def relu(self):
        out = Value(0 if self.data < 0 else self.data, (self,), 'ReLU')

        def _backward():
            self.grad += (out.data > 0) * out.grad
        out._backward = _backward

        return out
    def exp(self):
        x = self.data
        out = Value(math.exp(x), (self,), 'exp')

        def _backward():
            self.grad += out.data * out.grad  # NOTE: in the video I incorrectly used = instead of +=. Fixed here.
        out._backward = _backward

        return out

    def backward(self):
        # topological order all of the children in the graph
        topo = []
        visited = set()
        def build_topo(v):
            if v not in visited:
                visited.add(v)
                for child in v._prev:
                    build_topo(child)
                topo.append(v)
        build_topo(self)

        # go one variable at a time and apply the chain rule to get its gradient
        self.grad = 1
        for v in reversed(topo):
            v._backward()

    def __neg__(self): # -self
        return self * -1

    def __radd__(self, other): # other + self
        return self + other

    def __sub__(self, other): # self - other
        return self + (-other)

    def __rsub__(self, other): # other - self
        return other + (-self)

    def __rmul__(self, other): # other * self
        return self * other

    def __truediv__(self, other): # self / other
        return self * other**-1

    def __rtruediv__(self, other): # other / self
        return other * self**-1
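A quick smoke test (an extra cell, not in the original) to confirm the arithmetic dunders behave as expected:
# smoke test: addition, subtraction and division all route through the Value ops above
u = Value(2.0)
v = Value(4.0)
print(u + v)    # Value(data=6.0)
print(u - 1)    # __sub__ via __add__ and __neg__ -> Value(data=1.0)
print(1 - u)    # __rsub__ -> Value(data=-1.0)
print(u / v)    # __truediv__ via __pow__ -> Value(data=0.5)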
a = Value(2.0, label='a')
b = Value(-3.0, label='b')
c = Value(10.0, label='c')
e = a*b; e.label = 'e'
d = e + c; d.label = 'd'
f = Value(-2.0, label='f')
L = d * f; L.label = 'L'
L
Value(data=-8.0)
from graphviz import Digraph

def trace(root):
    # builds a set of all nodes and edges in a graph
    nodes, edges = set(), set()
    def build(v):
        if v not in nodes:
            nodes.add(v)
            for child in v._prev:
                edges.add((child, v))
                build(child)
    build(root)
    return nodes, edges

def draw_dot(root):
    dot = Digraph(format='svg', graph_attr={'rankdir': 'LR'})  # LR = left to right

    nodes, edges = trace(root)
    for n in nodes:
        uid = str(id(n))
        # for any value in the graph, create a rectangular ('record') node for it
        dot.node(name=uid, label="{ %s | data %.4f | grad %.4f }" % (n.label, n.data, n.grad), shape='record')
        if n._op:
            # if this value is a result of some operation, create an op node for it
            dot.node(name=uid + n._op, label=n._op)
            # and connect this node to it
            dot.edge(uid + n._op, uid)

    for n1, n2 in edges:
        # connect n1 to the op node of n2
        dot.edge(str(id(n1)), str(id(n2)) + n2._op)

    return dot
draw_dot(L)
We start here at the end and go in reverse, calculating the gradient along all the intermediate values. What we're computing for every single value is the derivative of L with respect to that node.

The derivative of L with respect to L is just 1, and then we derive the derivative of L with respect to f, with respect to d, with respect to c, with respect to e, with respect to b, and with respect to a. In the neural network setting you'd be very interested in the derivative of the loss function L with respect to the weights of the neural network. Here, of course, we just have the variables a, b, c and f, but some of these will eventually represent the weights of a neural net, so we'll need to know how those weights are impacting the loss function.

We're interested, basically, in the derivative of the output with respect to some of its leaf nodes. Those leaf nodes will be the weights of the neural net, and the other leaf nodes will be the data itself. We usually won't want or use the derivative of the loss function with respect to the data, because the data is fixed, whereas the weights will be iterated on using the gradient information.
def lol():
    h = 0.001

    a = Value(2.0, label='a')
    b = Value(-3.0, label='b')
    c = Value(10.0, label='c')
    e = a * b; e.label = 'e'
    d = e + c; d.label = 'd'
    f = Value(-2.0, label='f')
    L = d * f; L.label = 'L'
    L1 = L.data

    a = Value(2.0, label='a')
    b = Value(-3.0, label='b')
    b.data += h
    c = Value(10.0, label='c')
    e = a * b; e.label = 'e'
    d = e + c; d.label = 'd'
    f = Value(-2.0, label='f')
    L = d * f; L.label = 'L'
    L2 = L.data

    print((L2 - L1) / h)

lol()
-3.9999999999995595
display(Math(r"L = d*f"))
display(Math(r"\frac {dL} {dL} = 1"))
display(Math(r"\frac {dL} {df} = \frac {(d+h)*f - (d*f)} {h} = d"))
display(Math(r"\frac {dL} {dd} = f"))
L.grad = 1.0
f.grad = d.data
d.grad = f.data
A gradient check verifies the gradients we derive with backpropagation (the derivatives with respect to all the intermediate results) by comparing them against a numerical gradient, which just estimates each derivative using a small step size.
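Here is a minimal sketch of such a check (an extra cell, not in the original). It rebuilds the expression graph fresh inside a function so it doesn't disturb the surrounding cells, then compares the backpropagated dL/db against a numerical estimate:
def grad_check_b(h=0.001):
    # build the graph fresh so we don't disturb the surrounding cells
    a = Value(2.0); b = Value(-3.0); c = Value(10.0); f = Value(-2.0)
    L = (a * b + c) * f
    L.backward()
    # numerical estimate: nudge b and redo the forward pass with plain floats
    L2 = (a.data * (b.data + h) + c.data) * f.data
    print('backprop  dL/db =', b.grad)            # expect a * f = -4.0
    print('numerical dL/db ~', (L2 - L.data) / h)

grad_check_b()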
Now we're getting to the crux of backpropagation. This will be the most important node to understand, because if you understand the gradient for this node, you understand all of backpropagation and all of training of neural nets, basically.

We need to derive dL/dc, in other words the derivative of L with respect to c. How is L sensitive to c: if we wiggle c, how does that impact L?

We know how c impacts d, so, very intuitively, if we know the impact c is having on d and the impact d is having on L, we can calculate the impact of c on L.
display(Math(r"d = e+c"))
display(Math(r"\frac {dd} {dc} = \frac {(c+h) + e - (c+e)} {h} = 1.0"))
display(Math(r"\frac {dL} {de} = 1.0"))
So we know how d impacts L, and now we know how c and e impact d. How do we put that information together to get dL/dc? The answer, of course, is the chain rule from calculus.
If a variable z depends on the variable y, which itself depends on the variable x (that is, y and z are dependent variables), then z depends on x as well, via the intermediate variable y. In this case, the chain rule is expressed as:
display(Math(r"\frac {dz} {dx} = \frac {dz} {dy} * \frac {dy} {dx}"))
What it means for us is really exactly the same thing:
display(Math(r"\frac {dL} {dc} = \frac {dL} {dd} * \frac {dd} {dc}"))
c.grad = d.grad * 1
e.grad = d.grad * 1
Essentially the + node just directly routes the gradients back.
display(Math(r"e = a * b"))
display(Math(r"\frac {dL} {da} = \frac {dL} {de} * \frac {de} {da}"))
display(Math(r"\frac {dL} {db} = \frac {dL} {de} * \frac {de} {db}"))
a.grad = e.grad * b.data
b.grad = e.grad * a.data
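With every grad now filled in manually, we can redraw the graph to see data and grad side by side for each node (an extra cell; draw_dot was defined above):
draw_dot(L)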
That's what backpropagation is: just a recursive application of the chain rule backwards through the computation graph.
Let's see this power in action just very briefly. What we're going to do is nudge our inputs to try to make L go up.
a.data += 0.01 * a.grad
b.data += 0.01 * b.grad
c.data += 0.01 * c.grad
f.data += 0.01 * f.grad
# run the forward pass again
e = a * b
d = e + c
L = d * f
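Since every leaf moved a small step along its own gradient, rerunning the forward pass should give a slightly larger L than the previous -8.0 (a small check cell, not in the original):
print(L.data)  # should be a bit above -8.0, since we stepped in the direction that increases L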
from IPython.display import Image  # avoid `from IPython import display`, which would shadow the display() function used above
Image("https://cs231n.github.io/assets/nn1/neuron_model.jpeg")
So we want to eventually build up neural networks, and in the simplest case these are multilayer perceptrons. This is a two-layer neural net: it's got hidden layers made up of neurons, and these neurons are fully connected to each other. Biological neurons are very complicated devices, but we have very simple mathematical models of a neuron: you have some inputs x, and then you have synapses that have weights on them, the w's.

The synapses interact with the inputs to this neuron multiplicatively, so what flows to the cell body is w times x, and since there are multiple inputs there are many w times x's flowing into the cell body. The cell body also has some bias; this is the innate trigger happiness of the neuron, so the bias can make it a bit more trigger happy or a bit less trigger happy regardless of the input.

Basically we're taking all the w times x of all the inputs, adding the bias, and then taking it through an activation function, which is usually some kind of squashing function like a sigmoid or tanh.
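Putting that paragraph into the same formula style used above (an extra cell), the neuron computes:
display(Math(r"o = \tanh\left(\sum_i w_i x_i + b\right)"))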
plt.plot(np.arange(-5,5,0.2), np.tanh(np.arange(-5,5,0.2))); plt.grid();
# inputs x1,x2
x1 = Value(2.0, label='x1')
x2 = Value(0.0, label='x2')
# weights w1,w2
w1 = Value(-3.0, label='w1')
w2 = Value(1.0, label='w2')
# bias of the neuron
b = Value(6.8813735870195432, label='b')
# x1*w1 + x2*w2 + b
x1w1 = x1*w1; x1w1.label = 'x1*w1'
x2w2 = x2*w2; x2w2.label = 'x2*w2'
x1w1x2w2 = x1w1 + x2w2; x1w1x2w2.label = 'x1*w1 + x2*w2'
n = x1w1x2w2 + b; n.label = 'n'
o = n.tanh(); o.label = 'o'
draw_dot(o)
display(Math(r"o = tanh(n)"))
display(Math(r"\frac {do} {dn} = 1 - tanh(n)^2"))
display(Math(r"\frac {do} {dn} = 1 - o^2"))
Let's get rid of doing backpropagation manually by implementing the backward function for the whole expression graph. We have to make sure that the backward function of a node calculates its gradients only after the gradients of all the nodes ahead of it in the graph have been calculated. To do this we'll do a topological sort of the graph.
topo = []
visited = set()
def build_topo(v):
    if v not in visited:
        visited.add(v)
        for child in v._prev:
            build_topo(child)
        topo.append(v)
build_topo(o)
topo
o.backward()
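As a check (an extra cell, not in the original), the gradient that backward() just filled in for n should match the analytic 1 - o² derived above:
print(n.grad)         # from backward()
print(1 - o.data**2)  # analytic do/dn; the two should agree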
# inputs x1,x2
x1 = Value(2.0, label='x1')
x2 = Value(0.0, label='x2')
# weights w1,w2
w1 = Value(-3.0, label='w1')
w2 = Value(1.0, label='w2')
# bias of the neuron
b = Value(6.8813735870195432, label='b')
# x1*w1 + x2*w2 + b
x1w1 = x1*w1; x1w1.label = 'x1*w1'
x2w2 = x2*w2; x2w2.label = 'x2*w2'
x1w1x2w2 = x1w1 + x2w2; x1w1x2w2.label = 'x1*w1 + x2*w2'
n = x1w1x2w2 + b; n.label = 'n'
# ----
e = (2*n).exp()
o = (e - 1) / (e + 1)
# ----
o.label = 'o'
o.backward()
draw_dot(o)
import torch
x1 = torch.Tensor([2.0]).double() ; x1.requires_grad = True
x2 = torch.Tensor([0.0]).double() ; x2.requires_grad = True
w1 = torch.Tensor([-3.0]).double() ; w1.requires_grad = True
w2 = torch.Tensor([1.0]).double() ; w2.requires_grad = True
b = torch.Tensor([6.8813735870195432]).double() ; b.requires_grad = True
n = x1*w1 + x2*w2 + b
o = torch.tanh(n)
print(o.data.item())
o.backward()
print('---')
print('x2', x2.grad.item())
print('w2', w2.grad.item())
print('x1', x1.grad.item())
print('w1', w1.grad.item())
0.7071066904050358
---
x2 0.5000001283844369
w2 0.0
x1 -1.5000003851533106
w1 1.0000002567688737
So basically torch can do what we did, which is a special case where your tensors are all single-element tensors. The big deal with PyTorch is that everything is significantly more efficient, because we are working with tensor objects and we can do lots of operations in parallel on all of these tensors. But otherwise what we've built very much agrees with the API of PyTorch.
import random

class Module:

    def zero_grad(self):
        for p in self.parameters():
            p.grad = 0

    def parameters(self):
        return []

class Neuron(Module):

    def __init__(self, nin, nonlin=True):  # nin is the number of inputs to this neuron, our example has three inputs
        self.w = [Value(random.uniform(-1,1)) for _ in range(nin)]  # w is the weights for these inputs
        self.b = Value(0)  # bias controls the overall trigger happiness of this neuron
        self.nonlin = nonlin

    def __call__(self, x):
        act = sum((wi*xi for wi,xi in zip(self.w, x)), self.b)
        return act.relu() if self.nonlin else act

    def parameters(self):
        return self.w + [self.b]

    def __repr__(self):
        return f"{'ReLU' if self.nonlin else 'Linear'}Neuron({len(self.w)})"

class Layer(Module):  # a Layer is just a list of neurons

    def __init__(self, nin, nout, **kwargs):  # nout is the number of neurons in a Layer
        self.neurons = [Neuron(nin, **kwargs) for _ in range(nout)]

    def __call__(self, x):
        out = [n(x) for n in self.neurons]
        return out[0] if len(out) == 1 else out

    def parameters(self):
        return [p for n in self.neurons for p in n.parameters()]

    def __repr__(self):
        return f"Layer of [{', '.join(str(n) for n in self.neurons)}]"

class MLP(Module):  # Layers feed into an MLP sequentially

    def __init__(self, nin, nouts):  # nouts is a list of the sizes of all the Layers we want in our MLP
        sz = [nin] + nouts
        self.layers = [Layer(sz[i], sz[i+1], nonlin=i!=len(nouts)-1) for i in range(len(nouts))]

    def __call__(self, x):
        for layer in self.layers:
            x = layer(x)
        return x

    def parameters(self):
        return [p for layer in self.layers for p in layer.parameters()]

    def __repr__(self):
        return f"MLP of [{', '.join(str(layer) for layer in self.layers)}]"
x = [2.0, 3.0, -1.0]
n = MLP(3, [4, 4, 1])
n(x)
Value(data=0.5175318980563276)
draw_dot(n(x))
xs = [
[2.0, 3.0, -1.0],
[3.0, -1.0, 0.5],
[0.5, 1.0, 1.0],
[1.0, 1.0, -1.0],
]
ys = [1.0, -1.0, -1.0, 1.0] # desired targets
ypred = [n(x) for x in xs]
ypred
[Value(data=0.5175318980563276), Value(data=0.5035516656057866), Value(data=0.1992187635752545), Value(data=0.19200404755000194)]
The first prediction is about 0.52 but we'd like it to be 1.0, so we should push it higher, and similarly we'd want to adjust the remaining predictions. How do we tune the weights of the neural net to better predict the desired targets?
The trick used in deep learning to achieve this is to calculate a single number that somehow measures the total performance of your neural net, and we call this single number the loss.

The loss is a single number that measures how well the neural net is performing. Right now we have the intuitive sense that it's not performing very well, so the loss will be high, and we'll want to minimize it. In this particular case we're going to implement the mean squared error loss.
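In the same formula style as above (an extra cell): as the next code cell implements it, the loss is the sum of squared errors over the examples:
display(Math(r"\text{loss} = \sum_i (\hat{y}_i - y_i)^2"))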
loss = sum((yout - ygt) ** 2 for ygt, yout in zip(ys, ypred))
loss
Value(data=4.584426182625607)
loss.backward()
draw_dot(loss)
The gradients on the input data are not that useful to us, because the input data is fixed.

In the gradient descent scheme we think of the gradient as a vector pointing in the direction of increased loss, so in gradient descent we modify p.data by a small step size against the direction of the gradient; the step size, as an example, could be a very small number like 0.01.
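Written as a formula (an extra cell), the update applied to every parameter p in the loop below uses step size 0.01 and a minus sign because we want the loss to go down:
display(Math(r"p \leftarrow p - 0.01 \cdot \frac{\partial L}{\partial p}"))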
num_cycles = 20
for k in range(num_cycles):

    # forward pass
    ypred = [n(x) for x in xs]
    loss = sum((yout - ygt) ** 2 for ygt, yout in zip(ys, ypred))

    # backward pass
    for p in n.parameters():
        # one of the most common bugs while developing neural nets:
        # we need to reset these gradients to zero so that the backward pass starts from zero and accumulates into them
        p.grad = 0.0
    loss.backward()  # populates the loss derivatives into the grads

    # update
    for p in n.parameters():
        p.data += -0.01 * p.grad

    print(k, loss.data)
0 0.11832011527709294
1 0.11417681052240584
2 0.11027143730563282
3 0.106269781481659
4 0.10244766346102194
5 0.09865353670187882
6 0.09492791181761162
7 0.09134004461101422
8 0.08772623971249013
9 0.08433593022783775
10 0.08088983621120385
11 0.07761585022930426
12 0.07438070404238775
13 0.07123154090030628
14 0.06820558197668769
15 0.06518795943070937
16 0.06236848949480526
17 0.05951865866702462
18 0.05683920451412121
19 0.054193312325887685
ypred
[Value(data=1.1280609047338137), Value(data=-0.9414863019548612), Value(data=-0.9727950652871139), Value(data=0.8166158249112135)]