Question about OOP

  1. Can I put all the Taichi vars/vectors into a torch class (an nn.Module)?
  2. Does ti.classkernel work the same as a kernel defined outside a class?
  3. Is there something like ti.classfunc?

Take a look at this: https://taichi.readthedocs.io/en/latest/odop.html

  1. Yes
  2. Yes, with one exception: when you want to explicitly call its gradient version, it is X.method(__gradient=True) instead of X.method.grad(). If you use ti.Tape, you don’t need to worry about this.
  3. You can just use ti.func and pass in self as a parameter.

Update: starting v0.3.7, you don’t need to worry about (2). Simply decorate the class with @ti.data_oriented and call A.forward.grad().
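
As a rough sketch of the pattern in (2)/(3) and the update above (the class A, its fields x/y, and the square_of helper are made-up names for illustration; this assumes v0.3.7+):

import torch
import taichi as ti

ti.get_runtime().set_default_fp(ti.f32)
n = 8

@ti.func
def square_of(obj, i):
    # (3): a plain ti.func that takes self explicitly, in place of a "ti.classfunc"
    return obj.x[i] * obj.x[i]

@ti.data_oriented
class A:
    def __init__(self):
        self.x = ti.var(dt=ti.f32, shape=n, needs_grad=True)
        self.y = ti.var(dt=ti.f32, shape=n, needs_grad=True)

    @ti.classkernel
    def forward(self):
        for i in range(n):
            self.y[i] = square_of(self, i)

a = A()
a.x.fill(2.0)
a.forward()
a.y.grad.from_torch(torch.ones(n))   # seed the output gradient
a.forward.grad()                     # (2): the gradient version, v0.3.7+ style
print(a.x.grad.to_torch())           # dy_i/dx_i = 2 * x_i = 4.0 everywhere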

I tried @ti.data_oriented on a torch nn.Module class and it gives an error:

TypeError: super(type, obj): obj must be an instance or subtype of type

And I can no longer run the previous program. It says:

Runtime initialized.
{}
{}
{'__gradient': True}
Traceback (most recent call last):
  File "D:/reflex/src/learnable_p2g.py", line 146, in <module>
    test = gradcheck(linear, (pc, data), eps=1e-3, atol=1e-3)
  File "C:\Users\49446\AppData\Local\conda\conda\envs\py36\lib\site-packages\torch\autograd\gradcheck.py", line 279, in gradcheck
    analytical, reentrant, correct_grad_sizes = get_analytical_jacobian(tupled_inputs, o, nondet_tol=nondet_tol)
  File "C:\Users\49446\AppData\Local\conda\conda\envs\py36\lib\site-packages\torch\autograd\gradcheck.py", line 155, in get_analytical_jacobian
    retain_graph=True, allow_unused=True)
  File "C:\Users\49446\AppData\Local\conda\conda\envs\py36\lib\site-packages\torch\autograd\__init__.py", line 157, in grad
    inputs, allow_unused)
  File "C:\Users\49446\AppData\Local\conda\conda\envs\py36\lib\site-packages\torch\autograd\function.py", line 77, in apply
    return self._forward_cls.backward(self, *args)
  File "D:/reflex/src/learnable_p2g.py", line 29, in backward
    ctx.interp_kernel(__gradient=True)
  File "C:\Users\49446\AppData\Local\conda\conda\envs\py36\lib\site-packages\taichi\lang\kernel.py", line 359, in decorated
    primal(*args, **kwargs)
  File "C:\Users\49446\AppData\Local\conda\conda\envs\py36\lib\site-packages\taichi\lang\kernel.py", line 308, in __call__
    assert len(kwargs) == 0, 'kwargs not supported for Taichi kernels'
AssertionError: kwargs not supported for Taichi kernels

Process finished with exit code 1

Please see the change log: https://github.com/yuanming-hu/taichi#updates

Also could you share with me a script to reproduce the conflict between @ti.data_oriented and nn.Module? Thanks!

import torch
from torch.autograd import gradcheck
import torch.nn.functional as F
import math
import torch.nn as nn
import taichi as ti

ti.get_runtime().set_default_fp(ti.f32)
real = ti.f32

ti.data_oriented
class LinearFunction(torch.autograd.Function):
    @staticmethod
    def forward(ctx, input_data, weight_0, bias_0, weight_1, bias_1,
                ti_data, ti_weight_0, ti_bias_0, ti_weight_1, ti_bias_1, ti_output_0, ti_output_1, ti_kernel):
        ctx.ti_output_1 = ti_output_1
        ctx.ti_kernel = ti_kernel
        ctx.ti_data = ti_data
        ctx.ti_weight_0 = ti_weight_0
        ctx.ti_bias_0 = ti_bias_0
        ctx.ti_weight_1 = ti_weight_1
        ctx.ti_bias_1 = ti_bias_1
        ti_data.from_torch(input_data)
        ti_weight_0.from_torch(weight_0)
        ti_bias_0.from_torch(bias_0)
        ti_weight_1.from_torch(weight_1)
        ti_bias_1.from_torch(bias_1)
        ti_kernel()
        return ti_output_1.to_torch()

    @staticmethod
    def backward(ctx, grad_output_1):
        ti.clear_all_gradients()
        grad_input_data = grad_weight_0 = grad_bias_0 = grad_weight_1 = grad_bias_1 = None
        ctx.ti_output_1.grad.from_torch(grad_output_1)
        ctx.ti_kernel.grad()

        if ctx.needs_input_grad[0]:
            grad_input_data = ctx.ti_data.grad.to_torch()
        if ctx.needs_input_grad[1]:
            grad_weight_0 = ctx.ti_weight_0.grad.to_torch()
        if ctx.needs_input_grad[2]:
            grad_bias_0 = ctx.ti_bias_0.grad.to_torch()
        if ctx.needs_input_grad[3]:
            grad_weight_1 = ctx.ti_weight_1.grad.to_torch()
        if ctx.needs_input_grad[4]:
            grad_bias_1 = ctx.ti_bias_1.grad.to_torch()

        return grad_input_data, grad_weight_0, grad_bias_0, grad_weight_1, grad_bias_1,\
               None, None, None, None, None, None, None, None

ti.data_oriented
class Linear(nn.Module):
    def __init__(self, input_feature, hidden_feature, output_feature):
        super(Linear, self).__init__()
        # taichi parameter holders
        self.ti_data = ti.var(dt=real, shape=(batch_size, input_feature), needs_grad=True)
        self.ti_weight_0 = ti.var(dt=real, shape=(input_feature, hidden_feature), needs_grad=True)
        self.ti_bias_0 = ti.var(dt=real, shape=hidden_feature, needs_grad=True)
        self.ti_output_0 = ti.var(dt=real, shape=(batch_size, hidden_feature), needs_grad=True)
        self.ti_weight_1 = ti.var(dt=real, shape=(hidden_feature, out_feature), needs_grad=True)
        self.ti_bias_1 = ti.var(dt=real, shape=out_feature, needs_grad=True)
        self.ti_output_1 = ti.var(dt=real, shape=(batch_size, out_feature), needs_grad=True)
        # torch parameters
        self.weight_0 = nn.Parameter(torch.Tensor(input_feature, hidden_feature))
        self.bias_0 = nn.Parameter(torch.Tensor(hidden_feature))
        self.weight_1 = nn.Parameter(torch.Tensor(hidden_feature, out_feature))
        self.bias_1 = nn.Parameter(torch.Tensor(out_feature))
        self.weight_0.data.normal_(0, math.sqrt(2. / hidden_feature / input_feature))
        self.weight_1.data.normal_(0, math.sqrt(2. / hidden_feature / output_feature))

    @ti.classkernel
    def linear_kernel(self):
        for i in range(batch_size):
            for j in ti.static(range(hidden_feature)):
                dummy = 0.0
                for k in range(input_feature):
                    dummy += self.ti_data[i, k] * self.ti_weight_0[k, j]
                dummy += self.ti_bias_0[j]
                self.ti_output_0[i, j] = ti.max(dummy, 0)
            for j in ti.static(range(out_feature)):
                dummy = 0.0
                for k in range(hidden_feature):
                    dummy += self.ti_output_0[i, k] * self.ti_weight_1[k, j]
                dummy += self.ti_bias_1[j]
                self.ti_output_1[i, j] = dummy

    def forward(self, input_data):
        return LinearFunction.apply(input_data, self.weight_0, self.bias_0, self.weight_1, self.bias_1,
                                    self.ti_data, self.ti_weight_0, self.ti_bias_0, self.ti_weight_1, self.ti_bias_1,
                                    self.ti_output_0, self.ti_output_1, self.linear_kernel)


if __name__ == '__main__':
    batch_size = 1
    input_feature = 4
    hidden_feature = 8
    out_feature = 2
    data = torch.rand(batch_size, input_feature, dtype=torch.float32, requires_grad=True)
    linear = Linear(input_feature, hidden_feature, out_feature)

    test = gradcheck(linear, data, eps=1e-3, atol=1e-4)
    print(test)

Thanks! I’m looking into the incompatibility. Two issues in this script:

  • In Python 3 you can use super() without arguments
  • You need an @ before the decorator (see the sketch below)
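
In other words, the class header should look roughly like this (just a skeleton showing the two fixes applied to the Linear class above):

import torch.nn as nn
import taichi as ti

@ti.data_oriented                 # note the leading @
class Linear(nn.Module):
    def __init__(self, input_feature, hidden_feature, output_feature):
        super().__init__()        # Python 3: no arguments needed
        # field and parameter setup as in the script above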

Oh, I forgot the @! However, it gives another error:

    ctx.ti_kernel.grad()
AttributeError: 'function' object has no attribute 'grad'

Maybe the issue is that the function that needs grad is not a class method?

Yeah sorry about that. nn.Module has some other issues with @ti.data_oriented. I fixed that in v0.3.8 and the new version will be released in 20 minutes.

It may take 10 more minutes for v0.3.8 to build: https://ci.appveyor.com/project/IteratorAdvance/taichi/builds/29730350

Btw, this way of writing the NN does not really scale to larger networks. Here’s a scalable version:

import torch
from torch.autograd import gradcheck
import torch.nn.functional as F
import math
import torch.nn as nn
import taichi as ti

ti.get_runtime().set_default_fp(ti.f32)
real = ti.f32

class LinearFunction(torch.autograd.Function):
    @staticmethod
    def forward(ctx, input_data, weight_0, bias_0, weight_1, bias_1,
                ti_data, ti_weight_0, ti_bias_0, ti_weight_1, ti_bias_1, ti_output_0, ti_output_1, ti_fc_0, ti_fc_1, ti_kernel):
        ctx.ti_output_1 = ti_output_1
        ctx.ti_kernel = ti_kernel
        ctx.ti_data = ti_data
        ctx.ti_weight_0 = ti_weight_0
        ctx.ti_bias_0 = ti_bias_0
        ctx.ti_weight_1 = ti_weight_1
        ctx.ti_bias_1 = ti_bias_1
        ctx.ti_fc_0 = ti_fc_0
        ctx.ti_fc_1 = ti_fc_1
        ti_data.from_torch(input_data)
        ti_weight_0.from_torch(weight_0)
        ti_bias_0.from_torch(bias_0)
        ti_weight_1.from_torch(weight_1)
        ti_bias_1.from_torch(bias_1)
        ti_fc_0.fill(0)
        ti_fc_1.fill(0)
        ti_kernel()
        return ti_output_1.to_torch()

    @staticmethod
    def backward(ctx, grad_output_1):
        ti.clear_all_gradients()
        grad_input_data = grad_weight_0 = grad_bias_0 = grad_weight_1 = grad_bias_1 = None
        ctx.ti_output_1.grad.from_torch(grad_output_1)
        ctx.ti_kernel.grad()

        if ctx.needs_input_grad[0]:
            grad_input_data = ctx.ti_data.grad.to_torch()
        if ctx.needs_input_grad[1]:
            grad_weight_0 = ctx.ti_weight_0.grad.to_torch()
        if ctx.needs_input_grad[2]:
            grad_bias_0 = ctx.ti_bias_0.grad.to_torch()
        if ctx.needs_input_grad[3]:
            grad_weight_1 = ctx.ti_weight_1.grad.to_torch()
        if ctx.needs_input_grad[4]:
            grad_bias_1 = ctx.ti_bias_1.grad.to_torch()

        return grad_input_data, grad_weight_0, grad_bias_0, grad_weight_1, grad_bias_1,\
               None, None, None, None, None, None, None, None, None, None

@ti.data_oriented
class Linear(nn.Module):
    def __init__(self, input_feature, hidden_feature, output_feature):
        super().__init__()
        # taichi parameter holders
        self.ti_data = ti.var(dt=real, shape=(batch_size, input_feature), needs_grad=True)
        self.ti_weight_0 = ti.var(dt=real, shape=(input_feature, hidden_feature), needs_grad=True)
        self.ti_bias_0 = ti.var(dt=real, shape=hidden_feature, needs_grad=True)
        self.ti_fc_0 = ti.var(dt=real, shape=(batch_size, hidden_feature), needs_grad=True)
        self.ti_output_0 = ti.var(dt=real, shape=(batch_size, hidden_feature), needs_grad=True)
        self.ti_weight_1 = ti.var(dt=real, shape=(hidden_feature, out_feature), needs_grad=True)
        self.ti_bias_1 = ti.var(dt=real, shape=out_feature, needs_grad=True)
        self.ti_fc_1 = ti.var(dt=real, shape=(batch_size, out_feature), needs_grad=True)
        self.ti_output_1 = ti.var(dt=real, shape=(batch_size, out_feature), needs_grad=True)
        # torch parameters
        self.weight_0 = nn.Parameter(torch.Tensor(input_feature, hidden_feature))
        self.bias_0 = nn.Parameter(torch.Tensor(hidden_feature))
        self.weight_1 = nn.Parameter(torch.Tensor(hidden_feature, out_feature))
        self.bias_1 = nn.Parameter(torch.Tensor(out_feature))
        self.weight_0.data.normal_(0, math.sqrt(2. / hidden_feature / input_feature))
        self.weight_1.data.normal_(0, math.sqrt(2. / hidden_feature / output_feature))

    @ti.classkernel
    def linear_kernel(self):
        for i in range(batch_size):
            for j in range(hidden_feature):
                for k in range(input_feature):
                    self.ti_fc_0[i, j] += self.ti_data[i, k] * self.ti_weight_0[k, j]

        for i in range(batch_size):
            for j in range(hidden_feature):
                self.ti_output_0[i, j] = ti.max(self.ti_fc_0[i, j] + self.ti_bias_0[j], 0)

        for i in range(batch_size):
            for j in range(out_feature):
                for k in range(hidden_feature):
                    self.ti_fc_1[i, j] += self.ti_output_0[i, k] * self.ti_weight_1[k, j]

        for i in range(batch_size):
            for j in range(out_feature):
                self.ti_output_1[i, j] = self.ti_fc_1[i, j] + self.ti_bias_1[j]

    def forward(self, input_data):
        return LinearFunction.apply(input_data, self.weight_0, self.bias_0, self.weight_1, self.bias_1,
                                    self.ti_data, self.ti_weight_0, self.ti_bias_0, self.ti_weight_1, self.ti_bias_1,
                                    self.ti_output_0, self.ti_output_1, self.ti_fc_0, self.ti_fc_1, self.linear_kernel)


if __name__ == '__main__':
    batch_size = 4
    input_feature = 32
    hidden_feature = 64
    out_feature = 128
    data = torch.rand(batch_size, input_feature, dtype=torch.float32, requires_grad=True)
    linear = Linear(input_feature, hidden_feature, out_feature)

    test = gradcheck(linear, data, eps=1e-3, atol=1e-3)
    print(test)

One thing to keep in mind: differentiating mutable local variables is really tricky (and you need messy ti.static to make it work, which doesn’t scale). Accumulate to global variables instead.
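
For instance, here is a tiny self-contained sketch of that advice (the names n, x, and total are made up): the sum is accumulated directly into a global variable, so no mutable local accumulator and no ti.static unrolling are needed.

import torch
import taichi as ti

ti.get_runtime().set_default_fp(ti.f32)
n = 16
x = ti.var(dt=ti.f32, shape=n, needs_grad=True)
total = ti.var(dt=ti.f32, shape=1, needs_grad=True)   # global accumulator

@ti.kernel
def reduce():
    for i in range(n):
        total[0] += x[i] * x[i]   # accumulate into a global, not a mutable local

x.fill(1.0)
reduce()
total.grad.from_torch(torch.ones(1))   # seed the upstream gradient
reduce.grad()
print(x.grad.to_torch())               # d(sum_k x_k^2)/dx_i = 2 * x_i = 2.0 everywhere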

Please also check out the kernel simplicity rule: https://taichi.readthedocs.io/en/latest/differentiable_programming.html

I just want to confirm two things:

  1. Can there be multiple outer loops?
  2. Once I use ti.static, does the loop no longer count as a loop?

Good questions.

  1. Yes.
  2. With ti.static, the for loop is unrolled at compile time, so it’s no longer a loop at runtime.
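
To illustrate (2) with a tiny sketch (the field x and the sizes are arbitrary): the loop over i below is a real runtime loop, while the ti.static loop over j is unrolled at compile time.

import taichi as ti

n = 4
x = ti.var(dt=ti.i32, shape=(n, n))

@ti.kernel
def fill_x():
    for i in range(n):                  # a real (runtime) loop
        for j in ti.static(range(n)):   # unrolled at compile time; no longer a loop
            x[i, j] = i * n + j

fill_x()

After unrolling, the body of fill_x is equivalent to writing the four x[i, j] assignments by hand, which is why it doesn’t count as a loop for the simplicity rule.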