Question about OOP

  1. Can I put all the Taichi vars/vectors into a torch class (an nn.Module)?
  2. Does ti.classkernel work the same as a kernel defined outside a class?
  3. Is there something like ti.classfunc?

Take a look at this: https://taichi.readthedocs.io/en/latest/odop.html

  1. Yes
  2. Yes, with one exception: when you want to explicitly call its gradient version, it is X.method(__gradient=True) instead of X.method.grad(). If you use ti.Tape, you don’t need to worry about this.
  3. You can just use ti.func and pass in self as a parameter.

Update: starting v0.3.7, you don’t need to worry about (2). Simply decorate the class with @ti.data_oriented and call A.forward.grad().
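
As a rough sketch of the pattern in (2)/(3) and the update above (the class A, its fields x/y, and the square_of helper are made-up names for illustration; this assumes v0.3.7+):

import torch
import taichi as ti

ti.get_runtime().set_default_fp(ti.f32)
n = 8

@ti.func
def square_of(obj, i):
    # (3): a plain ti.func that takes self explicitly, in place of a "ti.classfunc"
    return obj.x[i] * obj.x[i]

@ti.data_oriented
class A:
    def __init__(self):
        self.x = ti.var(dt=ti.f32, shape=n, needs_grad=True)
        self.y = ti.var(dt=ti.f32, shape=n, needs_grad=True)

    @ti.classkernel
    def forward(self):
        for i in range(n):
            self.y[i] = square_of(self, i)

a = A()
a.x.fill(2.0)
a.forward()
a.y.grad.from_torch(torch.ones(n))   # seed the output gradient
a.forward.grad()                     # (2): the gradient version, v0.3.7+ style
print(a.x.grad.to_torch())           # dy_i/dx_i = 2 * x_i = 4.0 everywhere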

I tried @ti.data_oriented on a torch nn.Module class and it gives an error:

TypeError: super(type, obj): obj must be an instance or subtype of type

And I can no longer run the previous program. It says:

Runtime initialized.
{}
{}
{'__gradient': True}
Traceback (most recent call last):
  File "D:/reflex/src/learnable_p2g.py", line 146, in <module>
    test = gradcheck(linear, (pc, data), eps=1e-3, atol=1e-3)
  File "C:\Users\49446\AppData\Local\conda\conda\envs\py36\lib\site-packages\torch\autograd\gradcheck.py", line 279, in gradcheck
    analytical, reentrant, correct_grad_sizes = get_analytical_jacobian(tupled_inputs, o, nondet_tol=nondet_tol)
  File "C:\Users\49446\AppData\Local\conda\conda\envs\py36\lib\site-packages\torch\autograd\gradcheck.py", line 155, in get_analytical_jacobian
    retain_graph=True, allow_unused=True)
  File "C:\Users\49446\AppData\Local\conda\conda\envs\py36\lib\site-packages\torch\autograd\__init__.py", line 157, in grad
    inputs, allow_unused)
  File "C:\Users\49446\AppData\Local\conda\conda\envs\py36\lib\site-packages\torch\autograd\function.py", line 77, in apply
    return self._forward_cls.backward(self, *args)
  File "D:/reflex/src/learnable_p2g.py", line 29, in backward
    ctx.interp_kernel(__gradient=True)
  File "C:\Users\49446\AppData\Local\conda\conda\envs\py36\lib\site-packages\taichi\lang\kernel.py", line 359, in decorated
    primal(*args, **kwargs)
  File "C:\Users\49446\AppData\Local\conda\conda\envs\py36\lib\site-packages\taichi\lang\kernel.py", line 308, in __call__
    assert len(kwargs) == 0, 'kwargs not supported for Taichi kernels'
AssertionError: kwargs not supported for Taichi kernels

Process finished with exit code 1

Please see the change log: https://github.com/yuanming-hu/taichi#updates

Also could you share with me a script to reproduce the conflict between @ti.data_oriented and nn.Module? Thanks!

import torch
from torch.autograd import gradcheck
import torch.nn.functional as F
import math
import torch.nn as nn
import taichi as ti

ti.get_runtime().set_default_fp(ti.f32)
real = ti.f32

ti.data_oriented
class LinearFunction(torch.autograd.Function):
    @staticmethod
    def forward(ctx, input_data, weight_0, bias_0, weight_1, bias_1,
                ti_data, ti_weight_0, ti_bias_0, ti_weight_1, ti_bias_1, ti_output_0, ti_output_1, ti_kernel):
        ctx.ti_output_1 = ti_output_1
        ctx.ti_kernel = ti_kernel
        ctx.ti_data = ti_data
        ctx.ti_weight_0 = ti_weight_0
        ctx.ti_bias_0 = ti_bias_0
        ctx.ti_weight_1 = ti_weight_1
        ctx.ti_bias_1 = ti_bias_1
        ti_data.from_torch(input_data)
        ti_weight_0.from_torch(weight_0)
        ti_bias_0.from_torch(bias_0)
        ti_weight_1.from_torch(weight_1)
        ti_bias_1.from_torch(bias_1)
        ti_kernel()
        return ti_output_1.to_torch()

    @staticmethod
    def backward(ctx, grad_output_1):
        ti.clear_all_gradients()
        grad_input_data = grad_weight_0 = grad_bias_0 = grad_weight_1 = grad_bias_1 = None
        ctx.ti_output_1.grad.from_torch(grad_output_1)
        ctx.ti_kernel.grad()

        if ctx.needs_input_grad[0]:
            grad_input_data = ctx.ti_data.grad.to_torch()
        if ctx.needs_input_grad[1]:
            grad_weight_0 = ctx.ti_weight_0.grad.to_torch()
        if ctx.needs_input_grad[2]:
            grad_bias_0 = ctx.ti_bias_0.grad.to_torch()
        if ctx.needs_input_grad[3]:
            grad_weight_1 = ctx.ti_weight_1.grad.to_torch()
        if ctx.needs_input_grad[4]:
            grad_bias_1 = ctx.ti_bias_1.grad.to_torch()

        return grad_input_data, grad_weight_0, grad_bias_0, grad_weight_1, grad_bias_1,\
               None, None, None, None, None, None, None, None

ti.data_oriented
class Linear(nn.Module):
    def __init__(self, input_feature, hidden_feature, output_feature):
        super(Linear, self).__init__()
        # taichi parameter holders
        self.ti_data = ti.var(dt=real, shape=(batch_size, input_feature), needs_grad=True)
        self.ti_weight_0 = ti.var(dt=real, shape=(input_feature, hidden_feature), needs_grad=True)
        self.ti_bias_0 = ti.var(dt=real, shape=hidden_feature, needs_grad=True)
        self.ti_output_0 = ti.var(dt=real, shape=(batch_size, hidden_feature), needs_grad=True)
        self.ti_weight_1 = ti.var(dt=real, shape=(hidden_feature, out_feature), needs_grad=True)
        self.ti_bias_1 = ti.var(dt=real, shape=out_feature, needs_grad=True)
        self.ti_output_1 = ti.var(dt=real, shape=(batch_size, out_feature), needs_grad=True)
        # torch parameters
        self.weight_0 = nn.Parameter(torch.Tensor(input_feature, hidden_feature))
        self.bias_0 = nn.Parameter(torch.Tensor(hidden_feature))
        self.weight_1 = nn.Parameter(torch.Tensor(hidden_feature, out_feature))
        self.bias_1 = nn.Parameter(torch.Tensor(out_feature))
        self.weight_0.data.normal_(0, math.sqrt(2. / hidden_feature / input_feature))
        self.weight_1.data.normal_(0, math.sqrt(2. / hidden_feature / output_feature))

    @ti.classkernel
    def linear_kernel(self):
        for i in range(batch_size):
            for j in ti.static(range(hidden_feature)):
                dummy = 0.0
                for k in range(input_feature):
                    dummy += self.ti_data[i, k] * self.ti_weight_0[k, j]
                dummy += self.ti_bias_0[j]
                self.ti_output_0[i, j] = ti.max(dummy, 0)
            for j in ti.static(range(out_feature)):
                dummy = 0.0
                for k in range(hidden_feature):
                    dummy += self.ti_output_0[i, k] * self.ti_weight_1[k, j]
                dummy += self.ti_bias_1[j]
                self.ti_output_1[i, j] = dummy

    def forward(self, input_data):
        return LinearFunction.apply(input_data, self.weight_0, self.bias_0, self.weight_1, self.bias_1,
                                    self.ti_data, self.ti_weight_0, self.ti_bias_0, self.ti_weight_1, self.ti_bias_1,
                                    self.ti_output_0, self.ti_output_1, self.linear_kernel)


if __name__ == '__main__':
    batch_size = 1
    input_feature = 4
    hidden_feature = 8
    out_feature = 2
    data = torch.rand(batch_size, input_feature, dtype=torch.float32, requires_grad=True)
    linear = Linear(input_feature, hidden_feature, out_feature)

    test = gradcheck(linear, data, eps=1e-3, atol=1e-4)
    print(test)

Thanks! I’m looking into the incompatibility. Two issues in this script:

  • In Python 3 you can use super() without arguments
  • You need an @ before the decorator (see the sketch below)
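
In other words, the class header should look roughly like this (just a skeleton showing the two fixes applied to the Linear class above):

import torch.nn as nn
import taichi as ti

@ti.data_oriented                 # note the leading @
class Linear(nn.Module):
    def __init__(self, input_feature, hidden_feature, output_feature):
        super().__init__()        # Python 3: no arguments needed
        # field and parameter setup as in the script above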

Oh, I forgot the @! However, it gives another error:

    ctx.ti_kernel.grad()
AttributeError: 'function' object has no attribute 'grad'

Maybe the issue is that the function that needs grad is not a class method?

Yeah sorry about that. nn.Module has some other issues with @ti.data_oriented. I fixed that in v0.3.8 and the new version will be released in 20 minutes.

It may take 10 more minutes for v0.3.8 to build: https://ci.appveyor.com/project/IteratorAdvance/taichi/builds/29730350

Btw, this way of writing the NN does not really scale to larger networks. Here’s a scalable version:

import torch
from torch.autograd import gradcheck
import torch.nn.functional as F
import math
import torch.nn as nn
import taichi as ti

ti.get_runtime().set_default_fp(ti.f32)
real = ti.f32

class LinearFunction(torch.autograd.Function):
    @staticmethod
    def forward(ctx, input_data, weight_0, bias_0, weight_1, bias_1,
                ti_data, ti_weight_0, ti_bias_0, ti_weight_1, ti_bias_1, ti_output_0, ti_output_1, ti_fc_0, ti_fc_1, ti_kernel):
        ctx.ti_output_1 = ti_output_1
        ctx.ti_kernel = ti_kernel
        ctx.ti_data = ti_data
        ctx.ti_weight_0 = ti_weight_0
        ctx.ti_bias_0 = ti_bias_0
        ctx.ti_weight_1 = ti_weight_1
        ctx.ti_bias_1 = ti_bias_1
        ctx.ti_fc_0 = ti_fc_0
        ctx.ti_fc_1 = ti_fc_1
        ti_data.from_torch(input_data)
        ti_weight_0.from_torch(weight_0)
        ti_bias_0.from_torch(bias_0)
        ti_weight_1.from_torch(weight_1)
        ti_bias_1.from_torch(bias_1)
        ti_fc_0.fill(0)
        ti_fc_1.fill(0)
        ti_kernel()
        return ti_output_1.to_torch()

    @staticmethod
    def backward(ctx, grad_output_1):
        ti.clear_all_gradients()
        grad_input_data = grad_weight_0 = grad_bias_0 = grad_weight_1 = grad_bias_1 = None
        ctx.ti_output_1.grad.from_torch(grad_output_1)
        ctx.ti_kernel.grad()

        if ctx.needs_input_grad[0]:
            grad_input_data = ctx.ti_data.grad.to_torch()
        if ctx.needs_input_grad[1]:
            grad_weight_0 = ctx.ti_weight_0.grad.to_torch()
        if ctx.needs_input_grad[2]:
            grad_bias_0 = ctx.ti_bias_0.grad.to_torch()
        if ctx.needs_input_grad[3]:
            grad_weight_1 = ctx.ti_weight_1.grad.to_torch()
        if ctx.needs_input_grad[4]:
            grad_bias_1 = ctx.ti_bias_1.grad.to_torch()

        return grad_input_data, grad_weight_0, grad_bias_0, grad_weight_1, grad_bias_1,\
               None, None, None, None, None, None, None, None, None, None

@ti.data_oriented
class Linear(nn.Module):
    def __init__(self, input_feature, hidden_feature, output_feature):
        super().__init__()
        # taichi parameter holders
        self.ti_data = ti.var(dt=real, shape=(batch_size, input_feature), needs_grad=True)
        self.ti_weight_0 = ti.var(dt=real, shape=(input_feature, hidden_feature), needs_grad=True)
        self.ti_bias_0 = ti.var(dt=real, shape=hidden_feature, needs_grad=True)
        self.ti_fc_0 = ti.var(dt=real, shape=(batch_size, hidden_feature), needs_grad=True)
        self.ti_output_0 = ti.var(dt=real, shape=(batch_size, hidden_feature), needs_grad=True)
        self.ti_weight_1 = ti.var(dt=real, shape=(hidden_feature, out_feature), needs_grad=True)
        self.ti_bias_1 = ti.var(dt=real, shape=out_feature, needs_grad=True)
        self.ti_fc_1 = ti.var(dt=real, shape=(batch_size, out_feature), needs_grad=True)
        self.ti_output_1 = ti.var(dt=real, shape=(batch_size, out_feature), needs_grad=True)
        # torch parameters
        self.weight_0 = nn.Parameter(torch.Tensor(input_feature, hidden_feature))
        self.bias_0 = nn.Parameter(torch.Tensor(hidden_feature))
        self.weight_1 = nn.Parameter(torch.Tensor(hidden_feature, out_feature))
        self.bias_1 = nn.Parameter(torch.Tensor(out_feature))
        self.weight_0.data.normal_(0, math.sqrt(2. / hidden_feature / input_feature))
        self.weight_1.data.normal_(0, math.sqrt(2. / hidden_feature / output_feature))

    @ti.classkernel
    def linear_kernel(self):
        for i in range(batch_size):
            for j in range(hidden_feature):
                for k in range(input_feature):
                    self.ti_fc_0[i, j] += self.ti_data[i, k] * self.ti_weight_0[k, j]

        for i in range(batch_size):
            for j in range(hidden_feature):
                self.ti_output_0[i, j] = ti.max(self.ti_fc_0[i, j] + self.ti_bias_0[j], 0)

        for i in range(batch_size):
            for j in range(out_feature):
                for k in range(hidden_feature):
                    self.ti_fc_1[i, j] += self.ti_output_0[i, k] * self.ti_weight_1[k, j]

        for i in range(batch_size):
            for j in range(out_feature):
                self.ti_output_1[i, j] = self.ti_fc_1[i, j] + self.ti_bias_1[j]

    def forward(self, input_data):
        return LinearFunction.apply(input_data, self.weight_0, self.bias_0, self.weight_1, self.bias_1,
                                    self.ti_data, self.ti_weight_0, self.ti_bias_0, self.ti_weight_1, self.ti_bias_1,
                                    self.ti_output_0, self.ti_output_1, self.ti_fc_0, self.ti_fc_1, self.linear_kernel)


if __name__ == '__main__':
    batch_size = 4
    input_feature = 32
    hidden_feature = 64
    out_feature = 128
    data = torch.rand(batch_size, input_feature, dtype=torch.float32, requires_grad=True)
    linear = Linear(input_feature, hidden_feature, out_feature)

    test = gradcheck(linear, data, eps=1e-3, atol=1e-3)
    print(test)

One thing to keep in mind: differentiating mutable local variables is really tricky (and you need messy ti.static to make it work, which doesn’t scale). Accumulate to global variables instead.
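
For instance, here is a tiny self-contained sketch of that advice (the names n, x, and total are made up): the sum is accumulated directly into a global variable, so no mutable local accumulator and no ti.static unrolling are needed.

import torch
import taichi as ti

ti.get_runtime().set_default_fp(ti.f32)
n = 16
x = ti.var(dt=ti.f32, shape=n, needs_grad=True)
total = ti.var(dt=ti.f32, shape=1, needs_grad=True)   # global accumulator

@ti.kernel
def reduce():
    for i in range(n):
        total[0] += x[i] * x[i]   # accumulate into a global, not a mutable local

x.fill(1.0)
reduce()
total.grad.from_torch(torch.ones(1))   # seed the upstream gradient
reduce.grad()
print(x.grad.to_torch())               # d(sum_k x_k^2)/dx_i = 2 * x_i = 2.0 everywhere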

Please also check out the kernel simplicity rule: https://taichi.readthedocs.io/en/latest/differentiable_programming.html

I just want to confirm two things:

  1. Can there be multiple outer loops?
  2. Once I use ti.static, does the loop no longer count as a loop?

Good questions.

  1. Yes.
  2. With ti.static, the for loop is unrolled at compile time, so it’s no longer a loop at runtime.
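
To illustrate (2) with a tiny sketch (the field x and the sizes are arbitrary): the loop over i below is a real runtime loop, while the ti.static loop over j is unrolled at compile time.

import taichi as ti

n = 4
x = ti.var(dt=ti.i32, shape=(n, n))

@ti.kernel
def fill_x():
    for i in range(n):                  # a real (runtime) loop
        for j in ti.static(range(n)):   # unrolled at compile time; no longer a loop
            x[i, j] = i * n + j

fill_x()

After unrolling, the body of fill_x is equivalent to writing the four x[i, j] assignments by hand, which is why it doesn’t count as a loop for the simplicity rule.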