- Can I put all vars and vectors into a torch class (an nn.Module)?
- Does ti.classkernel work the same as a kernel defined outside a class?
- Is there something like ti.classfunc?
Take a look at this: https://taichi.readthedocs.io/en/latest/odop.html
- Yes.
- Yes, with one exception: when you want to explicitly call a class kernel's gradient version, it is X.method(__gradient=True) instead of X.method.grad(). If you use ti.Tape, you don't need to worry about this.
- You can just use ti.func and pass in self as a parameter.
Update: starting from v0.3.7, you don't need to worry about (2). Simply decorate the class with @ti.data_oriented and call A.forward.grad().
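For concreteness, here is a minimal sketch of that pattern, assuming the legacy ti.var API of this era; the class, field, and method names are made up for illustration:
import taichi as ti

@ti.data_oriented
class A:
    def __init__(self):
        self.x = ti.var(dt=ti.f32, shape=4, needs_grad=True)
        self.loss = ti.var(dt=ti.f32, shape=1, needs_grad=True)

    @ti.func
    def square(self, v):
        # ti.func works as a plain method; just pass in self
        return v * v

    @ti.classkernel
    def forward(self):
        for i in range(4):
            self.loss[0] += self.square(self.x[i])

a = A()
a.forward()            # primal pass
a.loss.grad[0] = 1     # seed the output gradient
a.forward.grad()       # gradient pass, the v0.3.7+ syntax mentioned above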
I tried @ti.data_oriented on a torch nn.Module class and it gives an error:
TypeError: super(type, obj): obj must be an instance or subtype of type
And I cannot run the previous program. It says:
Runtime initialized.
{}
{}
{'__gradient': True}
Traceback (most recent call last):
  File "D:/reflex/src/learnable_p2g.py", line 146, in <module>
    test = gradcheck(linear, (pc, data), eps=1e-3, atol=1e-3)
  File "C:\Users\49446\AppData\Local\conda\conda\envs\py36\lib\site-packages\torch\autograd\gradcheck.py", line 279, in gradcheck
    analytical, reentrant, correct_grad_sizes = get_analytical_jacobian(tupled_inputs, o, nondet_tol=nondet_tol)
  File "C:\Users\49446\AppData\Local\conda\conda\envs\py36\lib\site-packages\torch\autograd\gradcheck.py", line 155, in get_analytical_jacobian
    retain_graph=True, allow_unused=True)
  File "C:\Users\49446\AppData\Local\conda\conda\envs\py36\lib\site-packages\torch\autograd\__init__.py", line 157, in grad
    inputs, allow_unused)
  File "C:\Users\49446\AppData\Local\conda\conda\envs\py36\lib\site-packages\torch\autograd\function.py", line 77, in apply
    return self._forward_cls.backward(self, *args)
  File "D:/reflex/src/learnable_p2g.py", line 29, in backward
    ctx.interp_kernel(__gradient=True)
  File "C:\Users\49446\AppData\Local\conda\conda\envs\py36\lib\site-packages\taichi\lang\kernel.py", line 359, in decorated
    primal(*args, **kwargs)
  File "C:\Users\49446\AppData\Local\conda\conda\envs\py36\lib\site-packages\taichi\lang\kernel.py", line 308, in __call__
    assert len(kwargs) == 0, 'kwargs not supported for Taichi kernels'
AssertionError: kwargs not supported for Taichi kernels

Process finished with exit code 1
Please see the change log: https://github.com/yuanming-hu/taichi#updates
Also, could you share with me a script to reproduce the conflict between @ti.data_oriented and nn.Module? Thanks!
import torch
from torch.autograd import gradcheck
import torch.nn.functional as F
import math
import torch.nn as nn
import taichi as ti

ti.get_runtime().set_default_fp(ti.f32)
real = ti.f32

ti.data_oriented
class LinearFunction(torch.autograd.Function):
    @staticmethod
    def forward(ctx, input_data, weight_0, bias_0, weight_1, bias_1,
                ti_data, ti_weight_0, ti_bias_0, ti_weight_1, ti_bias_1,
                ti_output_0, ti_output_1, ti_kernel):
        ctx.ti_output_1 = ti_output_1
        ctx.ti_kernel = ti_kernel
        ctx.ti_data = ti_data
        ctx.ti_weight_0 = ti_weight_0
        ctx.ti_bias_0 = ti_bias_0
        ctx.ti_weight_1 = ti_weight_1
        ctx.ti_bias_1 = ti_bias_1
        # copy torch tensors into the taichi buffers, then run the kernel
        ti_data.from_torch(input_data)
        ti_weight_0.from_torch(weight_0)
        ti_bias_0.from_torch(bias_0)
        ti_weight_1.from_torch(weight_1)
        ti_bias_1.from_torch(bias_1)
        ti_kernel()
        return ti_output_1.to_torch()

    @staticmethod
    def backward(ctx, grad_output_1):
        ti.clear_all_gradients()
        grad_input_data = grad_weight_0 = grad_bias_0 = grad_weight_1 = grad_bias_1 = None
        ctx.ti_output_1.grad.from_torch(grad_output_1)  # seed the output gradient
        ctx.ti_kernel.grad()                            # run the gradient kernel
        if ctx.needs_input_grad[0]:
            grad_input_data = ctx.ti_data.grad.to_torch()
        if ctx.needs_input_grad[1]:
            grad_weight_0 = ctx.ti_weight_0.grad.to_torch()
        if ctx.needs_input_grad[2]:
            grad_bias_0 = ctx.ti_bias_0.grad.to_torch()
        if ctx.needs_input_grad[3]:
            grad_weight_1 = ctx.ti_weight_1.grad.to_torch()
        if ctx.needs_input_grad[4]:
            grad_bias_1 = ctx.ti_bias_1.grad.to_torch()
        return grad_input_data, grad_weight_0, grad_bias_0, grad_weight_1, grad_bias_1, \
            None, None, None, None, None, None, None, None

ti.data_oriented
class Linear(nn.Module):
    def __init__(self, input_feature, hidden_feature, output_feature):
        super(Linear, self).__init__()
        # taichi parameter holders
        self.ti_data = ti.var(dt=real, shape=(batch_size, input_feature), needs_grad=True)
        self.ti_weight_0 = ti.var(dt=real, shape=(input_feature, hidden_feature), needs_grad=True)
        self.ti_bias_0 = ti.var(dt=real, shape=hidden_feature, needs_grad=True)
        self.ti_output_0 = ti.var(dt=real, shape=(batch_size, hidden_feature), needs_grad=True)
        self.ti_weight_1 = ti.var(dt=real, shape=(hidden_feature, out_feature), needs_grad=True)
        self.ti_bias_1 = ti.var(dt=real, shape=out_feature, needs_grad=True)
        self.ti_output_1 = ti.var(dt=real, shape=(batch_size, out_feature), needs_grad=True)
        # torch parameters
        self.weight_0 = nn.Parameter(torch.Tensor(input_feature, hidden_feature))
        self.bias_0 = nn.Parameter(torch.Tensor(hidden_feature))
        self.weight_1 = nn.Parameter(torch.Tensor(hidden_feature, out_feature))
        self.bias_1 = nn.Parameter(torch.Tensor(out_feature))
        self.weight_0.data.normal_(0, math.sqrt(2. / hidden_feature / input_feature))
        self.weight_1.data.normal_(0, math.sqrt(2. / hidden_feature / output_feature))

    @ti.classkernel
    def linear_kernel(self):
        for i in range(batch_size):
            for j in ti.static(range(hidden_feature)):
                dummy = 0.0
                for k in range(input_feature):
                    dummy += self.ti_data[i, k] * self.ti_weight_0[k, j]
                dummy += self.ti_bias_0[j]
                self.ti_output_0[i, j] = ti.max(dummy, 0)
            for j in ti.static(range(out_feature)):
                dummy = 0.0
                for k in range(hidden_feature):
                    dummy += self.ti_output_0[i, k] * self.ti_weight_1[k, j]
                dummy += self.ti_bias_1[j]
                self.ti_output_1[i, j] = dummy

    def forward(self, input_data):
        return LinearFunction.apply(input_data, self.weight_0, self.bias_0, self.weight_1, self.bias_1,
                                    self.ti_data, self.ti_weight_0, self.ti_bias_0, self.ti_weight_1, self.ti_bias_1,
                                    self.ti_output_0, self.ti_output_1, self.linear_kernel)

if __name__ == '__main__':
    batch_size = 1
    input_feature = 4
    hidden_feature = 8
    out_feature = 2
    data = torch.rand(batch_size, input_feature, dtype=torch.float32, requires_grad=True)
    linear = Linear(input_feature, hidden_feature, out_feature)
    test = gradcheck(linear, data, eps=1e-3, atol=1e-4)
    print(test)
Thanks! I'm looking into the incompatibility. In the meantime, two issues in this script (see the sketch after this list):
- In Python 3 you can use super() without arguments.
- You need a @ before the decorator.
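A minimal sketch of both fixes applied (the class name comes from the script above; the body is trimmed for illustration):
import torch.nn as nn
import taichi as ti

@ti.data_oriented              # fix 2: the decorator needs a leading @
class Linear(nn.Module):
    def __init__(self):
        super().__init__()     # fix 1: Python 3 super() needs no arguments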
Oh, I forgot the @! However, it gives another error:
ctx.ti_kernel.grad()
AttributeError: 'function' object has no attribute 'grad'
Maybe the issue is that the function that needs grad is not a class method?
Yeah, sorry about that. nn.Module has some other issues with @ti.data_oriented. I fixed that in v0.3.8 and the new version will be released in 20 minutes.
It may take 10 more minutes for v0.3.8 to build: https://ci.appveyor.com/project/IteratorAdvance/taichi/builds/29730350
By the way, the way you are writing the NN does not really scale to larger networks. Here's a scalable version:
import torch
from torch.autograd import gradcheck
import torch.nn.functional as F
import math
import torch.nn as nn
import taichi as ti

ti.get_runtime().set_default_fp(ti.f32)
real = ti.f32

class LinearFunction(torch.autograd.Function):
    @staticmethod
    def forward(ctx, input_data, weight_0, bias_0, weight_1, bias_1,
                ti_data, ti_weight_0, ti_bias_0, ti_weight_1, ti_bias_1,
                ti_output_0, ti_output_1, ti_fc_0, ti_fc_1, ti_kernel):
        ctx.ti_output_1 = ti_output_1
        ctx.ti_kernel = ti_kernel
        ctx.ti_data = ti_data
        ctx.ti_weight_0 = ti_weight_0
        ctx.ti_bias_0 = ti_bias_0
        ctx.ti_weight_1 = ti_weight_1
        ctx.ti_bias_1 = ti_bias_1
        ctx.ti_fc_0 = ti_fc_0
        ctx.ti_fc_1 = ti_fc_1
        # copy torch tensors into the taichi buffers
        ti_data.from_torch(input_data)
        ti_weight_0.from_torch(weight_0)
        ti_bias_0.from_torch(bias_0)
        ti_weight_1.from_torch(weight_1)
        ti_bias_1.from_torch(bias_1)
        # clear the global accumulators before each forward pass
        ti_fc_0.fill(0)
        ti_fc_1.fill(0)
        ti_kernel()
        return ti_output_1.to_torch()

    @staticmethod
    def backward(ctx, grad_output_1):
        ti.clear_all_gradients()
        grad_input_data = grad_weight_0 = grad_bias_0 = grad_weight_1 = grad_bias_1 = None
        ctx.ti_output_1.grad.from_torch(grad_output_1)  # seed the output gradient
        ctx.ti_kernel.grad()                            # run the gradient kernel
        if ctx.needs_input_grad[0]:
            grad_input_data = ctx.ti_data.grad.to_torch()
        if ctx.needs_input_grad[1]:
            grad_weight_0 = ctx.ti_weight_0.grad.to_torch()
        if ctx.needs_input_grad[2]:
            grad_bias_0 = ctx.ti_bias_0.grad.to_torch()
        if ctx.needs_input_grad[3]:
            grad_weight_1 = ctx.ti_weight_1.grad.to_torch()
        if ctx.needs_input_grad[4]:
            grad_bias_1 = ctx.ti_bias_1.grad.to_torch()
        return grad_input_data, grad_weight_0, grad_bias_0, grad_weight_1, grad_bias_1, \
            None, None, None, None, None, None, None, None, None, None

@ti.data_oriented
class Linear(nn.Module):
    def __init__(self, input_feature, hidden_feature, output_feature):
        super().__init__()
        # taichi parameter holders
        self.ti_data = ti.var(dt=real, shape=(batch_size, input_feature), needs_grad=True)
        self.ti_weight_0 = ti.var(dt=real, shape=(input_feature, hidden_feature), needs_grad=True)
        self.ti_bias_0 = ti.var(dt=real, shape=hidden_feature, needs_grad=True)
        self.ti_fc_0 = ti.var(dt=real, shape=(batch_size, hidden_feature), needs_grad=True)
        self.ti_output_0 = ti.var(dt=real, shape=(batch_size, hidden_feature), needs_grad=True)
        self.ti_weight_1 = ti.var(dt=real, shape=(hidden_feature, out_feature), needs_grad=True)
        self.ti_bias_1 = ti.var(dt=real, shape=out_feature, needs_grad=True)
        self.ti_fc_1 = ti.var(dt=real, shape=(batch_size, out_feature), needs_grad=True)
        self.ti_output_1 = ti.var(dt=real, shape=(batch_size, out_feature), needs_grad=True)
        # torch parameters
        self.weight_0 = nn.Parameter(torch.Tensor(input_feature, hidden_feature))
        self.bias_0 = nn.Parameter(torch.Tensor(hidden_feature))
        self.weight_1 = nn.Parameter(torch.Tensor(hidden_feature, out_feature))
        self.bias_1 = nn.Parameter(torch.Tensor(out_feature))
        self.weight_0.data.normal_(0, math.sqrt(2. / hidden_feature / input_feature))
        self.weight_1.data.normal_(0, math.sqrt(2. / hidden_feature / output_feature))

    @ti.classkernel
    def linear_kernel(self):
        # accumulate into global variables (ti_fc_0, ti_fc_1) instead of
        # mutable locals; each stage is a separate outer loop
        for i in range(batch_size):
            for j in range(hidden_feature):
                for k in range(input_feature):
                    self.ti_fc_0[i, j] += self.ti_data[i, k] * self.ti_weight_0[k, j]
        for i in range(batch_size):
            for j in range(hidden_feature):
                self.ti_output_0[i, j] = ti.max(self.ti_fc_0[i, j] + self.ti_bias_0[j], 0)
        for i in range(batch_size):
            for j in range(out_feature):
                for k in range(hidden_feature):
                    self.ti_fc_1[i, j] += self.ti_output_0[i, k] * self.ti_weight_1[k, j]
        for i in range(batch_size):
            for j in range(out_feature):
                self.ti_output_1[i, j] = self.ti_fc_1[i, j] + self.ti_bias_1[j]

    def forward(self, input_data):
        return LinearFunction.apply(input_data, self.weight_0, self.bias_0, self.weight_1, self.bias_1,
                                    self.ti_data, self.ti_weight_0, self.ti_bias_0, self.ti_weight_1, self.ti_bias_1,
                                    self.ti_output_0, self.ti_output_1, self.ti_fc_0, self.ti_fc_1, self.linear_kernel)

if __name__ == '__main__':
    batch_size = 4
    input_feature = 32
    hidden_feature = 64
    out_feature = 128
    data = torch.rand(batch_size, input_feature, dtype=torch.float32, requires_grad=True)
    linear = Linear(input_feature, hidden_feature, out_feature)
    test = gradcheck(linear, data, eps=1e-3, atol=1e-3)
    print(test)
One thing to keep in mind: differentiating mutable local variables is really tricky (you need messy ti.static tricks to make it work, which doesn't scale). Accumulate into global variables instead, as the kernel above does; see also the sketch below.
Please also check out the kernel simplicity rule: https://taichi.readthedocs.io/en/latest/differentiable_programming.html
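To make the contrast concrete, here is a minimal self-contained sketch of the accumulate-into-globals pattern, using the same legacy ti.var API as the scripts above (the class and field names are made up):
import taichi as ti

real = ti.f32
n = 8

@ti.data_oriented
class Reduce:
    def __init__(self):
        self.x = ti.var(dt=real, shape=n, needs_grad=True)
        self.total = ti.var(dt=real, shape=1, needs_grad=True)

    @ti.classkernel
    def sum(self):
        # Rather than `s = 0.0` followed by `s += ...` (a mutable local,
        # which autodiff handles poorly), accumulate into a global:
        for i in range(n):
            self.total[0] += self.x[i]

r = Reduce()
r.x.fill(1)
r.sum()
r.total.grad[0] = 1
r.sum.grad()    # d(total)/d(x[i]) lands in r.x.grad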
I just want to confirm two things:
- Can there be multiple outer loops?
- Once I use ti.static, does it no longer count as a loop?
Good questions.
- Yes.
- Correct: with ti.static the for loop is unrolled at compile time, so it's no longer a loop. (Both points are illustrated in the sketch below.)
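A small sketch of both points, again on the legacy API with illustrative names: one kernel with two independent outer loops, plus a ti.static inner loop that is unrolled at compile time rather than executed at runtime:
import taichi as ti

real = ti.f32
n = 16

@ti.data_oriented
class Demo:
    def __init__(self):
        self.x = ti.var(dt=real, shape=(n, 4), needs_grad=True)
        self.y = ti.var(dt=real, shape=n, needs_grad=True)
        self.z = ti.var(dt=real, shape=n, needs_grad=True)

    @ti.classkernel
    def run(self):
        # outer loop 1
        for i in range(n):
            for j in ti.static(range(4)):   # unrolled; not a loop after compilation
                self.y[i] += self.x[i, j]
        # outer loop 2: multiple outer loops in one kernel are fine
        for i in range(n):
            self.z[i] = self.y[i] * 2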