# 菜鸡尝试用Taichi加速SGBM

``````python
@ti.kernel
def aggregate_S(self, minimum_cost: ti.types.ndarray(), offset: int, p1: ti.f32, p2: ti.f32):
    """Aggregate matching costs down every column, one row after another.

    For each pixel (i, x) with ``offset <= i < self.height`` and each
    disparity d, the value written is::

        self.cost_volume[i, x, d]
            + min(L[i-1, x, d],
                  L[i-1, x, d-1] + p1,
                  L[i-1, x, d+1] + p1,
                  min_k L[i-1, x, k] + p2)
            - min_k L[i-1, x, k]

    where L is ``minimum_cost`` and d-1 / d+1 are clamped to the valid
    disparity range.

    Args:
        minimum_cost: Array indexed as [row, column, disparity]. Row
            ``offset - 1`` must already hold valid costs; rows from
            ``offset`` onwards are overwritten in place.
        offset: First row to compute. Row ``offset - 1`` is read, so this
            must be at least 1.
        p1: Penalty added to the costs of the neighbouring disparities
            d-1 and d+1 of the previous row.
        p2: Penalty added to the minimum cost of the previous row.
    """
    # Row i depends on row i-1, so only the column loop may run in parallel.
    # Taichi parallelises the outermost loop of a kernel; the row sweep is
    # nested inside it and therefore executes serially for each column.
    for x in range(self.width):
        for i in range(offset, self.height):
            # Per-pixel scratch value kept in a local variable. A shared 0-D
            # field here would be written concurrently by every thread.
            prev_min = 1.0 * 2**30
            for d in range(self.disparities):
                prev_min = ti.min(prev_min, minimum_cost[i - 1, x, d])

            for d in range(self.disparities):
                # Clamp neighbouring disparities at both ends of the range.
                d_minus = ti.max(d - 1, 0)
                d_plus = ti.min(d + 1, self.disparities - 1)

                l1 = minimum_cost[i - 1, x, d]
                l2 = minimum_cost[i - 1, x, d_minus] + p1
                l3 = minimum_cost[i - 1, x, d_plus] + p1
                l4 = prev_min + p2

                best = ti.min(ti.min(l1, l2), ti.min(l3, l4))
                # Subtracting prev_min keeps the accumulated cost bounded.
                minimum_cost[i, x, d] = self.cost_volume[i, x, d] + best - prev_min

``````

1. 将backend从CPU换到GPU后，发现提速只有2倍不到（与预期相差有点远，但是taichi的CPU加速还是挺快的~）
2. 用GPU作为backend时，CPU消耗的memory也特别大：

图其实只有1430x984, depth的段数也就256段，讲道理就算是CPU跑也不应该需要这么大的memory

Hi @LiamLYJ , 请问可以给一个完整的代码，我们跑一下能够分析性能么？

@YuPeng ty~~

2 Likes

``````python
@ti.kernel
def aggregate_NS(self, minimum_cost: ti.types.ndarray(), cost_volume: ti.types.ndarray(), p1: ti.f32, p2: ti.f32):
    """Aggregate matching costs from the top row towards the bottom row.

    For each pixel (i, x) with ``1 <= i < self.height`` and each disparity
    d, the value written is::

        cost_volume[i, x, d]
            + min(L[i-1, x, d],
                  L[i-1, x, d-1] + p1,
                  L[i-1, x, d+1] + p1,
                  min_k L[i-1, x, k] + p2)
            - min_k L[i-1, x, k]

    where L is ``minimum_cost`` and d-1 / d+1 are clamped to the valid
    disparity range.

    Args:
        minimum_cost: Array indexed as [row, column, disparity]. Row 0 must
            already hold valid costs; rows 1 onwards are overwritten in
            place.
        cost_volume: Per-pixel matching costs, indexed like
            ``minimum_cost``.
        p1: Penalty added to the costs of the neighbouring disparities
            d-1 and d+1 of the previous row.
        p2: Penalty added to the minimum cost of the previous row.
    """
    # Row i reads row i-1, so the rows of one column must be visited in
    # order. Only the outermost (column) loop is parallelised by Taichi; the
    # nested row loop runs serially within each column.
    for x in range(self.width):
        for i in range(1, self.height):
            prev_min = 1.0 * 2**30
            for d in range(self.disparities):
                prev_min = ti.min(prev_min, minimum_cost[i - 1, x, d])

            for d in range(self.disparities):
                # Clamp neighbouring disparities at both ends of the range.
                d_minus = ti.max(d - 1, 0)
                d_plus = ti.min(d + 1, self.disparities - 1)

                l1 = minimum_cost[i - 1, x, d]
                l2 = minimum_cost[i - 1, x, d_minus] + p1
                l3 = minimum_cost[i - 1, x, d_plus] + p1
                l4 = prev_min + p2

                best = ti.min(ti.min(l1, l2), ti.min(l3, l4))
                # Subtracting prev_min keeps the accumulated cost bounded.
                minimum_cost[i, x, d] = cost_volume[i, x, d] + best - prev_min

@ti.kernel
def aggregate_WE(self, minimum_cost: ti.types.ndarray(), cost_volume: ti.types.ndarray(), p1: ti.f32, p2: ti.f32):
    """Aggregate matching costs from the left column towards the right.

    For each pixel (y, i) with ``1 <= i < self.width`` and each disparity
    d, the value written is::

        cost_volume[y, i, d]
            + min(L[y, i-1, d],
                  L[y, i-1, d-1] + p1,
                  L[y, i-1, d+1] + p1,
                  min_k L[y, i-1, k] + p2)
            - min_k L[y, i-1, k]

    where L is ``minimum_cost`` and d-1 / d+1 are clamped to the valid
    disparity range.

    Args:
        minimum_cost: Array indexed as [row, column, disparity]. Column 0
            must already hold valid costs; columns 1 onwards are
            overwritten in place.
        cost_volume: Per-pixel matching costs, indexed like
            ``minimum_cost``.
        p1: Penalty added to the costs of the neighbouring disparities
            d-1 and d+1 of the previous column.
        p2: Penalty added to the minimum cost of the previous column.
    """
    # Column i reads column i-1, so the columns of one row must be visited
    # in order. Only the outermost (row) loop is parallelised by Taichi; the
    # nested column loop runs serially within each row.
    for y in range(self.height):
        for i in range(1, self.width):
            prev_min = 1.0 * 2**30
            for d in range(self.disparities):
                prev_min = ti.min(prev_min, minimum_cost[y, i - 1, d])

            for d in range(self.disparities):
                # Clamp neighbouring disparities at both ends of the range.
                d_minus = ti.max(d - 1, 0)
                d_plus = ti.min(d + 1, self.disparities - 1)

                l1 = minimum_cost[y, i - 1, d]
                l2 = minimum_cost[y, i - 1, d_minus] + p1
                l3 = minimum_cost[y, i - 1, d_plus] + p1
                l4 = prev_min + p2

                best = ti.min(ti.min(l1, l2), ti.min(l3, l4))
                # Subtracting prev_min keeps the accumulated cost bounded.
                minimum_cost[y, i, d] = cost_volume[y, i, d] + best - prev_min

@ti.kernel
def aggregate_NW2SE(self, minimum_cost: ti.types.ndarray(), cost_volume: ti.types.ndarray(), p1: ti.f32, p2: ti.f32):
    """Aggregate matching costs along diagonals from top-left to bottom-right.

    Every pixel (y, x) with ``y >= 1`` and ``x >= 1`` is updated from its
    predecessor (y-1, x-1) on the same diagonal::

        cost_volume[y, x, d]
            + min(L[y-1, x-1, d],
                  L[y-1, x-1, d-1] + p1,
                  L[y-1, x-1, d+1] + p1,
                  min_k L[y-1, x-1, k] + p2)
            - min_k L[y-1, x-1, k]

    where L is ``minimum_cost`` and d-1 / d+1 are clamped to the valid
    disparity range. The diagonals are handled in two sweeps: those that
    start in column 0 (y - x >= 0) and those that start in row 0
    (x - y >= 1). A pixel only reads from its own diagonal, so the loops over
    diagonals are the outermost ones and each diagonal is walked in order.

    Args:
        minimum_cost: Array indexed as [row, column, disparity]. Row 0 and
            column 0 must already hold valid costs; every other pixel is
            overwritten in place.
        cost_volume: Per-pixel matching costs, indexed like
            ``minimum_cost``.
        p1: Penalty added to the costs of the neighbouring disparities
            d-1 and d+1 of the predecessor pixel.
        p2: Penalty added to the minimum cost of the predecessor pixel.
    """
    # Sweep 1: diagonals starting at (line, 0), i.e. y = x + line.
    for line in range(self.height - 1):
        # y = x + line must stay below self.height, hence x < height - line.
        # (A bound of height - 1 - line would skip the bottom image row.)
        for x in range(1, ti.min(self.width, self.height - line)):
            y = x + line
            prev_min = 1.0 * 2**30
            for d in range(self.disparities):
                prev_min = ti.min(prev_min, minimum_cost[y - 1, x - 1, d])

            for d in range(self.disparities):
                # Clamp neighbouring disparities at both ends of the range.
                d_minus = ti.max(d - 1, 0)
                d_plus = ti.min(d + 1, self.disparities - 1)

                l1 = minimum_cost[y - 1, x - 1, d]
                l2 = minimum_cost[y - 1, x - 1, d_minus] + p1
                l3 = minimum_cost[y - 1, x - 1, d_plus] + p1
                l4 = prev_min + p2

                best = ti.min(ti.min(l1, l2), ti.min(l3, l4))
                minimum_cost[y, x, d] = cost_volume[y, x, d] + best - prev_min

    # Sweep 2: diagonals starting at (0, line + 1), i.e. x = y + line + 1.
    for line in range(self.width - 2):
        # x = y + line + 1 must stay below self.width.
        for y in range(1, ti.min(self.height, self.width - 1 - line)):
            x = y + line + 1
            prev_min = 1.0 * 2**30
            for d in range(self.disparities):
                prev_min = ti.min(prev_min, minimum_cost[y - 1, x - 1, d])

            for d in range(self.disparities):
                # Clamp neighbouring disparities at both ends of the range.
                d_minus = ti.max(d - 1, 0)
                d_plus = ti.min(d + 1, self.disparities - 1)

                l1 = minimum_cost[y - 1, x - 1, d]
                l2 = minimum_cost[y - 1, x - 1, d_minus] + p1
                l3 = minimum_cost[y - 1, x - 1, d_plus] + p1
                l4 = prev_min + p2

                best = ti.min(ti.min(l1, l2), ti.min(l3, l4))
                minimum_cost[y, x, d] = cost_volume[y, x, d] + best - prev_min

``````

1 Like