代码修了一些bug。。但是现在发现CPU和GPU跑出来的结果是不一样的。。。
求指导



其中kernel的部分如下,全部代码在上面的链接中
@ti.kernel
def aggregate_NS(self, minimum_cost: ti.types.ndarray(), cost_volume: ti.types.ndarray(), p1: ti.f32, p2: ti.f32):
for x, i in ti.ndrange(self.width, (1, self.height)):
min_cost_last = 1.0 * 2**30
for d in range(self.disparities):
min_cost_last = ti.min(min_cost_last, minimum_cost[i-1, x, d])
for d in range(self.disparities):
l1 = minimum_cost[i-1, x, d]
d_minus = ti.max(d-1, 0)
d_plus = ti.min(d+1, self.disparities - 1)
l2 = minimum_cost[i-1, x, d_minus] + p1
l3 = minimum_cost[i-1, x, d_plus] + p1
l4 = min_cost_last + p2
tmp_a = ti.min(l1, l2)
tmp_b = ti.min(l3, l4)
tmp = ti.min(tmp_a, tmp_b)
minimum_cost[i, x, d] = cost_volume[i, x, d] + tmp - min_cost_last
@ti.kernel
def aggregate_WE(self, minimum_cost: ti.types.ndarray(), cost_volume: ti.types.ndarray(), p1: ti.f32, p2: ti.f32):
for y, i in ti.ndrange(self.height, (1, self.width)):
min_cost_last = 1.0 * 2**30
for d in range(self.disparities):
min_cost_last = ti.min(min_cost_last, minimum_cost[y, i-1, d])
for d in range(self.disparities):
l1 = minimum_cost[y, i-1, d]
d_minus = ti.max(d-1, 0)
d_plus = ti.min(d+1, self.disparities - 1)
l2 = minimum_cost[y, i-1, d_minus] + p1
l3 = minimum_cost[y, i-1, d_plus] + p1
l4 = min_cost_last + p2
tmp_a = ti.min(l1, l2)
tmp_b = ti.min(l3, l4)
tmp = ti.min(tmp_a, tmp_b)
minimum_cost[y, i, d] = cost_volume[y, i, d] + tmp - min_cost_last
@ti.kernel
def aggregate_NW2SE(self, minimum_cost: ti.types.ndarray(), cost_volume: ti.types.ndarray(), p1: ti.f32, p2: ti.f32):
for line in range(self.height - 1):
for x in range(1, ti.min(self.width, self.height - 1 - line)):
y = x + line
min_cost_last = 1.0 * 2**30
for d in range(self.disparities):
min_cost_last = ti.min(min_cost_last, minimum_cost[y-1, x-1, d])
for d in range(self.disparities):
l1 = minimum_cost[y-1, x-1, d]
d_minus = ti.max(d-1, 0)
d_plus = ti.min(d+1, self.disparities - 1)
l2 = minimum_cost[y-1, x-1, d_minus] + p1
l3 = minimum_cost[y-1, x-1, d_plus] + p1
l4 = min_cost_last + p2
tmp_a = ti.min(l1, l2)
tmp_b = ti.min(l3, l4)
tmp = ti.min(tmp_a, tmp_b)
minimum_cost[y, x, d] = cost_volume[y, x, d] + tmp - min_cost_last
for line in range(self.width - 2):
for y in range(1, ti.min(self.height, self.width - 1 - line)):
x = y + line + 1
min_cost_last = 1.0 * 2**30
for d in range(self.disparities):
min_cost_last = ti.min(min_cost_last, minimum_cost[y-1, x-1, d])
for d in range(self.disparities):
l1 = minimum_cost[y-1, x-1, d]
d_minus = ti.max(d-1, 0)
d_plus = ti.min(d+1, self.disparities - 1)
l2 = minimum_cost[y-1, x-1, d_minus] + p1
l3 = minimum_cost[y-1, x-1, d_plus] + p1
l4 = min_cost_last + p2
tmp_a = ti.min(l1, l2)
tmp_b = ti.min(l3, l4)
tmp = ti.min(tmp_a, tmp_b)
minimum_cost[y, x, d] = cost_volume[y, x, d] + tmp - min_cost_last