原子操作并行竞争问题

为什么使用了原子操作的 kernel 在 GPU 上运行时,每次得到的结果会不一样?例如这里的 grid_ids_new1,每次运行完内容都不相同。
@ti.kernel
def sort(
    grid_ids1: ti.types.ndarray(),
    grid_particles_num_temp1: ti.types.ndarray(),
    grid_particles_num1: ti.types.ndarray(),
):
    """Assign every element a destination slot, grouped by its grid cell.

    For each index ``idx`` (visited from 20906 down to 0) the kernel reads
    the cell id ``grid_ids1[idx]``, takes one slot from that cell's remaining
    counter with ``ti.atomic_sub`` and writes
    ``old_counter - 1 + base_offset`` into ``grid_ids_new1[idx]``.

    Args:
        grid_ids1: cell id of each element; indexed up to 20906, so it must
            hold at least 20907 entries.
        grid_particles_num_temp1: per-cell counter, indexed by cell id. It is
            decremented IN PLACE once per element, so after the call it no
            longer holds the values that were passed in.
        grid_particles_num1: indexed by ``cell - 1`` to obtain the base
            offset of ``cell`` (presumably an inclusive prefix sum of the
            per-cell counts -- confirm against the caller). Read only.

    Side effects:
        Writes ``grid_ids_new1``, which is not a parameter; it is resolved
        from the enclosing scope (presumably a Taichi field/ndarray with at
        least 20907 entries -- confirm).

    Why the loop is serialized:
        When this outermost loop is parallelized (the default, and what
        happens on GPU), ``ti.atomic_sub`` still hands out a unique slot to
        each element, but the order in which threads reach the atomic is
        unspecified. Which element of a cell receives which slot therefore
        changes from run to run. Serializing the loop makes the reverse
        traversal order well defined, so the output is reproducible.
        Trade-off: the loop no longer runs in parallel.
    """
    # Applies to the for-loop that immediately follows.
    ti.loop_config(serialize=True)
    for i in range(20907):
        # Walk the elements from last to first.
        idx = 20907 - 1 - i
        cell = grid_ids1[idx]

        # Cell 0 starts at slot 0; any other cell starts where the previous
        # cell ends according to grid_particles_num1.
        base_offset = 0
        if cell - 1 >= 0:
            base_offset = grid_particles_num1[cell - 1]

        # atomic_sub returns the counter value from BEFORE the decrement,
        # hence the extra "- 1" to turn a count into a zero-based slot.
        grid_ids_new1[idx] = (
            ti.atomic_sub(grid_particles_num_temp1[cell], 1) - 1 + base_offset
        )

# Reported issue: when the kernel's loop runs in parallel on a GPU, the
# contents of grid_ids_new1 differ from run to run.
# a1, b1, c1 are defined outside this snippet; they are passed as grid_ids1,
# grid_particles_num_temp1 and grid_particles_num1 respectively.
# NOTE(review): the kernel decrements its second argument (b1) in place, so
# b1 presumably has to be refilled before calling sort() again -- confirm.
sort(a1,b1,c1)