# CUDA中如何选择Block的尺寸

ti example中mpm3d, 为什么block_dim=16时比block_dim = 32/64/128 更快, 代码如下

``````import numpy as np

import taichi as ti

ti.init(arch=ti.gpu)

block_dim=16                               # block dim = 16/32/64/128??
#dim, n_grid, steps, dt = 2, 128, 20, 2e-4
#dim, n_grid, steps, dt = 2, 256, 32, 1e-4
dim, n_grid, steps, dt = 3, 64, 25, 4e-4
#dim, n_grid, steps, dt = 3, 64, 25, 2e-4
#dim, n_grid, steps, dt = 3, 128, 25, 8e-5

n_particles = 200000
dx = 1 / n_grid

p_rho = 1
p_vol = (dx * 0.5)**2
p_mass = p_vol * p_rho
gravity = 9.8
bound = 3
E = 400

F_x = ti.Vector.field(dim, float, n_particles)
F_v = ti.Vector.field(dim, float, n_particles)
F_C = ti.Matrix.field(dim, dim, float, n_particles)
F_J = ti.field(float, n_particles)

F_grid_v = ti.Vector.field(dim, float, (n_grid, ) * dim)
F_grid_m = ti.field(float, (n_grid, ) * dim)

neighbour = (3, ) * dim

@ti.kernel
def substep():
for I in ti.grouped(F_grid_m):
F_grid_v[I] = ti.zero(F_grid_v[I])
F_grid_m[I] = 0
ti.loop_config(block_dim=block_dim)
for p in F_x:
Xp = F_x[p] / dx
base = int(Xp - 0.5)
fx = Xp - base
w = [0.5 * (1.5 - fx)**2, 0.75 - (fx - 1)**2, 0.5 * (fx - 0.5)**2]
stress = -dt * 4 * E * p_vol * (F_J[p] - 1) / dx**2
affine = ti.Matrix.identity(float, dim) * stress + p_mass * F_C[p]
for offset in ti.static(ti.grouped(ti.ndrange(*neighbour))):
dpos = (offset - fx) * dx
weight = 1.0
for i in ti.static(range(dim)):
weight *= w[offset[i]][i]
F_grid_v[base +
offset] += weight * (p_mass * F_v[p] + affine @ dpos)
F_grid_m[base + offset] += weight * p_mass
for I in ti.grouped(F_grid_m):
if F_grid_m[I] > 0:
F_grid_v[I] /= F_grid_m[I]
F_grid_v[I][1] -= dt * gravity
cond = (I < bound) & (F_grid_v[I] < 0) | \
(I > n_grid - bound) & (F_grid_v[I] > 0)
F_grid_v[I] = ti.select(cond, 0, F_grid_v[I])
ti.loop_config(block_dim=block_dim)
for p in F_x:
Xp = F_x[p] / dx
base = int(Xp - 0.5)
fx = Xp - base
w = [0.5 * (1.5 - fx)**2, 0.75 - (fx - 1)**2, 0.5 * (fx - 0.5)**2]
new_v = ti.zero(F_v[p])
new_C = ti.zero(F_C[p])
for offset in ti.static(ti.grouped(ti.ndrange(*neighbour))):
dpos = (offset - fx) * dx
weight = 1.0
for i in ti.static(range(dim)):
weight *= w[offset[i]][i]
g_v = F_grid_v[base + offset]
new_v += weight * g_v
new_C += 4 * weight * g_v.outer_product(dpos) / dx**2
F_v[p] = new_v
F_x[p] += dt * F_v[p]
F_J[p] *= 1 + dt * new_C.trace()
F_C[p] = new_C

@ti.kernel
def init():
for i in range(n_particles):
F_x[i] = ti.Vector([ti.random() for i in range(dim)]) * 0.4 + 0.15
F_J[i] = 1

def T(a):
if dim == 2:
return a

a = a - 0.5
x, y, z = a[:, 0], a[:, 1], a[:, 2]
cp, sp = np.cos(phi), np.sin(phi)
ct, st = np.cos(theta), np.sin(theta)
x, z = x * cp + z * sp, z * cp - x * sp
u, v = x, y * ct + z * st
return np.array([u, v]).swapaxes(0, 1) + 0.5

def main():
init()
gui = ti.GUI('MPM3D', background_color=0x112F41)
while gui.running and not gui.get_event(gui.ESCAPE):
for s in range(steps):
substep()
pos = F_x.to_numpy()
if export_file:
writer = ti.tools.PLYWriter(num_vertices=n_particles)
writer.add_vertex_pos(pos[:, 0], pos[:, 1], pos[:, 2])
writer.export_frame(gui.frame, export_file)