I tried writing a 3D version of mpm88:
import taichi as ti
import numpy as np

ti.init(arch=ti.opengl)

# simulation parameters
dim = 3
n_grid = 64
n_particles = n_grid ** dim // 2 ** (dim - 1)
dx = 1 / n_grid
dt = 2e-4
p_rho = 1
p_vol = (dx * 0.5)**2
p_mass = p_vol * p_rho
gravity = 9.8
bound = 3
E = 400

# particle fields: position, velocity, affine velocity matrix, volume ratio
x = ti.Vector.field(dim, float, n_particles)
v = ti.Vector.field(dim, float, n_particles)
C = ti.Matrix.field(dim, dim, float, n_particles)
J = ti.field(float, n_particles)

# grid fields: momentum / velocity and mass
grid_v = ti.Vector.field(dim, float, (n_grid,) * dim)
grid_m = ti.field(float, (n_grid,) * dim)

neighbour = (3,) * dim
@ti.kernel
def substep():
    # clear grid
    for I in ti.grouped(grid_m):
        grid_v[I] = grid_v[I] * 0
        grid_m[I] = 0
    # P2G: scatter particle mass and momentum onto the grid
    for p in x:
        # read x, v, J, C; multi-read-write grid_v, grid_m
        Xp = x[p] / dx
        base = int(Xp - 0.5)
        fx = Xp - base
        # quadratic B-spline interpolation weights
        w = [0.5 * (1.5 - fx)**2, 0.75 - (fx - 1)**2, 0.5 * (fx - 0.5)**2]
        stress = -dt * 4 * E * p_vol * (J[p] - 1) / dx**2
        affine = ti.Matrix.identity(float, dim) * stress + p_mass * C[p]
        for offset in ti.static(ti.grouped(ti.ndrange(*neighbour))):
            dpos = (offset - fx) * dx
            weight = 1.0
            for i in ti.static(range(dim)):
                weight *= w[offset[i]][i]
            grid_v[base + offset] += weight * (p_mass * v[p] + affine @ dpos)
            grid_m[base + offset] += weight * p_mass
    # grid update: normalize momentum to velocity, apply gravity and
    # sticky boundary conditions
    for I in ti.grouped(grid_m):
        # read grid_m, grid_v; write grid_v
        if grid_m[I] > 0:
            grid_v[I] /= grid_m[I]
        grid_v[I][1] -= dt * gravity
        cond = (I < bound) & (grid_v[I] < 0) | (I > n_grid - bound) & (grid_v[I] > 0)
        grid_v[I] = 0 if cond else grid_v[I]
    # G2P: gather grid velocity back to particles and advect
    for p in x:
        # read x, v, J, C, grid_v; write x, v, J, C
        Xp = x[p] / dx
        base = int(Xp - 0.5)
        fx = Xp - base
        w = [0.5 * (1.5 - fx)**2, 0.75 - (fx - 1)**2, 0.5 * (fx - 0.5)**2]
        new_v = v[p] * 0
        new_C = C[p] * 0
        for offset in ti.static(ti.grouped(ti.ndrange(*neighbour))):
            dpos = (offset - fx) * dx
            weight = 1.0
            for i in ti.static(range(dim)):
                weight *= w[offset[i]][i]
            g_v = grid_v[base + offset]
            new_v += weight * g_v
            new_C += 4 * weight * g_v.outer_product(dpos) / dx**2
        v[p] = new_v
        x[p] += dt * v[p]
        J[p] *= 1 + dt * new_C.trace()
        C[p] = new_C
@ti.kernel
def init():
    for i in range(n_particles):
        # scatter particles uniformly inside the cube [0.2, 0.6]^dim
        x[i] = ti.Vector([ti.random() for _ in range(dim)]) * 0.4 + 0.2
        J[i] = 1


def T(a):
    # project 3D positions to 2D for the GUI: rotate by phi around the
    # vertical axis and theta around the horizontal axis, then drop depth
    if dim == 2:
        return a
    phi, theta = np.radians(28), np.radians(32)
    a = a - 0.5
    x, y, z = a[:, 0], a[:, 1], a[:, 2]
    c, s = np.cos(phi), np.sin(phi)
    C, S = np.cos(theta), np.sin(theta)
    x, z = x * c + z * s, z * c - x * s
    u, v = x, y * C + z * S
    return np.array([u, v]).swapaxes(0, 1) + 0.5
init()
gui = ti.GUI('MPM3D', background_color=0x112F41)
while gui.running and not gui.get_event(gui.ESCAPE):
    for s in range(20):  # 20 substeps per rendered frame
        substep()
    pos = x.to_numpy()
    gui.circles(T(pos), radius=2, color=0x66ccff)
    gui.show()
Then, to my surprise, I found that it runs at exactly the same FPS on my discrete GPU and my integrated GPU! Both hover around 2 FPS once the particles hit the ground.
Does this mean the program is already severely memory-bound, so the speed bottleneck lies entirely in memory? How can I eliminate this bottleneck?
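To sanity-check the memory-bound hypothesis, here is a back-of-the-envelope estimate (a minimal sketch: it assumes every float atomic costs exactly one read plus one write of the target word, and counts only the P2G scatter, ignoring the grid clear, particle loads, and G2P):

# Rough estimate of memory traffic generated by the P2G scatter alone,
# assuming each atomic add is one read + one write of a 4-byte float.
n_grid, dim, substeps = 64, 3, 20
n_particles = n_grid ** dim // 2 ** (dim - 1)   # 65536 particles
nodes = 3 ** dim                                # 27-node quadratic stencil
floats_per_node = 3 + 1                         # grid_v (vec3) + grid_m
bytes_per_frame = n_particles * nodes * floats_per_node * 4 * 2 * substeps
print(bytes_per_frame / 1e9)                    # ~1.13 GB per frame
print(2 * bytes_per_frame / 1e9)                # ~2.3 GB/s at 2 FPS

If the effective traffic at 2 FPS is only on the order of 2 GB/s, both GPUs should have plenty of bandwidth to spare, which would point at atomic contention (many particles hammering the same grid nodes) rather than raw bandwidth saturation.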
I tried the kernel_profiler on CUDA and found that most of the time is spent in P2G. It seems that our floating-point atomic implementation needs multiple reads and writes and collides with other threads, which lowers the memory-bandwidth utilization?
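For completeness, this is roughly how I enabled the profiler (a minimal sketch with a stand-in kernel, not my exact invocation; the printing call differs across Taichi versions):

import taichi as ti

ti.init(arch=ti.cuda, kernel_profiler=True)  # enable per-kernel timing

f = ti.field(float, (64,) * 3)

@ti.kernel
def touch():
    for I in ti.grouped(f):
        f[I] += 1.0  # stand-in workload in place of substep()

for _ in range(100):
    touch()

# Dump per-kernel timings; on older Taichi versions this call is
# ti.kernel_profiler_print() instead.
ti.print_kernel_profile_info()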