稀疏矩阵求解器不能在cpu模式下计算64位浮点数

在 1.5.0版本中,我试图将implicit_fem.py例程中的稀疏矩阵求解器使用cpu float64位求解
ti.init(arch=ti.cpu,default_fp=ti.f64)
其余f32也都改成了f64,为了匹配scene.mesh,我写了如下转换程序,运行正常

    @ti.kernel
    def turn_f32():
        for X in F_x:
            F_x_s[X] = ti.cast(F_x[X],ti.f32)

但是v = solver.solve(F_b_ndarr) 求解结果为0,这一情况并没有出现在选择CUDA 32位和x64 32位的时候。

查找了稀疏矩阵相关的说明,提到过稀疏矩阵求解器在x64模式下支持f64求解。以下是求解器设定:

    A_builder = ti.linalg.SparseMatrixBuilder(3 * n_verts,
                                              3 * n_verts,
                                              max_num_triplets=5000000)
    solver = ti.linalg.SparseSolver(ti.f64, "LLT")
1 个赞

代码如下

import argparse

import numpy as np
from taichi._lib import core as _ti_core

import taichi as ti
import math
import matplotlib.pyplot as plt

parser = argparse.ArgumentParser()
parser.add_argument('--exp',
                    choices=['implicit', 'explicit'],
                    default='implicit')
parser.add_argument('--sol',
                    choices=['NewtonEuler', 'BDF2','BDF1','RK4'],
                    default='NewtonEuler')
parser.add_argument('--dim', type=int, default=3)
parser.add_argument('--gui', choices=['auto', 'ggui', 'cpu'], default='auto')
parser.add_argument('-s',
                    '--use_sparse',
                    action='store_true',
                    help='Use sparse matrix and sparse solver')
parser.add_argument('place_holder', nargs='*')
args = parser.parse_args()

ti.init(arch=ti.cpu,default_fp=ti.f64)

if args.gui == 'auto':
    if _ti_core.GGUI_AVAILABLE and ti.lang.impl.current_cfg().arch == ti.cuda:
        args.gui = 'ggui'
    else:
        args.gui = 'cpu'

E, nu = 69e9, 0.33 #Pa
mu, la = E / (2 * (1 + nu)), E * nu / ((1 + nu) * (1 - 2 * nu))  # lambda = 0
density = 2700.0 #kg/m^3 
dt = 1e-5 #s

subStep = 1000 #10s
plotX = np.linspace(0.0,1.0,subStep)
plotYx = np.linspace(0.0,0.0,subStep)
plotYy = np.linspace(0.0,0.0,subStep)
plotYz = np.linspace(0.0,0.0,subStep)
simuStep = 0

if args.exp == 'implicit':
    dt = 1e-2 #s

use_sparse = args.use_sparse

n_cube = np.array([2,10,2])
n_verts = np.product(n_cube)
n_cells = 5 * np.product(n_cube - 1)
dx = 3 / (n_cube.max() - 1) #0.02*0.02*3

F_vertices = ti.Vector.field(4, dtype=ti.i32, shape=n_cells)

F_x = ti.Vector.field(args.dim, dtype=ti.f64, shape=n_verts)
F_x_s = ti.Vector.field(args.dim, dtype=ti.f32, shape=n_verts)
F_ox = ti.Vector.field(args.dim, dtype=ti.f64, shape=n_verts)
F_v = ti.Vector.field(args.dim, dtype=ti.f64, shape=n_verts)
F_f = ti.Vector.field(args.dim, dtype=ti.f64, shape=n_verts)
F_mul_ans = ti.Vector.field(args.dim, dtype=ti.f64, shape=n_verts)
F_m = ti.field(dtype=ti.f64, shape=n_verts)

F_x1 = ti.Vector.field(args.dim, dtype=ti.f64, shape=n_verts)
F_v0 = ti.Vector.field(args.dim, dtype=ti.f64, shape=n_verts)
F_v1 = ti.Vector.field(args.dim, dtype=ti.f64, shape=n_verts)
F_v2 = ti.Vector.field(args.dim, dtype=ti.f64, shape=n_verts)
F_v3 = ti.Vector.field(args.dim, dtype=ti.f64, shape=n_verts)
F_v4 = ti.Vector.field(args.dim, dtype=ti.f64, shape=n_verts)

n_cells = (n_cube - 1).prod() * 5
F_B = ti.Matrix.field(args.dim, args.dim, dtype=ti.f64, shape=n_cells)
F_W = ti.field(dtype=ti.f64, shape=n_cells)


@ti.func
def i2p(I):
    return (I.x * n_cube[1] + I.y) * n_cube[2] + I.z


@ti.func
def set_element(e, I, verts):
    for i in ti.static(range(args.dim + 1)):
        F_vertices[e][i] = i2p(I + (([verts[i] >> k
                                      for k in range(3)] ^ I) & 1))


@ti.kernel
def get_vertices():
    '''
    This kernel partitions the cube into tetrahedrons.
    Each unit cube is divided into 5 tetrahedrons.
    '''
    for I in ti.grouped(ti.ndrange(*(n_cube - 1))):
        e = ((I.x * (n_cube[1] - 1) + I.y) * (n_cube[2] - 1) + I.z) * 5
        for i, j in ti.static(enumerate([0, 3, 5, 6])):
            set_element(e + i, I, (j, j ^ 1, j ^ 2, j ^ 4))
        set_element(e + 4, I, (1, 2, 4, 7))
    for I in ti.grouped(ti.ndrange(*(n_cube))):
        F_ox[i2p(I)] = I * dx


@ti.func
def Ds(verts):
    return ti.Matrix.cols([F_x[verts[i]] - F_x[verts[3]] for i in range(3)])


@ti.func
def ssvd(F):
    U, sig, V = ti.svd(F)
    if U.determinant() < 0:
        for i in ti.static(range(3)):
            U[i, 2] *= -1
        sig[2, 2] = -sig[2, 2]
    if V.determinant() < 0:
        for i in ti.static(range(3)):
            V[i, 2] *= -1
        sig[2, 2] = -sig[2, 2]
    return U, sig, V


@ti.func
def get_force_func(c, verts):
    F = Ds(verts) @ F_B[c] 
    P = ti.Matrix.zero(ti.f64, 3, 3)
    U, sig, V = ssvd(F)
    P = 2 * mu * (F - U @ V.transpose())
    H = -F_W[c] * P @ F_B[c].transpose()
    for i in ti.static(range(3)):
        force = ti.Vector([H[j, i] for j in range(3)])
        F_f[verts[i]] += force
        F_f[verts[3]] -= force


@ti.kernel
def get_force():
    for c in F_vertices:
        get_force_func(c, F_vertices[c])
    for u in F_f:
        F_f[u].y -= 9.8 * F_m[u]


@ti.kernel
def matmul_cell(ret: ti.template(), vel: ti.template()):
    for i in ret:
        ret[i] = vel[i] * F_m[i]
    for c in F_vertices:
        verts = F_vertices[c]
        W_c = F_W[c]
        B_c = F_B[c]
        for u in range(4):
            for d in range(3):
                dD = ti.Matrix.zero(ti.f64, 3, 3)
                if u == 3:
                    for j in range(3):
                        dD[d, j] = -1
                else:
                    dD[d, u] = 1
                dF = dD @ B_c
                dP = 2.0 * mu * dF
                dH = -W_c * dP @ B_c.transpose()
                for i in range(3):
                    for j in range(3):
                        tmp = (vel[verts[i]][j] - vel[verts[3]][j])
                        ret[verts[u]][d] += -dt**2 * dH[j, i] * tmp


@ti.kernel
def add(ans: ti.template(), a: ti.template(), k: ti.f64, b: ti.template()):
    for i in ans:
        ans[i] = a[i] + k * b[i]


@ti.kernel
def dot(a: ti.template(), b: ti.template()) -> ti.f64:
    ans = 0.0
    for i in a:
        ans += a[i].dot(b[i])
    return ans


F_b = ti.Vector.field(3, dtype=ti.f64, shape=n_verts)
F_r0 = ti.Vector.field(3, dtype=ti.f64, shape=n_verts)
F_p0 = ti.Vector.field(3, dtype=ti.f64, shape=n_verts)

# ndarray version of F_b
F_b_ndarr = ti.ndarray(dtype=ti.f64, shape=3 * n_verts)
# stiffness matrix
A_builder = ti.linalg.SparseMatrixBuilder(3 * n_verts,
                                          3 * n_verts,
                                          max_num_triplets=5000000)
solver = ti.linalg.SparseSolver(ti.f64, "LLT")


@ti.kernel
def get_b():
    for i in F_b:
        F_b[i] = F_m[i] * F_v[i] + dt * F_f[i]


def cg():
    def mul(x):
        matmul_cell(F_mul_ans, x)
        return F_mul_ans

    get_force()
    get_b()
    mul(F_v)
    add(F_r0, F_b, -1, mul(F_v))

    d = F_p0
    d.copy_from(F_r0)
    r_2 = dot(F_r0, F_r0)
    n_iter = 50
    epsilon = 1e-6
    r_2_init = r_2
    r_2_new = r_2
    for _ in range(n_iter):
        q = mul(d)
        alpha = r_2_new / dot(d, q)
        add(F_v, F_v, alpha, d)
        add(F_r0, F_r0, -alpha, q)
        r_2 = r_2_new
        r_2_new = dot(F_r0, F_r0)
        if r_2_new <= r_2_init * epsilon**2:
            break
        beta = r_2_new / r_2
        add(d, F_r0, beta, d)
    F_f.fill(0)
    add(F_x, F_x, dt, F_v)


@ti.kernel
def compute_A(A: ti.types.sparse_matrix_builder()):
    # A = M - dt * dt * K
    for i in range(n_verts):
        for j in range(3):
            A[3 * i + j, 3 * i + j] += F_m[i]
    for c in F_vertices:
        verts = F_vertices[c]
        W_c = F_W[c]
        B_c = F_B[c]
        for u in range(4):
            for d in range(3):
                dD = ti.Matrix.zero(ti.f64, 3, 3)
                if u == 3:
                    for j in range(3):
                        dD[d, j] = -1
                else:
                    dD[d, u] = 1
                dF = dD @ B_c
                dP = 2.0 * mu * dF
                dH = -W_c * dP @ B_c.transpose()
                for i in range(3):
                    for j in range(3):
                        A[3 * verts[u] + d,
                          3 * verts[i] + j] += -dt**2 * dH[j, i]
                for i in range(3):
                    A[3 * verts[u] + d, 3 * verts[3] +
                      i] += -dt**2 * (-dH[i, 0] - dH[i, 1] - dH[i, 2])


@ti.kernel
def flatten(dest: ti.types.ndarray(), src: ti.template()):
    for i in range(n_verts):
        for j in range(3):
            dest[3 * i + j] = src[i][j]


@ti.kernel
def aggragate(dest: ti.template(), src: ti.types.ndarray()):
    for i in range(n_verts):
        for j in range(3):
            dest[i][j] = src[3 * i + j]


def direct(): 
    get_force() 
    get_b()     
    flatten(F_b_ndarr, F_b) 
    v = solver.solve(F_b_ndarr) 
    aggragate(F_v, v) 
    F_f.fill(0) 
    add(F_x, F_x, dt, F_v) 

@ti.kernel
def advect():
    for p in F_x:
        F_v[p] += dt * (F_f[p] / F_m[p])
        F_x[p] += dt * F_v[p]
        F_f[p] = ti.Vector([0, 0, 0])


@ti.kernel
def init():
    for u in F_x:
        F_x[u] = F_ox[u]
        F_v[u] = [0.0] * 3
        F_f[u] = [0.0] * 3
        F_m[u] = 0.0
    for c in F_vertices:
        F = Ds(F_vertices[c])
        F_B[c] = F.inverse() 
        F_W[c] = ti.abs(F.determinant()) / 6       
        for i in range(4):
            F_m[F_vertices[c][i]] += F_W[c] / 4 * density 
    for u in F_x:
        F_x[u].y += 1.0
    for u in F_x:
        F_x1[u] = F_x[u]
        F_v1[u] = F_v[u]



def init_A():
    compute_A(A_builder)
    A = A_builder.build()
    solver.analyze_pattern(A)
    solver.factorize(A)


@ti.kernel
def floor_bound():
    for u in F_x:
        if F_x[u].y < 0:
            F_x[u].y = 0
            if F_v[u].y < 0:
                F_v[u].y = 0


@ti.func
def check(u):
    ans = 0
    rest = u
    for i in ti.static(range(3)):
        k = rest % n_cube[2 - i]
        rest = rest // n_cube[2 - i]
        if k == 0:
            ans |= (1 << (i * 2))
        if k == n_cube[2 - i] - 1:
            ans |= (1 << (i * 2 + 1))
    return ans


def gen_indices():
    su = 0
    for i in range(3):
        su += (n_cube[i] - 1) * (n_cube[(i + 1) % 3] - 1)
    return ti.field(ti.i32, shape=2 * su * 2 * 3)


indices = gen_indices()


@ti.kernel
def get_indices():
    # calculate all the meshes on surface
    cnt = 0
    for c in F_vertices:
        if c % 5 != 4:
            for i in ti.static([0, 2, 3]):
                verts = [F_vertices[c][(i + j) % 4] for j in range(3)]
                sum_ = check(verts[0]) & check(verts[1]) & check(verts[2])
                if sum_:
                    m = ti.atomic_add(cnt, 1)
                    det = ti.Matrix.rows([
                        F_x[verts[i]] - [0.5, 1.5, 0.5] for i in range(3)
                    ]).determinant()
                    if det < 0:
                        tmp = verts[1]
                        verts[1] = verts[2]
                        verts[2] = tmp
                    indices[m * 3] = verts[0]
                    indices[m * 3 + 1] = verts[1]
                    indices[m * 3 + 2] = verts[2]


def plot_view():
    global simuStep,subStep
    plotYx[simuStep] = F_x[0].x
    plotYy[simuStep] = F_x[0].y
    plotYz[simuStep] = F_x[0].z
    simuStep += 1
    if(simuStep%100 == 0):
        print(simuStep)
    if (simuStep == subStep):
        plt.figure(num = 1)
        plt.plot(plotX,plotYx)
        plt.figure(num = 2)
        plt.plot(plotX,plotYy)
        plt.figure(num = 3)
        plt.plot(plotX,plotYz)
        plt.show()    

solve_step = 0

def substep():
    global solve_step
    if args.exp == 'explicit':
        for i in range(10):
            if args.sol != 'RK4':
                get_force()
                if args.sol == 'NewtonEuler':
                    advect()
            else:
                advect_RK4()
    else:
        if use_sparse:
            direct()
        elif args.sol == 'BDF2':
            if solve_step == 0:
                advect_BDF1()
            else:
                advect_BDF2()
            solve_step += 1
        elif args.sol == 'BDF1':
            advect_BDF1()
        else:
            cg()
    floor_bound()
    turn_f32()
    plot_view()


@ti.kernel
def turn_f32():
  for X in F_x:
        F_x_s[X] = ti.cast(F_x[X],ti.f32)


def main():
    get_vertices()
    init()
    get_indices()

    if use_sparse:
        init_A()

    if args.gui == 'ggui':
        res = (800, 600)
        window = ti.ui.Window("Implicit FEM", res, vsync=True)

        frame_id = 0
        canvas = window.get_canvas()
        scene = ti.ui.Scene()
        camera = ti.ui.Camera()
        camera.position(2.0, 2.0, 3.95)
        camera.lookat(0.5, 0.5, 0.5)
        camera.fov(55)

        def render():
            camera.track_user_inputs(window,
                                     movement_speed=0.03,
                                     hold_key=ti.ui.RMB)
            scene.set_camera(camera)

            scene.ambient_light((0.1, ) * 3)

            scene.point_light(pos=(-1.0, 0, -1.0), color=(1, 1, 1))
            scene.point_light(pos=(10.0, 10.0, 10.0), color=(2, 2, 2))

            scene.mesh(F_x_s, indices, color=(0.73, 0.33, 0.23))

            canvas.scene(scene)

        while window.running:
            frame_id += 1
            frame_id = frame_id % 256
            if args.exp == 'explicit':
                for i in range(10):
                    substep()
            else:
                substep()
            if window.is_pressed('r'):
                init()
            if window.is_pressed(ti.GUI.ESCAPE):
                break

            render()

            window.show()

    else:

        def T(a):

            phi, theta = np.radians(28), np.radians(32)

            a = a - 0.2
            x, y, z = a[:, 0], a[:, 1], a[:, 2]
            c, s = np.cos(phi), np.sin(phi)
            C, S = np.cos(theta), np.sin(theta)
            x, z = x * c + z * s, z * c - x * s
            u, v = x, y * C + z * S
            return np.array([u, v]).swapaxes(0, 1) + 0.5

        gui = ti.GUI('Implicit FEM')
        while gui.running:
            substep()
            if gui.get_event(ti.GUI.PRESS):
                if gui.event.key in [ti.GUI.ESCAPE, ti.GUI.EXIT]:
                    break
            if gui.is_pressed('r'):
                init()
            gui.clear(0x000000)
            gui.circles(T(F_x.to_numpy() / 3), radius=1.5, color=0xba543a)
            gui.show()


if __name__ == '__main__':
    main()

感谢提交整理问题!我们还在尝试复现调查这个问题。另外能够提供你运行代码出问题的具体 Taichi 版本和硬件平台信息吗?谢谢!

软件版本 version 1.5.0, llvm 15.0.1, commit 7b885c28, win, python 3.8.10
硬件 i7-10750H 16G GTX1650ti 4G
感谢

1 个赞

补充几个测试结果:

  1. CPU 后端返回的 solution 结果都是 0, 而 CUDA 后端则会出现 nan 和 0 混合,应该都是分解失败了
  2. Taichi 的 master branch 源码编译版本,1.5.0 版本,1.4.0 版本都可以复现这个问题
    具体原因还在调查中。。

检查了一下出现问题的原因是 SparseMatrixBuilder 没有指定数据精度,需要在原程序的这里修改一下:

A_builder = ti.linalg.SparseMatrixBuilder(3 * n_verts,
                                          3 * n_verts,
                                          dtype=ti.f64,  # Specify data type explicitly
                                          max_num_triplets=5000000)

然后在 CPU 后端应该就可以运行了。CUDA 后端的确还没有支持 64 位的 SparseMatrix,所以暂时是无法修改成 64 位精度计算的。
一个小建议是在程序里可以把数据精度赋给一个变量,这样修改起来会容易很多。比如这样:

real = ti.f64  # Or ti.f32
Fx = ti.field(dtype=real, shape=n)
Fy = ti.field(dtype=real, shape=n)
...

非常感谢,程序运行有效