求助,有限差分法,为什么不能并行呢

import taichi as ti
import numpy as np
import time
#ti.init(arch=ti.cuda, debug=True)

使用 NVIDIA GPU

@ti.data_oriented
class Polycrystal():
def init(self) :
self.Nx = 640
self.Ny = 640
self.dx = 1.0
self.dy = 1.0
self.nstep = 10000
self.nprint = 50
self.dtime = 1.0e-2
self.ttime = 0.0
self.noise = 0.02
self.c0 = 0.40
self.mobility = 1.0
self.grad_coef = 0.5
self.iflag = 1
self.con = ti.field(dtype=ti.f64, shape=(self.Nx, self.Ny))
self.lap_con = ti.field(dtype=ti.f64, shape=(self.Nx, self.Ny))
self.dummy = ti.field(dtype=ti.f64, shape=(self.Nx, self.Ny))
self.lap_dummy = ti.field(dtype=ti.f64, shape=(self.Nx, self.Ny))
self.total_energy = ti.field(dtype=ti.f64, shape=())
self.start_time = time.time()
# 字段定义
def micro_ch_pre(self):
if self.iflag == 1:
# 使用 NumPy 生成带有噪声的初始值
con_np = self.c0 + self.noise * (0.5 - np.random.rand(self.Nx, self.Ny))
self.con.from_numpy(con_np) # 将 NumPy 数组复制到 Taichi 字段
else:
# 另一种初始化方式(如果需要)
con_np = np.zeros((self.Nx, self.Ny), dtype=np.float64)
self.con.from_numpy(con_np)

# 调用函数进行初始化
# #计算拉普拉斯算子
@ti.kernel
def compute_laplacian(self):
    for i, j in self.con:
        ip = i + 1 if i < self.Nx - 1 else 0
        im = i - 1 if i > 0 else self.Nx - 1
        jp = j + 1 if j < self.Ny - 1 else 0
        jm = j - 1 if j > 0 else self.Ny - 1

        self.lap_con[i, j] = (self.con[ip, j] + self.con[im, j] + self.con[i, jp] + self.con[i, jm] - 4.0 * self.con[i, j]) / (self.dx * self.dy)
@ti.kernel
def compute_dummy_and_lap_dummy(self):
    for i, j in self.con:
        # 相邻索引的计算
        ip = i + 1 if i < self.Nx - 1 else 0
        im = i - 1 if i > 0 else self.Nx - 1
        jp = j + 1 if j < self.Ny - 1 else 0
        jm = j - 1 if j > 0 else self.Ny - 1

        # 计算 dummy
        dfdcon = self.free_energ_ch_v1(i, j)
        self.dummy[i, j] = dfdcon - self.grad_coef * self.lap_con[i, j]

        # 计算 lap_dummy
        hne = self.dummy[ip, j]
        hnw = self.dummy[im, j]
        hns = self.dummy[i, jm]
        hnn = self.dummy[i, jp]
        self.lap_dummy[i, j] = (hnw + hne + hns + hnn - 4.0 * self.dummy[i, j]) / (self.dx * self.dy)
@ti.kernel
def update_con(self):
    for i, j in self.con:
        self.con[i, j] += self.dtime * self.mobility * self.lap_dummy[i, j]
        self.con[i, j] = min(max(self.con[i, j], 0.00001), 0.99999)

@ti.func
def compute_energy_at_point(self, i, j):
    energy = 0.0
    if i < self.Nx - 1 and j < self.Ny - 1:
        ip, jp = i + 1, j + 1
        term1 = self.con[i, j]**2 * (1.0 - self.con[i, j])**2
        term2 = 0.5 * self.grad_coef * ((self.con[ip, j] - self.con[i, j])**2 + (self.con[i, jp] - self.con[i, j])**2)
        energy = term1 + term2
    return energy

@ti.kernel
def calculate_energ(self):
    for i, j in self.con:
        self.total_energy[None] += self.compute_energy_at_point(i, j)

@ti.func
def free_energ_ch_v1(self, i, j):
    A = 1.0
    dfdcon = A * (2.0 * self.con[i, j] * (1 - self.con[i, j])**2 - 2.0 * self.con[i, j]**2 * (1.0 - self.con[i, j]))
    return dfdcon
# 使用示例
#energy = calculate_energ(con, grad_coef)
#print(f"Total Energy: {energy}")

def write_vtk_grid_values(self, istep):
    fname = f'time_{istep}.vtk'
    nz = 1
    npoin = self.Nx * self.Ny * nz
    data1_np = self.con.to_numpy()
    with open(fname, 'w') as out:
        out.write('# vtk DataFile Version 2.0\n')
        out.write('time_10.vtk\n')
        out.write('ASCII\n')
        out.write('DATASET STRUCTURED_GRID\n')
        out.write(f'DIMENSIONS {self.Nx} {self.Ny} {nz}\n')
        out.write(f'POINTS {npoin} float\n')
        for i in range(self.Nx):
            for j in range(self.Ny):
                x = (i - 1) * self.dx
                y = (j - 1) * self.dy
                z = 0.0
                out.write(f'{x:.6e} {y:.6e} {z:.6e}\n')
        out.write(f'POINT_DATA {npoin}\n')
        out.write('SCALARS CON float 1\n')
        out.write('LOOKUP_TABLE default\n')
        for i in range(self.Nx):
            for j in range(self.Ny):
                out.write(f'{data1_np[i, j]:.6e}\n')

def write_results(self, istep):
    if istep % self.nprint == 0 or istep == 1:
        fname = f"time_{istep}.out"
        con_np = self.con.to_numpy()
        with open(fname, 'w') as f:
            for i in range(self.Nx):
                for j in range(self.Ny):
                    f.write(f"{i:5d} {j:5d} {con_np[i, j]:14.6e}\n")

if name == “main”:
ti.init(arch=ti.cuda, default_fp=ti.f64)
polycrys = Polycrystal()
polycrys.micro_ch_pre()
for istep in range(polycrys.nstep):
polycrys.compute_laplacian()
polycrys.compute_dummy_and_lap_dummy()
polycrys.update_con()
polycrys.ttime += polycrys.dtime
if istep % polycrys.nprint == 0 or istep == 1:
polycrys.total_energy[None] = 0.0
print(f"Current simulation time at step {istep}: {polycrys.ttime} seconds")
polycrys.write_results(istep)
polycrys.calculate_energ()
with open(‘time_energ.out’, ‘a’) as f:
f.write(f"{polycrys.ttime:14.6e} {polycrys.total_energy[None]:14.6e}\n")
polycrys.write_vtk_grid_values(istep)
end_time = time.time()
compute_time = end_time - polycrys.start_time
print(f"Compute Time: {compute_time} seconds")
相场法书中第一个案例,难受,cpu和cuda都试了,github试了试别的大佬的taichi都可以并行