cpu与gpu结果不一致：cpu下图像每隔512行，有一行坏线；gpu每隔20多行，有一行坏线

nolca · 2024 年11 月 17 日 11:32

在做 dithering（仿色抖动）练习，使用 Floyd–Steinberg 算法。
请各位师傅指点一下：代码该怎么改，才能消除这些坏线？

复现

python原生cpu，计算正确

运行 time ./cpu_no_taichi.py，耗时50s。

cpu_no_taichi.py

#!/bin/env python
# Selects the code path: False = plain Python/NumPy, True = Taichi.
TAICHI = False
import cv2
import numpy as np

def static(x):
    """Return ``x`` unchanged; placeholder used when ``taichi.static`` is not imported."""
    return x
if TAICHI:
    # The import below rebinds ``static`` to taichi.static, replacing the placeholder above.
    import taichi as ti
    from taichi import static
    ti.init(arch=ti.cpu,debug=False,offline_cache=False)

DEBUG=True  # when true, the converted palette is printed at import time
MAX=255  # upper bound used when clamping channel values
np.set_printoptions(threshold=np.inf, linewidth=180)  # numpy print options
img_from='./input.jpg'  # optional input image path (see the commented-out cv2.imread call)

def show_image(img):
    """Show an image in an OpenCV window and block until ESC is pressed.

    ``img`` may be a file path (loaded with ``cv2.imread``) or an array.
    Arrays whose dtype is not ``uint8`` are clipped to ``[0, MAX]`` and cast
    to ``uint8`` before being displayed.
    """
    esc_key = 27  # ASCII code of the ESC key

    if isinstance(img, str):
        img = cv2.imread(img)
    elif img.dtype != 'uint8':
        img = np.clip(img, 0, MAX).astype(np.uint8)

    cv2.imshow('Image', img)
    # Poll the keyboard until ESC arrives, then tear the window down.
    while (cv2.waitKey(1) & 0xFF) != esc_key:
        pass
    cv2.destroyAllWindows()


# 16-colour palette written as 0xRRGGBB literals.
PALETTE = [
    0x000000, 0x0000AA, 0x00AA00, 0x00AAAA,
    0xAA0000, 0xAA00AA, 0xAA5500, 0xAAAAAA,
    0x555555, 0x5555FF, 0x55FF55, 0x55FFFF,
    0xFF5555, 0xFF55FF, 0xFFFF55, 0xFFFFFF,
]
# Split each literal into a (blue, green, red) byte tuple, i.e. BGR order.
PALETTE = [
    (rgb & 0xFF, (rgb >> 8) & 0xFF, (rgb >> 16) & 0xFF)
    for rgb in PALETTE
]
if DEBUG:
    print('PALETTE自定义调色板', PALETTE)

# Annotation types for kernel arguments: Taichi ndarray types when Taichi is
# enabled, plain ``object`` otherwise.
if TAICHI:
    type_img2d = ti.types.ndarray(dtype=ti.math.vec3, ndim=2)
    type_bayerM = ti.types.ndarray(element_dim=0, ndim=2)
else:
    type_img2d = type_bayerM = object

# @ti.func
def clamp(x, min, max):
    """Limit ``x`` to the closed interval ``[min, max]``.

    Delegates to ``np.clip``, so scalars and arrays are both accepted.
    Taichi variant of the body: ``return ti.math.clamp(x, min, max)``.
    """
    lower, upper = min, max
    clipped = np.clip(x, lower, upper)
    return clipped

# @ti.func
def get_closest_bgr(old_pixel):
    """Return the PALETTE entry closest to ``old_pixel``.

    Closeness is the squared Euclidean distance over the three channels.
    On ties the earliest palette entry wins, because only a strictly smaller
    distance replaces the current best. ``(0, 0, 0)`` is returned if no entry
    beats the initial sentinel distance.
    (Taichi variant of the initial value: ``ti.Vector([0,0,0],dt=ti.i32)``.)
    """
    best = (0, 0, 0)
    best_distance = 0xFFFFFFF  # sentinel larger than any distance between 8-bit colours

    ch0, ch1, ch2 = old_pixel
    for candidate in static(PALETTE):
        p0, p1, p2 = candidate
        candidate_distance = (ch0 - p0) ** 2 + (ch1 - p1) ** 2 + (ch2 - p2) ** 2
        if candidate_distance < best_distance:
            best_distance, best = candidate_distance, candidate
    return best

# @ti.kernel
def dither_floyd(img:type_img2d):
    """Dither ``img`` in place with Floyd–Steinberg error diffusion.

    Pixels are visited row by row, left to right. Each pixel is clamped to
    ``[0, MAX]``, replaced by its nearest palette colour, and the quantisation
    error is pushed to the not-yet-visited neighbours with the weights
    7/16 (right), 3/16 (below-left), 5/16 (below) and 1/16 (below-right).

    Args:
        img: 2-D image indexed as ``img[row, col]`` with three channels per
            pixel. It must be able to hold values outside ``[0, MAX]``,
            because diffused error is accumulated into it.

    Side effects:
        Mutates ``img``; prints a per-row progress line and, for sample
        pixels on the ``PICK`` grid, diagnostic output.
    """
    PICK = 512
    h = img.shape[0]
    w = img.shape[1]
    for i in range(h):
        print(f'{i}/{h}  \t{i/h*100:.2f}%',end='\r')
        for j in range(w):
            # Diagnostics are only emitted for pixels on the PICK x PICK grid.
            sampled = i%PICK==0 and j%PICK==0

            # Report a sampled pixel whose accumulated value went negative.
            flag=False
            if sampled:
                for k in range(3):
                    if img[i, j][k] < 0:
                        print(f'🔍 img[{i},{j}]→{img[i, j]}',end='→')
                        flag=True
                        break

            # clamp() returns a fresh value, so writing img[i, j] below does
            # not alter oldpixel.
            oldpixel = clamp(img[i, j],0,MAX)
            newpixel = clamp(get_closest_bgr(oldpixel),0,MAX)
            quant_error = oldpixel - newpixel
            img[i, j] = newpixel
            if flag:
                print(f'newpixel={newpixel}\tquant_error={quant_error}')

            # Diffuse the error to neighbours that are still inside the image.
            if j+1<w:
                img[i, j + 1] += quant_error * 7 // 16
            if i+1<h:
                if j-1>=0:
                    img[i + 1, j - 1] += quant_error * 3 // 16
                img[i + 1, j] += quant_error * 5 // 16
                if j+1<w:
                    img[i + 1, j + 1] += quant_error // 16
    # The sample pixel only exists when both dimensions exceed PICK; without
    # this guard a smaller image raised IndexError after all work was done.
    if PICK < h and PICK < w:
        print(f'🔍 img[{PICK},{PICK}]→{img[PICK, PICK]}')

def gen_bgr_256(value=MAX,show_raw=False):
    """Build a 256x256 test image and return it in BGR.

    Pixel ``[row, col]`` is first filled with ``(col, row, value)``; the array
    is then passed through ``cv2.COLOR_HLS2BGR``. When ``show_raw`` is true
    the converted image is displayed before being returned.
    """
    side = 256
    canvas = np.zeros((side, side, 3), dtype=np.uint8)
    for row, col in np.ndindex(side, side):
        canvas[row, col] = (col, row, value)
    converted = cv2.cvtColor(canvas, cv2.COLOR_HLS2BGR)
    if show_raw:
        show_image(converted)
    return converted

# Driver: build the test image, upscale it, dither it in place and display it.
n = 3  # integer upscaling factor
# img = cv2.imread(img_from)
img = gen_bgr_256()
img = cv2.resize(img, (img.shape[1] * n, img.shape[0] * n), interpolation=cv2.INTER_NEAREST) # enlarge n times
img = img.astype(np.int32)  # convert to int32; dither_floyd accumulates error into the array
dither_floyd(img)
show_image(img)

taichi_cpu，图像第512行，有一条横的坏线，耗时1.6s

cpu_taichi.py

#!/bin/env python
# Selects the code path: False = plain Python/NumPy, True = Taichi.
TAICHI = True
import cv2
import numpy as np

def static(x):
    """Return ``x`` unchanged; placeholder used when ``taichi.static`` is not imported."""
    return x
if TAICHI:
    # The import below rebinds ``static`` to taichi.static, replacing the placeholder above.
    import taichi as ti
    from taichi import static
    ti.init(arch=ti.cpu,debug=False,offline_cache=False)

DEBUG=True  # when true, the converted palette is printed at import time
MAX=255  # upper bound used when clamping channel values
np.set_printoptions(threshold=np.inf, linewidth=180)  # numpy print options
img_from='./input.jpg'  # optional input image path (see the commented-out cv2.imread call)

def show_image(img):
    """Show an image in an OpenCV window and block until ESC is pressed.

    ``img`` may be a file path (loaded with ``cv2.imread``) or an array.
    Arrays whose dtype is not ``uint8`` are clipped to ``[0, MAX]`` and cast
    to ``uint8`` before being displayed.
    """
    esc_key = 27  # ASCII code of the ESC key

    if isinstance(img, str):
        img = cv2.imread(img)
    elif img.dtype != 'uint8':
        img = np.clip(img, 0, MAX).astype(np.uint8)

    cv2.imshow('Image', img)
    # Poll the keyboard until ESC arrives, then tear the window down.
    while (cv2.waitKey(1) & 0xFF) != esc_key:
        pass
    cv2.destroyAllWindows()


# 16-colour palette written as 0xRRGGBB literals.
PALETTE = [
    0x000000, 0x0000AA, 0x00AA00, 0x00AAAA,
    0xAA0000, 0xAA00AA, 0xAA5500, 0xAAAAAA,
    0x555555, 0x5555FF, 0x55FF55, 0x55FFFF,
    0xFF5555, 0xFF55FF, 0xFFFF55, 0xFFFFFF,
]
# Split each literal into a (blue, green, red) byte tuple, i.e. BGR order.
PALETTE = [
    (rgb & 0xFF, (rgb >> 8) & 0xFF, (rgb >> 16) & 0xFF)
    for rgb in PALETTE
]
if DEBUG:
    print('PALETTE自定义调色板', PALETTE)

# Annotation types for kernel arguments: Taichi ndarray types when Taichi is
# enabled, plain ``object`` otherwise.
if TAICHI:
    type_img2d = ti.types.ndarray(dtype=ti.math.vec3, ndim=2)
    type_bayerM = ti.types.ndarray(element_dim=0, ndim=2)
else:
    type_img2d = type_bayerM = object

# Limit x to the closed interval [min, max] inside Taichi scope.
# The former trailing `return np.clip(...)` could never execute (it followed
# an unconditional return) and np.clip is a NumPy call, not a Taichi-scope
# function, so it has been removed.
@ti.func
def clamp(x, min, max):
    return ti.math.clamp(x, min, max)

# Return the PALETTE entry closest to old_pixel, measured by squared Euclidean
# distance over the three channels. On ties the earliest palette entry wins,
# because only a strictly smaller distance replaces the current best.
@ti.func
def get_closest_bgr(old_pixel):
    new_pixel = ti.Vector([0,0,0],dt=ti.i32)    # taichi
    # new_pixel = (0,0,0)                         # cpu
    # Sentinel larger than any distance between two 8-bit colours.
    min_distance = 0xFFFFFFF

    # The locals are named r, g, b, but PALETTE tuples are stored in BGR order;
    # channels are paired by position, so the distance is unaffected.
    r, g, b = old_pixel
    # `static` is taichi.static here; presumably the loop over the Python-side
    # palette is unrolled at compile time -- confirm against the Taichi docs.
    for c in static(PALETTE):
        cr, cg, cb = c
        distance = (r - cr) ** 2 + (g - cg) ** 2 + (b - cb) ** 2
        if distance < min_distance:
            min_distance = distance
            new_pixel = c
    return new_pixel

# Floyd-Steinberg error diffusion, performed in place on `img`.
#
# Pixels are visited row by row, left to right. Each pixel is clamped to
# [0, MAX], replaced by its nearest palette colour, and the quantisation error
# is pushed to the not-yet-visited neighbours with the weights 7/16 (right),
# 3/16 (below-left), 5/16 (below) and 1/16 (below-right).
#
# img: 2-D ndarray indexed as img[row, col] with a 3-component vector per
#      pixel; it is modified in place.
# Side effects: prints a per-row progress line and, for sample pixels on the
#      PICK grid, diagnostic output.
@ti.kernel
def dither_floyd(img:type_img2d):
    PICK = 512
    h = img.shape[0]
    w = img.shape[1]
    # Taichi parallelizes the outermost for-loop of a kernel by default.
    # Error diffusion is inherently sequential: every pixel depends on error
    # written by its left neighbour and by the row above. Running rows
    # concurrently lets a row be quantised before (or while) the previous row
    # diffuses into it, which matches the reported bad lines (at fixed
    # intervals on CPU, irregular on GPU). Force the row loop to run serially.
    # NOTE: on a GPU backend this runs the whole image on a single thread.
    ti.loop_config(serialize=True)
    for i in range(h):
        print(f'{i}/{h}  \t{i/h*100:.2f}%',end='\r')
        for j in range(w):

            # Report a sampled pixel whose accumulated value went negative.
            flag=False
            for k in range(3):
                if img[i, j][k] < 0 and i%PICK==0 and j%PICK==0:
                    print(f'🔍 img[{i},{j}]→{img[i, j]}',end='→')
                    flag=True
                    break

            oldpixel = img[i, j]
            oldpixel = clamp(img[i, j],0,MAX)
            newpixel = clamp(get_closest_bgr(oldpixel),0,MAX)
            quant_error = oldpixel - newpixel
            img[i, j] = newpixel
            if flag==True and i%PICK==0 and j%PICK==0:
                print(f'newpixel={newpixel}\tquant_error={quant_error}')

            # Diffuse the error to neighbours that are still inside the image.
            if j+1<w:
                img[i, j + 1] += quant_error * 7 // 16
            if i+1<h:
                if j-1>=0:
                    img[i + 1, j - 1] += quant_error * 3 // 16
                img[i + 1, j] += quant_error * 5 // 16
                if j+1<w:
                    img[i + 1, j + 1] += quant_error // 16
    # The sample pixel only exists when both dimensions exceed PICK; reading
    # it on a smaller image would be an out-of-bounds access.
    if PICK < h and PICK < w:
        print(f'🔍 img[{PICK},{PICK}]→{img[PICK, PICK]}')

def gen_bgr_256(value=MAX,show_raw=False):
    """Build a 256x256 test image and return it in BGR.

    Pixel ``[row, col]`` is first filled with ``(col, row, value)``; the array
    is then passed through ``cv2.COLOR_HLS2BGR``. When ``show_raw`` is true
    the converted image is displayed before being returned.
    """
    side = 256
    canvas = np.zeros((side, side, 3), dtype=np.uint8)
    for row, col in np.ndindex(side, side):
        canvas[row, col] = (col, row, value)
    converted = cv2.cvtColor(canvas, cv2.COLOR_HLS2BGR)
    if show_raw:
        show_image(converted)
    return converted

# Driver: build the test image, upscale it, dither it in place and display it.
n = 3  # integer upscaling factor
# img = cv2.imread(img_from)
img = gen_bgr_256()
img = cv2.resize(img, (img.shape[1] * n, img.shape[0] * n), interpolation=cv2.INTER_NEAREST) # enlarge n times
# NOTE(review): the kernel argument is annotated with dtype=ti.math.vec3 while
# this array is int32 -- confirm the intended element type.
img = img.astype(np.int32)  # convert to int32
dither_floyd(img)
show_image(img)

taichi_gpu，不止一条坏线，且间隔随机

将 cpu_taichi.py 第11行ti.cpu改为ti.gpu，耗时1.5s

cpu、gpu不一致

论坛参考：

nolca · 2024 年11 月 17 日 11:34

python原生CPU：

nolca · 2024 年11 月 17 日 11:35

gpu+taichi：

nolca · 2024 年11 月 17 日 11:43

可以指定 img_from
在大于512行的图像，发现cpu_taichi.py，每512行会固定有一行坏线。
而gpu，坏线的出现比较随机。

贴一下shell打印结果：

❯ time ./dt.py
[Taichi] version 1.7.2, llvm 15.0.4, commit 0131dce9, linux, python 3.12.2
[Taichi] Starting on arch=x64
PALETTE自定义调色板 [(0, 0, 0), (170, 0, 0), (0, 170, 0), (170, 170, 0), (0, 0, 170), (170, 0, 170), (0, 85, 170), (170, 170, 170), (85, 85, 85), (255, 85, 85), (85, 255, 85), (255, 255, 85), (85, 85, 255), (255, 85, 255), (85, 255, 255), (255, 255, 255)]
/home/n/miniconda3/lib/python3.12/site-packages/taichi/lang/ast/ast_transformer.py:393: DeprecationWarning: Attribute s is deprecated and will be removed in Python 3.14; use value instead
  format_str = values[0].s
🔍 img[512,512]→[96, 61, 255]
./dt.py  0.54s user 0.15s system 65% cpu 1.051 total

nolca · 2024 年11 月 17 日 11:47

目前暂时将计就计，每512行再重新计算一次像素的BGR值：

# Workaround: re-quantise every pixel of each 512th row to its nearest palette colour.
# NOTE(review): this unpacks two values, whereas the get_closest_bgr shown
# earlier in the post returns a single pixel -- presumably a modified version
# is in use; verify.
for i in range(h):
    if i%512==0:
        for j in range(w):
            img[i, j],index = get_closest_bgr(img[i, j])

blues · 2025 年4 月 3 日 10:03

我也遇到了 GPU 和 CPU 计算结果不一致的问题：GPU 计算会出现 NaN，CPU 计算没有 NaN，但结果也不太对。是要多加点 for 循环来增加精度吗？