CUDA 共享内存问题（以及将 CUDA 与 python/ctypes 一起使用）

2023-12-27

不知为何，在下面的代码中，当我修改 d_updated_water_flow_map 时，d_terrain_height_map 也会被修改，反之亦然。

更改两个数组的分配顺序可以解决问题，但我认为这只是掩盖了问题的根本原因。

cudaCheck(cudaMalloc((void **)&d_water_flow_map, SIZE * 4)); 
cudaCheck(cudaMalloc((void **)&d_updated_water_flow_map, SIZE * 4)); // changing this array also changes d_terrain_height_map
cudaCheck(cudaMalloc((void **)&d_terrain_height_map, SIZE));

我将内核编译成 DLL，并在 Blender 3D 的 Python 解释器中通过下面的 Python 文件调用它。所有值都是 32 位浮点数。

cu_include.h

#pragma once  

#ifdef MATHLIBRARY_EXPORTS  
#define MATHLIBRARY_API __declspec(dllexport)   
#else  
#define MATHLIBRARY_API __declspec(dllimport)   
#endif  


// Exported entry points of the DLL. Each one is declared with C linkage
// (extern "C") so the symbol name is not C++-mangled.
// NOTE(review): these declarations spell out __declspec(dllexport) directly;
// the MATHLIBRARY_API macro defined above is not used by any of them.

// Takes three float buffers (terrain, water, sediment) plus the grid
// dimensions.
extern "C" __declspec(dllexport)
void init(float *t_height_map,
float *w_height_map,
float *s_height_map,
int SIZE_X,
int SIZE_Y);

// Takes an iteration count, a set of float simulation parameters and the grid
// dimensions (SIZE_X, SIZE_Y).
extern "C" __declspec(dllexport)
void run_hydro_erosion(int cycles,
float t_step,
float min_tilt_angle,
float SEDIMENT_CAP,
float DISSOLVE_CONST,
float DEPOSIT_CONST,
int SIZE_X,
int SIZE_Y,
float PIPE_LENGTH,
float ADJACENT_LENGTH,
float TIME_STEP,
float MIN_TILT_ANGLE);

// NOTE(review): only the declaration is visible here; presumably releases
// resources held by the library -- confirm against the implementation.
extern "C" __declspec(dllexport)
void free_mem();

// NOTE(review): only the declaration is visible here; behaviour is not
// documented because no implementation is shown -- confirm before relying on it.
extern "C" __declspec(dllexport)
void procedural_rain(float *water_height_map, float *rain_map, int SIZE_X, int SIZE_Y);

erosion_kernel.dll

#include "cu_include.h"

// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <time.h>
#include <iostream>
#include <algorithm>
#include <random>

// includes CUDA
#include <cuda_runtime.h>

using namespace std;

#define FLOW_RIGHT 0
#define FLOW_UP 1
#define FLOW_LEFT 2
#define FLOW_DOWN 3
#define X_VEL 0
#define Y_VEL 1
#define LEFT_CELL row, col - 1
#define RIGHT_CELL row, col + 1
#define ABOVE_CELL row - 1, col
#define BELOW_CELL row + 1, col

// CUDA API error checking macro
#define T 1024
#define M 1536
#define blockSize 1024
// Terminates the process with a diagnostic when a CUDA runtime call fails.
//
// The argument is evaluated exactly once and its status is stored in a local,
// so a failing call is never executed a second time while the message is
// being built. The do { ... } while (0) wrapper makes the macro expand to a
// single statement, which keeps it safe inside unbraced if/else bodies.
#define cudaCheck(error) \
  do { \
    cudaError_t cuda_check_status_ = (error); \
    if (cuda_check_status_ != cudaSuccess) { \
      printf("Fatal error: %s at %s:%d\n", \
        cudaGetErrorString(cuda_check_status_), \
        __FILE__, __LINE__); \
      exit(1); \
    } \
  } while (0)


// Zeroes the four flow components (FLOW_RIGHT, FLOW_UP, FLOW_LEFT, FLOW_DOWN)
// of one grid cell in d_updated_water_flow_map.
//
// Each thread derives a flat cell index from its 1D block/thread position,
// splits it into (row, col) using SIZE_X, and then addresses the flow map as
// four consecutive floats per cell.
//
// water_height_map and water_flow_map are accepted but not read in this body.
//
// NOTE: no bounds check is performed. Every launched thread performs the four
// stores, including threads whose flat index is >= SIZE_X * SIZE_Y; SIZE_Y is
// never consulted here.
__global__ void update_water_flow(float *water_height_map, float *water_flow_map, float *d_updated_water_flow_map, int SIZE_X, int SIZE_Y)
{
    // Flat cell index of this thread within the 1D launch.
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int col = index % SIZE_X;
    int row = index / SIZE_X; 

    // Re-purpose `index` as the offset of this cell's first flow component:
    // 4 floats per cell, SIZE_X cells per row.
    index = row * (SIZE_X * 4) + col * 4;   // 3D index
    d_updated_water_flow_map[index + FLOW_RIGHT] = 0;
    d_updated_water_flow_map[index + FLOW_UP] = 0;
    d_updated_water_flow_map[index + FLOW_LEFT] = 0;
    d_updated_water_flow_map[index + FLOW_DOWN] = 0;

}

static float *terrain_height_map;
static float *water_height_map;
static float *sediment_height_map;

// Records the caller's three host buffers in the file-level pointers
// terrain_height_map, water_height_map and sediment_height_map.
//
// Only the pointer values are stored; nothing is copied, so the buffers must
// remain valid for as long as those pointers are used.
void init(float *t_height_map,
    float *w_height_map,
    float *s_height_map,
    int SIZE_X,
    int SIZE_Y)
{
    // The grid dimensions are part of the signature but are not needed here.
    (void)SIZE_X;
    (void)SIZE_Y;

    /* remember the HOST buffers */
    sediment_height_map = s_height_map;
    water_height_map = w_height_map;
    terrain_height_map = t_height_map;
}

void run_hydro_erosion(int cycles,
    float t_step,
    float min_tilt_angle,
    float SEDIMENT_CAP,
    float DISSOLVE_CONST,
    float DEPOSIT_CONST,
    int SIZE_X,
    int SIZE_Y,
    float PIPE_LENGTH,
    float ADJACENT_LENGTH,
    float TIME_STEP,
    float MIN_TILT_ANGLE)
{ 
    int numBlocks = (SIZE_X * SIZE_Y + (blockSize - 1)) / blockSize;
    int SIZE = SIZE_X * SIZE_Y * sizeof(float);

    float *d_terrain_height_map, *d_updated_terrain_height_map;
    float *d_water_height_map, *d_updated_water_height_map;
    float *d_sediment_height_map, *d_updated_sediment_height_map;

    float *d_suspended_sediment_level;
    float *d_updated_suspended_sediment_level;
    float *d_water_flow_map;
    float *d_updated_water_flow_map;
    float *d_prev_water_height_map;
    float *d_water_velocity_vec;
    float *d_rain_map;

    cudaCheck(cudaMalloc(&d_water_height_map, SIZE));
    cudaCheck(cudaMalloc(&d_updated_water_height_map, SIZE));
    cudaCheck(cudaMalloc(&d_prev_water_height_map, SIZE));
    cudaCheck(cudaMalloc(&d_water_flow_map, SIZE * 4));
    cudaCheck(cudaMalloc(&d_updated_water_flow_map, SIZE * 4)); // changing this array also changes d_terrain_height_map
    cudaCheck(cudaMalloc(&d_terrain_height_map, SIZE));
    cudaCheck(cudaMalloc(&d_updated_terrain_height_map, SIZE));
    cudaCheck(cudaMalloc(&d_sediment_height_map, SIZE));
    cudaCheck(cudaMalloc(&d_updated_sediment_height_map, SIZE));
    cudaCheck(cudaMalloc(&d_suspended_sediment_level, SIZE));
    cudaCheck(cudaMalloc(&d_updated_suspended_sediment_level, SIZE));
    cudaCheck(cudaMalloc(&d_rain_map, SIZE));
    cudaCheck(cudaMalloc(&d_water_velocity_vec, SIZE * 2));

    cudaCheck(cudaMemcpy(d_terrain_height_map, terrain_height_map, SIZE, cudaMemcpyHostToDevice));
    cudaCheck(cudaMemcpy(d_water_height_map, water_height_map, SIZE, cudaMemcpyHostToDevice));
    cudaCheck(cudaMemcpy(d_sediment_height_map, sediment_height_map, SIZE, cudaMemcpyHostToDevice));

    cout << "init terrain_height_map" << endl;
    for (int i = 0; i < SIZE_X * SIZE_Y; i++) {
        cout << terrain_height_map[i] << ", ";
        if (i % SIZE_X == 0 && i != 0) cout << endl;
    }

    /* launch the kernel on the GPU */
    float *temp;
    while (cycles--) {
        update_water_flow << < numBlocks, blockSize >> >(d_water_height_map, d_water_flow_map, d_updated_water_flow_map, SIZE_X, SIZE_Y); 
        temp = d_water_flow_map;
        d_water_flow_map = d_updated_water_flow_map;
        d_updated_water_flow_map = temp;        
    }
    cudaCheck(cudaMemcpy(terrain_height_map, d_terrain_height_map, SIZE, cudaMemcpyDeviceToHost)); 


    cout << "updated terrain" << endl;
    for (int i = 0; i < SIZE_X * SIZE_Y; i++) {
        cout << terrain_height_map[i] << ", ";
        if (i % SIZE_X == 0 && i != 0) cout << endl;
    } 
}

Python文件

import bpy
import numpy
import ctypes
import random

# Grid dimensions, in cells.
width = 4
height = 4

size_x = width
size_y = height
N = size_x * size_y

# Simulation parameters handed to the native library.
scrpt_cycles = 1
kernel_cycles = 1
time_step = 0.005
pipe_length = 1.0
adjacent_length = 1.0
min_tilt_angle = 10
sediment_cap = 0.01
dissolve_const = 0.01
deposit_const = 0.01

# initialize arrays (flat, one float32 per cell)
ter_height_map = numpy.ones((N), dtype=numpy.float32)
water_height_map = numpy.zeros((N), dtype=numpy.float32)
sed_height_map = numpy.zeros((N), dtype=numpy.float32)
rain_map = numpy.ones((N), dtype=numpy.float32)


# load terrain height from image (placeholder: every cell is set to 1)
ter_height_map.fill(1)


# import DLL
E = ctypes.cdll.LoadLibrary("E:/Programming/CUDA/erosion/Release/erosion_kernel.dll")

# Declare the C signatures so ctypes converts and type-checks every argument
# instead of relying on its default int-based conventions.
float_ptr = ctypes.POINTER(ctypes.c_float)

E.init.argtypes = [float_ptr, float_ptr, float_ptr, ctypes.c_int, ctypes.c_int]
E.init.restype = None

E.run_hydro_erosion.argtypes = [
    ctypes.c_int,    # cycles
    ctypes.c_float,  # t_step
    ctypes.c_float,  # min_tilt_angle
    ctypes.c_float,  # SEDIMENT_CAP
    ctypes.c_float,  # DISSOLVE_CONST
    ctypes.c_float,  # DEPOSIT_CONST
    ctypes.c_int,    # SIZE_X
    ctypes.c_int,    # SIZE_Y
    ctypes.c_float,  # PIPE_LENGTH
    ctypes.c_float,  # ADJACENT_LENGTH
    ctypes.c_float,  # TIME_STEP
    ctypes.c_float,  # MIN_TILT_ANGLE
]
E.run_hydro_erosion.restype = None

# initialize device memory
# init() only stores these pointers (it does not copy the data), so the numpy
# arrays above must stay alive for as long as the library is used.
E.init(ter_height_map.ctypes.data_as(float_ptr),
       water_height_map.ctypes.data_as(float_ptr),
       sed_height_map.ctypes.data_as(float_ptr),
       size_x,
       size_y)


# run erosion
# `> 0` keeps a negative cycle count from looping forever.
while scrpt_cycles > 0:
    scrpt_cycles = scrpt_cycles - 1
    E.run_hydro_erosion(kernel_cycles,
                        time_step,
                        min_tilt_angle,
                        sediment_cap,
                        dissolve_const,
                        deposit_const,
                        size_x,
                        size_y,
                        pipe_length,
                        adjacent_length,
                        time_step,
                        min_tilt_angle)

错误的输出：

预期输出（在我注释掉 update_water_flow 之后）：

//update_water_flow << < numBlocks, blockSize >> >(d_water_height_map, d_water_flow_map, d_updated_water_flow_map, SIZE_X, SIZE_Y);

显卡：GTX460M

（请注意，此答案中的代码也提供了一个完整的方案/示例，说明如何在通过 python ctypes 与 Python 应用程序共享的库中使用 CUDA 代码（例如 CUDA 设备内核）。如果您希望使用 CUDA 库的功能，这个答案 https://stackoverflow.com/questions/47466589/cublasxt-matrix-multiply-succeeds-in-c-fails-in-python/47477758#47477758 提供了一个使用 python ctypes 的示例。）

这里的问题是内核写入越界。显然，编译器/运行时把这两次分配放在了设备内存中足够接近的位置，以至于超出第一个分配边界的写入落到了第二个分配中：

cudaCheck(cudaMalloc(&d_updated_water_flow_map, SIZE * 4)); // changing this array also changes d_terrain_height_map
cudaCheck(cudaMalloc(&d_terrain_height_map, SIZE));

之所以发生越界访问，是因为内核启动的线程数多于实际需要的数量（本例中启动了 1024 个线程），而我们实际上只“需要” SIZE_X*SIZE_Y 个线程（本例中为 16 个）：

#define blockSize 1024
...
int numBlocks = (SIZE_X * SIZE_Y + (blockSize - 1)) / blockSize;
...
update_water_flow << < numBlocks, blockSize >> >(d_water_height_map, d_water_flow_map, d_updated_water_flow_map, SIZE_X, SIZE_Y);

这在 CUDA 编程中当然很“典型”（即启动多于所需数量的线程），但这样做时，务必在内核中加入“线程检查”，以防止任何“多余”的线程进行非法的越界访问。在这种情况下，一种可能的内核线程检查如下所示：

if ((row >= SIZE_Y) || (col >= SIZE_X)) return;

下面是一个基于所提供代码的完整示例（不过是在 Linux 上运行，并且去掉了 Python 代码中对 Blender 的依赖），展示了修复前后的效果。请注意，我们甚至可以用 cuda-memcheck 运行这类代码，它会在出错的情况下指出越界访问（为清晰起见，下面第一个示例省略了这部分输出）：

$ cat t383.cu
// Entry points called from Python through ctypes. Each one is declared with
// C linkage (extern "C") so the symbol name is not C++-mangled.

// Takes three float buffers (terrain, water, sediment) plus the grid
// dimensions.
extern "C"
void init(float *t_height_map,
float *w_height_map,
float *s_height_map,
int SIZE_X,
int SIZE_Y);

// Takes an iteration count, a set of float simulation parameters and the grid
// dimensions (SIZE_X, SIZE_Y).
extern "C"
void run_hydro_erosion(int cycles,
float t_step,
float min_tilt_angle,
float SEDIMENT_CAP,
float DISSOLVE_CONST,
float DEPOSIT_CONST,
int SIZE_X,
int SIZE_Y,
float PIPE_LENGTH,
float ADJACENT_LENGTH,
float TIME_STEP,
float MIN_TILT_ANGLE);

// NOTE(review): declared only; no definition appears in this listing.
extern "C"
void free_mem();

// NOTE(review): declared only; no definition appears in this listing.
extern "C"
void procedural_rain(float *water_height_map, float *rain_map, int SIZE_X, int SIZE_Y);

// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <time.h>
#include <iostream>
#include <algorithm>
#include <random>

// includes CUDA
#include <cuda_runtime.h>

using namespace std;

#define FLOW_RIGHT 0
#define FLOW_UP 1
#define FLOW_LEFT 2
#define FLOW_DOWN 3
#define X_VEL 0
#define Y_VEL 1
#define LEFT_CELL row, col - 1
#define RIGHT_CELL row, col + 1
#define ABOVE_CELL row - 1, col
#define BELOW_CELL row + 1, col

// CUDA API error checking macro
#define T 1024
#define M 1536
#define blockSize 1024
// Terminates the process with a diagnostic when a CUDA runtime call fails.
//
// The argument is evaluated exactly once and its status is stored in a local,
// so a failing call is never executed a second time while the message is
// being built. The do { ... } while (0) wrapper makes the macro expand to a
// single statement, which keeps it safe inside unbraced if/else bodies.
#define cudaCheck(error) \
  do { \
    cudaError_t cuda_check_status_ = (error); \
    if (cuda_check_status_ != cudaSuccess) { \
      printf("Fatal error: %s at %s:%d\n", \
        cudaGetErrorString(cuda_check_status_), \
        __FILE__, __LINE__); \
      exit(1); \
    } \
  } while (0)


// Zeroes the four flow components (FLOW_RIGHT, FLOW_UP, FLOW_LEFT, FLOW_DOWN)
// of one grid cell in d_updated_water_flow_map.
//
// Each thread derives a flat cell index from its 1D block/thread position,
// splits it into (row, col) using SIZE_X, and then addresses the flow map as
// four consecutive floats per cell.
//
// water_height_map and water_flow_map are accepted but not read in this body.
//
// The bounds guard is compiled in only when FIX is defined (e.g. -DFIX).
// Without it, every launched thread performs the four stores, including
// threads whose flat index is >= SIZE_X * SIZE_Y.
__global__ void update_water_flow(float *water_height_map, float *water_flow_map, float *d_updated_water_flow_map, int SIZE_X, int SIZE_Y)
{
    // Flat cell index of this thread within the 1D launch.
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int col = index % SIZE_X;
    int row = index / SIZE_X;

    // Re-purpose `index` as the offset of this cell's first flow component:
    // 4 floats per cell, SIZE_X cells per row.
    index = row * (SIZE_X * 4) + col * 4;   // 3D index
#ifdef FIX
    // Threads that fall outside the SIZE_Y x SIZE_X grid exit before storing.
    if ((row >= SIZE_Y) || (col >= SIZE_X)) return;
#endif
    d_updated_water_flow_map[index + FLOW_RIGHT] = 0;
    d_updated_water_flow_map[index + FLOW_UP] = 0;
    d_updated_water_flow_map[index + FLOW_LEFT] = 0;
    d_updated_water_flow_map[index + FLOW_DOWN] = 0;

}

static float *terrain_height_map;
static float *water_height_map;
static float *sediment_height_map;

// Records the caller's three host buffers in the file-level pointers
// terrain_height_map, water_height_map and sediment_height_map.
//
// Only the pointer values are stored; nothing is copied, so the buffers must
// remain valid for as long as those pointers are used.
void init(float *t_height_map,
    float *w_height_map,
    float *s_height_map,
    int SIZE_X,
    int SIZE_Y)
{
    // The grid dimensions are part of the signature but are not needed here.
    (void)SIZE_X;
    (void)SIZE_Y;

    /* remember the HOST buffers */
    sediment_height_map = s_height_map;
    water_height_map = w_height_map;
    terrain_height_map = t_height_map;
}

void run_hydro_erosion(int cycles,
    float t_step,
    float min_tilt_angle,
    float SEDIMENT_CAP,
    float DISSOLVE_CONST,
    float DEPOSIT_CONST,
    int SIZE_X,
    int SIZE_Y,
    float PIPE_LENGTH,
    float ADJACENT_LENGTH,
    float TIME_STEP,
    float MIN_TILT_ANGLE)
{
    int numBlocks = (SIZE_X * SIZE_Y + (blockSize - 1)) / blockSize;
    int SIZE = SIZE_X * SIZE_Y * sizeof(float);

    float *d_terrain_height_map, *d_updated_terrain_height_map;
    float *d_water_height_map, *d_updated_water_height_map;
    float *d_sediment_height_map, *d_updated_sediment_height_map;

    float *d_suspended_sediment_level;
    float *d_updated_suspended_sediment_level;
    float *d_water_flow_map;
    float *d_updated_water_flow_map;
    float *d_prev_water_height_map;
    float *d_water_velocity_vec;
    float *d_rain_map;

    cudaCheck(cudaMalloc(&d_water_height_map, SIZE));
    cudaCheck(cudaMalloc(&d_updated_water_height_map, SIZE));
    cudaCheck(cudaMalloc(&d_prev_water_height_map, SIZE));
    cudaCheck(cudaMalloc(&d_water_flow_map, SIZE * 4));
    cudaCheck(cudaMalloc(&d_updated_water_flow_map, SIZE * 4)); // changing this array also changes d_terrain_height_map
    cudaCheck(cudaMalloc(&d_terrain_height_map, SIZE));
    cudaCheck(cudaMalloc(&d_updated_terrain_height_map, SIZE));
    cudaCheck(cudaMalloc(&d_sediment_height_map, SIZE));
    cudaCheck(cudaMalloc(&d_updated_sediment_height_map, SIZE));
    cudaCheck(cudaMalloc(&d_suspended_sediment_level, SIZE));
    cudaCheck(cudaMalloc(&d_updated_suspended_sediment_level, SIZE));
    cudaCheck(cudaMalloc(&d_rain_map, SIZE));
    cudaCheck(cudaMalloc(&d_water_velocity_vec, SIZE * 2));

    cudaCheck(cudaMemcpy(d_terrain_height_map, terrain_height_map, SIZE, cudaMemcpyHostToDevice));
    cudaCheck(cudaMemcpy(d_water_height_map, water_height_map, SIZE, cudaMemcpyHostToDevice));
    cudaCheck(cudaMemcpy(d_sediment_height_map, sediment_height_map, SIZE, cudaMemcpyHostToDevice));

    cout << "init terrain_height_map" << endl;
    for (int i = 0; i < SIZE_X * SIZE_Y; i++) {
        cout << terrain_height_map[i] << ", ";
        if (i % SIZE_X == 0 && i != 0) cout << endl;
    }

    /* launch the kernel on the GPU */
    float *temp;
    while (cycles--) {
        update_water_flow << < numBlocks, blockSize >> >(d_water_height_map, d_water_flow_map, d_updated_water_flow_map, SIZE_X, SIZE_Y);
        temp = d_water_flow_map;
        d_water_flow_map = d_updated_water_flow_map;
        d_updated_water_flow_map = temp;
    }
    cudaCheck(cudaMemcpy(terrain_height_map, d_terrain_height_map, SIZE, cudaMemcpyDeviceToHost));


    cout << "updated terrain" << endl;
    for (int i = 0; i < SIZE_X * SIZE_Y; i++) {
        cout << terrain_height_map[i] << ", ";
        if (i % SIZE_X == 0 && i != 0) cout << endl;
    }
}
$ cat t383.py
import numpy
import ctypes
import random

# Grid dimensions, in cells.
width = 4
height = 4

size_x = width
size_y = height
N = size_x * size_y

# Simulation parameters handed to the native library.
scrpt_cycles = 1
kernel_cycles = 1
time_step = 0.005
pipe_length = 1.0
adjacent_length = 1.0
min_tilt_angle = 10
sediment_cap = 0.01
dissolve_const = 0.01
deposit_const = 0.01

# initialize arrays (flat, one float32 per cell)
ter_height_map = numpy.ones((N), dtype=numpy.float32)
water_height_map = numpy.zeros((N), dtype=numpy.float32)
sed_height_map = numpy.zeros((N), dtype=numpy.float32)
rain_map = numpy.ones((N), dtype=numpy.float32)


# load terrain height from image (placeholder: every cell is set to 1)
ter_height_map.fill(1)


# import DLL
E = ctypes.cdll.LoadLibrary("./t383.so")

# Declare the C signatures so ctypes converts and type-checks every argument
# instead of relying on its default int-based conventions.
float_ptr = ctypes.POINTER(ctypes.c_float)

E.init.argtypes = [float_ptr, float_ptr, float_ptr, ctypes.c_int, ctypes.c_int]
E.init.restype = None

E.run_hydro_erosion.argtypes = [
    ctypes.c_int,    # cycles
    ctypes.c_float,  # t_step
    ctypes.c_float,  # min_tilt_angle
    ctypes.c_float,  # SEDIMENT_CAP
    ctypes.c_float,  # DISSOLVE_CONST
    ctypes.c_float,  # DEPOSIT_CONST
    ctypes.c_int,    # SIZE_X
    ctypes.c_int,    # SIZE_Y
    ctypes.c_float,  # PIPE_LENGTH
    ctypes.c_float,  # ADJACENT_LENGTH
    ctypes.c_float,  # TIME_STEP
    ctypes.c_float,  # MIN_TILT_ANGLE
]
E.run_hydro_erosion.restype = None

# initialize device memory
# init() only stores these pointers (it does not copy the data), so the numpy
# arrays above must stay alive for as long as the library is used.
E.init(ter_height_map.ctypes.data_as(float_ptr),
       water_height_map.ctypes.data_as(float_ptr),
       sed_height_map.ctypes.data_as(float_ptr),
       size_x,
       size_y)


# run erosion
# `> 0` keeps a negative cycle count from looping forever.
while scrpt_cycles > 0:
    scrpt_cycles = scrpt_cycles - 1
    E.run_hydro_erosion(kernel_cycles,
                        time_step,
                        min_tilt_angle,
                        sediment_cap,
                        dissolve_const,
                        deposit_const,
                        size_x,
                        size_y,
                        pipe_length,
                        adjacent_length,
                        time_step,
                        min_tilt_angle)
$ nvcc -Xcompiler -fPIC -std=c++11 -shared -arch=sm_61 -o t383.so t383.cu
$ python t383.py
init terrain_height_map
1, 1, 1, 1, 1,
1, 1, 1, 1,
1, 1, 1, 1,
1, 1, 1, updated terrain
0, 0, 0, 0, 0,
0, 0, 0, 0,
0, 0, 0, 0,
0, 0, 0, 
$ nvcc -Xcompiler -fPIC -std=c++11 -shared -arch=sm_61 -o t383.so t383.cu -DFIX
$ cuda-memcheck python t383.py
========= CUDA-MEMCHECK
init terrain_height_map
1, 1, 1, 1, 1,
1, 1, 1, 1,
1, 1, 1, 1,
1, 1, 1, updated terrain
1, 1, 1, 1, 1,
1, 1, 1, 1,
1, 1, 1, 1,
1, 1, 1, 
========= ERROR SUMMARY: 0 errors
$

如果我们在不启用修复的情况下编译前面的示例，但用 cuda-memcheck 运行它，就会得到指示越界访问的输出：

$ nvcc -Xcompiler -fPIC -std=c++11 -shared -arch=sm_61 -o t383.so t383.cu
$ cuda-memcheck python t383.py
========= CUDA-MEMCHECK
init terrain_height_map
1, 1, 1, 1, 1,
1, 1, 1, 1,
1, 1, 1, 1,
========= Invalid __global__ write of size 4
=========     at 0x000002f0 in update_water_flow(float*, float*, float*, int, int)
=========     by thread (31,0,0) in block (0,0,0)
=========     Address 0x1050d6009f0 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 (cuLaunchKernel + 0x2c5) [0x204505]
=========     Host Frame:./t383.so [0x1c291]
=========     Host Frame:./t383.so [0x39e33]
=========     Host Frame:./t383.so [0x6879]
=========     Host Frame:./t383.so (_Z43__device_stub__Z17update_water_flowPfS_S_iiPfS_S_ii + 0xe3) [0x6747]
=========     Host Frame:./t383.so (_Z17update_water_flowPfS_S_ii + 0x38) [0x6781]
=========     Host Frame:./t383.so (run_hydro_erosion + 0x8f2) [0x648b]
=========     Host Frame:/usr/lib/x86_64-linux-gnu/libffi.so.6 (ffi_call_unix64 + 0x4c) [0x5adc]
=========     Host Frame:/usr/lib/x86_64-linux-gnu/libffi.so.6 (ffi_call + 0x1fc) [0x540c]
=========     Host Frame:/usr/lib/python2.7/lib-dynload/_ctypes.x86_64-linux-gnu.so (_ctypes_callproc + 0x48e) [0x145fe]
=========     Host Frame:/usr/lib/python2.7/lib-dynload/_ctypes.x86_64-linux-gnu.so [0x15f9e]
=========     Host Frame:python (PyEval_EvalFrameEx + 0x98d) [0x1244dd]
=========     Host Frame:python [0x167d14]
=========     Host Frame:python (PyRun_FileExFlags + 0x92) [0x65bf4]
=========     Host Frame:python (PyRun_SimpleFileExFlags + 0x2ee) [0x6612d]
=========     Host Frame:python (Py_Main + 0xb5e) [0x66d92]
=========     Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xf5) [0x21f45]
=========     Host Frame:python [0x177c2e]
=========
========= Invalid __global__ write of size 4
=========     at 0x000002f0 in update_water_flow(float*, float*, float*, int, int)
=========     by thread (30,0,0) in block (0,0,0)
=========     Address 0x1050d6009e0 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 (cuLaunchKernel + 0x2c5) [0x204505]
=========     Host Frame:./t383.so [0x1c291]
=========     Host Frame:./t383.so [0x39e33]
=========     Host Frame:./t383.so [0x6879]
=========     Host Frame:./t383.so (_Z43__device_stub__Z17update_water_flowPfS_S_iiPfS_S_ii + 0xe3) [0x6747]
=========     Host Frame:./t383.so (_Z17update_water_flowPfS_S_ii + 0x38) [0x6781]
=========     Host Frame:./t383.so (run_hydro_erosion + 0x8f2) [0x648b]
=========     Host Frame:/usr/lib/x86_64-linux-gnu/libffi.so.6 (ffi_call_unix64 + 0x4c) [0x5adc]
=========     Host Frame:/usr/lib/x86_64-linux-gnu/libffi.so.6 (ffi_call + 0x1fc) [0x540c]
=========     Host Frame:/usr/lib/python2.7/lib-dynload/_ctypes.x86_64-linux-gnu.so (_ctypes_callproc + 0x48e) [0x145fe]
=========     Host Frame:/usr/lib/python2.7/lib-dynload/_ctypes.x86_64-linux-gnu.so [0x15f9e]
=========     Host Frame:python (PyEval_EvalFrameEx + 0x98d) [0x1244dd]
=========     Host Frame:python [0x167d14]
=========     Host Frame:python (PyRun_FileExFlags + 0x92) [0x65bf4]
=========     Host Frame:python (PyRun_SimpleFileExFlags + 0x2ee) [0x6612d]
=========     Host Frame:python (Py_Main + 0xb5e) [0x66d92]
=========     Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xf5) [0x21f45]
=========     Host Frame:python [0x177c2e]
=========
... (output truncated for brevity of presentation)
========= ERROR SUMMARY: 18 errors
$

本文内容由网友自发贡献，版权归原作者所有，本站不承担相应法律责任。如您发现有涉嫌抄袭侵权的内容，请联系:hwhale#tublm.com(使用前将#替换为@)

CUDA 共享内存问题（以及将 CUDA 与 python/ctypes 一起使用）的相关文章

从多个类访问串行端口

我正在尝试使用串行端口在 arduino 和 C 程序之间进行通信我对 C 编程有点陌生该程序有多种用户控制形式每一个都需要访问串口来发送数据我需要做的就是从每个类的主窗体中写入串行端口我了解如何设置和写入串行端口这是我的 Fo
无法在 osx-arm64 上安装 Python 3.7

我正在尝试使用 Conda 创建一个带有 Python 3 7 的新环境例如 conda create n qnn python 3 7 我收到以下错误 Collecting package metadata current repoda
IronPython：没有名为 json 的模块

我安装了 IronPython 我的 python 文件如下所示 import sys print sys version import json 运行它的代码 var p Python CreateEngine var scope p C
python Soap zeep模块获取结果

我从 SOAP API 得到如下结果 client zeep Client wsdl self wsdl transport transport auth header lb E authenticate self login res cl
创建嵌套字典单行

您好我有三个列表我想使用一行创建一个三级嵌套字典 i e l1 a b l2 1 2 3 l3 d e 我想创建以下嵌套字典 nd a 1 d 0 e 0 2 d 0 e 0 3 d 0 e 0 b a 1 d 0 e 0 2 d 0
如何从网站下载 .EXE 文件？

我正在编写一个应用程序需要从网站下载 exe 文件我正在使用 Visual Studio Express 2008 我正在使用以下代码 private void button1 Click object sender EventArgs
即使手动设置显示环境变量后，WSL Ubuntu 也会显示“错误：无法打开显示”

我在 WSL Ubuntu 上使用 g 我使用 git 克隆了 GLFW 存储库使用了ccmake命令配置并生成二进制文件然后使用make在 build 目录中最终创建 a文件我安装了所有OpenGL相关的库 usr ld 我不记得我
将数据打印到文件

我已经超载了 lt lt 运算符使其写入文件并写入控制台我已经为同一个函数创建了 8 个线程并且我想输出 hello hi 如果我在无限循环中运行这个线程例程文件中的o p是 hello hi hello hi hello hi e
C# 中条件编译符号的编译时检查（参见示例）？

在 C C 中你可以这样做 define IN USE 1 define NOT IN USE 1 define USING system 1 system 1 IN USE 进而 define MY SYSTEM IN USE if US
使用 PyTorch 分布式 NCCL 连接失败

我正在尝试使用 torch distributed 将 PyTorch 张量从一台机器发送到另一台机器 dist init process group 函数正常工作但是 dist broadcast 函数中出现连接失败这是我在节点 0
如何在c#中的内部类中访问外部类的变量[重复]

这个问题在这里已经有答案了我有两个类我需要声明两个类共有的变量如果是嵌套类我需要访问内部类中的外部类变量请给我一个更好的方法来在 C 中做到这一点示例代码 Class A int a Class B Need to access
Ubuntu 上的 Python 2.7

我是 Python 新手正在 Linux 机器 Ubuntu 10 10 上工作它正在运行 python 2 6 但我想运行 2 7 因为它有我想使用的功能有人敦促我不要安装 2 7 并将其设置为我的默认 python 我的问题是如
System.Runtime.InteropServices.COMException（0x80040154）：[关闭]

Closed 这个问题不符合堆栈溢出指南 help closed questions 目前不接受答案我在 C 项目中遇到异常 System Runtime InteropServices COMException 0x80040154 检
将代码拆分为标头/源文件

我从 Asio 的示例页面中获取了以下代码 class tcp connection public boost enable shared from this
限制 django 应用程序模型中的单个记录？

我想使用模型来保存 django 应用程序的系统设置因此我想限制该模型使其只能有一条记录极限怎么办尝试这个 class MyModel models Model onefield models CharField The fiel
检查字典键是否有空值

我有以下字典 dict1 city name yass region zipcode phone address tehsil planet mars 我正在尝试创建一个基于 dict1 的新字典但是它不会包含带有空字符串的键它不会包
剪贴板在 .NET 3.5 和 4 中的行为有所不同，但为什么呢？

我们最近将一个非常大的项目从 NET Framework 3 5 升级到 4 最初一切似乎都工作正常但现在复制粘贴操作开始出现错误我已经成功制作了一个小型的可复制应用程序它显示了 NET 3 5 和 4 中的不同行为我还找到了一种解
什么是 __declspec 以及何时需要使用它？

我见过这样的例子 declspec在我正在阅读的代码中它是什么我什么时候需要使用这个构造这是 Microsoft 对 C 语言的特定扩展它允许您使用存储类信息来赋予类型或函数属性文档 declspec C https learn
是否可以在 C# 中强制接口实现为虚拟？

我今天遇到了一个问题试图重写尚未声明为虚拟的接口方法的实现在这种情况下我无法更改接口或基本实现而必须尝试其他方法但我想知道是否有一种方法可以强制类使用虚拟方法实现接口 Example interface IBuilder
错误：无效使用不完整类型“类 Move”/未定义对 Move::NONE 的引用

拜托我不知道为什么这个简单的代码被拒绝它给了我 2 个编译错误请帮帮我 I use 代码块 20 03 我的编译器是GNU GCC 移动 hpp class Move public Move Move int int public

随机推荐

jQuery：按类和输入类型选择

我想使用 jQuery 选择一组既具有特定输入类型例如复选框又具有特定类的元素但是当我尝试以下操作时 input checkbox myClass 我没有收到任何退回的物品我怎样才能在 jQuery 中完成这个任务您的选择器正在
在聚合框架 C# 中使用 Facet

我想对我的数据创建一个聚合以获取 Net 应用程序中书籍集合的特定标签的总计数我有以下书籍课程 public class Book public string Id get set public string Name get set
带向量极限的四边形

我想使用quad作为限制列表没有 for 循环作为一个基本示例 T 1 2 3 f x x 2 quad 0 T 1 f 计算我需要的内容但我想将quad 0 T 1 f quad 0 T 2 f quad 0 T 3 f 保存为向量
在本机反应中更新/更改状态对象的最佳方法？

更新 State 对象深处的嵌套属性的最佳方法是什么 constructor this state someprop quadrangle rectangle width 我想更新矩形对象的宽度 this state quadrangle
Xcode 没有嵌入框架部分

我有问题我正在尝试在我的 ios xcode 项目中实现 Amazon 框架并且我还需要将它们添加到构建阶段 gt 嵌入框架部分中但我的 xcode 窗口中没有选项这是截图这怎么可能即使我创建新项目问题仍然存在您好在您
与内联块未对齐（其他元素被推下）

我正在尝试将小盒子排成一行这些盒子每个里面有大约 2 个元素在某些情况下第一个元素的文本太多以至于它分成两行如果发生这种情况该特殊行中的所有其他块如下所示长话短说这是一个例子 http jsfiddle net PMRQ5
如何在 JOptionPane 的 ok 按钮上添加监听器？ [复制]

这个问题在这里已经有答案了如何在单击确定按钮时添加侦听器JOptionPane INFORMATION MESSAGE 我的 JOptionPane 是 JOptionPane showMessageDialog null Your
Xbox One 控制器输入到 UWP 应用程序

我一直在尝试使 Xbox One 控制器与 UWP 应用程序交互并研究了 Gamepad 类基于评论中提到的建议 Windows UWP 中对 Xbox One 的控制器支持 https stackoverflow com questi
ExecutorService，避免任务队列太满的标准方法

我在用ExecutorService为了方便并发多线程程序采取以下代码 while xxx ExecutorService exService Executors newFixedThreadPool NUMBER THREADS Fut
EF 7 - 新的 ExecuteDelete 和 ExecuteUpdate 方法不适用于内存数据库

我正在使用新的 EF 7ExecuteDelete and ExecuteUpdate功能而且它们都很棒但是当我尝试为使用它们的函数编写单元测试时这些测试崩溃了我在 NET Core 7 上使用 EF 7 0 1 Microsoft
auto* 的类型推导规则是什么？

类型推导规则是什么auto 考虑以下 int x 64 int px x auto v1 x auto gt ok v1 is int auto v2 px auto gt is v2 int auto v3 px auto gt is v
PHP 中的 Twitter 机器人有问题吗？

我已经用 php 构建了一个 Twitter 机器人它能够接收消息并响应消息但出现了这个问题当我向机器人发送消息时我必须刷新机器人脚本才能让机器人回复我希望机器人能够不断检查任何新传入的消息并做出相应的响应我该如何修复这个错误
复制/移动省略与显式删除的复制/移动构造函数

我想知道复制移动省略何时适用或允许适用显式deleted 复制移动构造函数和非deleted 复制移动构造函数具体如下可以明确地deleted 复制 ctor 或移动 ctor 被删除是否尝试从另一个相同类型的对象或临时对象
沙盒应用程序和 NSOpenPanel 导致崩溃

我正在我的 Cocoa 应用程序中做一个简单的文件打开面板我启用权利和应用程序沙箱但在 OS X 10 9 上当应用程序应使用以下命令打开对话框时NSOpenPanel 它崩溃了应用具体信息由于未捕获的异常 NSObjectNot
如何在 XSLT 1.0 中查找当前日期

我在检索 XSLT 代码中的当前日期时遇到麻烦我正在使用 1 0 版和 MSXSL exe 应用程序来触发我的 xslt 代码我尝试使用以下代码行来实现此功能但它不起作用貌似1 0版本不支持当前日期功能您能否提供适用于 xslt
在 JavaScript 中检查文本框值是字符串还是数字

基本上我有以下代码
ld：重复符号

我正在做一个学校项目我从 Xcode 中收到一些奇怪的错误我正在使用 TextMate 的 Command R 功能来编译该项目编译似乎工作正常但链接失败并出现我不明白的错误消息 ld输出 ld path final build f
在 Kivy 中创建动态绘制的线条

这是我的帖子的延续在 Kivy 中使用和移动小部件按钮 https stackoverflow com questions 25273046 using and moving widgets buttons in kivy 我想在 Ki
如何在 statefulset 中设置 kubernetes pod 的主机名

我正在使用 Statefulset 并且启动了多个 Pod 但它们不是彼此的复制品我想设置 pod 的主机名并将这些主机名作为环境变量传递给所有 pod 以便它们可以相互通信我尝试在 pod 规范下使用主机名但主机名永远不会设置为指
CUDA 共享内存问题（以及将 CUDA 与 python/ctypes 一起使用）

不知怎的当我修改时d updated water flow map在下面的代码中 d terrain height map也被修改相反更改两个数组的分配顺序可以解决问题但我认为这只是掩盖了问题的根本原因 cudaCheck cuda

CUDA 共享内存问题（以及将 CUDA 与 python/ctypes 一起使用）

CUDA 共享内存问题（以及将 CUDA 与 python/ctypes 一起使用） 的相关文章

随机推荐

热门标签

CUDA 共享内存问题（以及将 CUDA 与 python/ctypes 一起使用）的相关文章