（抱歉，我之前的回答有误。）
你正在向该函数传递一个指针 row：
void Pascal_Triangle(int n_row, int * row) {
然后,您尝试用新值覆盖该指针:
row = new int[n_row];
一旦你从这个函数返回,row
在调用环境中将不会被修改。 (这是一个普通的 C/C++ 问题,并非 CUDA 特有的问题。)
这或许令人困惑，但指针 row 的值是按值（by value）传递给函数 Pascal_Triangle 的。您不能在函数中修改该指针的值，并期望修改后的值出现在调用环境中。（您可以修改指针所指向位置的内容，这也是通过指针传递 row 的常见原因。）
有几种方法可以解决这个问题。最简单的可能只是通过引用传递指针:
void Pascal_Triangle(int n_row, int * &row) {
您的代码似乎还有其他缺陷。我建议你采用正确的 CUDA 错误检查 https://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api ，并使用 cuda-memcheck 运行你的代码。
特别是，内核中的 new 运算符的行为与内核中的 malloc 类似，并且有类似的限制 http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#dynamic-global-memory-allocation-and-operations 。
您的设备堆空间即将耗尽,因此您的许多new
操作失败,并返回 NULL 指针。
作为对此的测试，一个很好的调试做法是在您的 new 操作之后放置这样的代码行：
if (row == NULL) assert(0);
(您还需要包括assert.h
)
如果你这样做,你会发现这个断言被命中了。
我还没有计算出您的代码实际需要多少设备堆空间，但它似乎使用了相当多。在 C++ 中，习惯的做法是在使用完毕后用 delete 释放由 new 完成的分配。您可能需要研究如何释放这些用 new 完成的分配，或者（更好的是）重用分配（即每个线程只分配一次），完全避免反复重新分配。
这是对代码的修改,演示了上述内容(每个线程一个分配),并且编译和运行对我来说没有错误:
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#include <assert.h>
__device__ double *d_droot, *d_dweight;
/*How could be returned the array or the pointer to the array int *row, on the device, that is filled by this function? */
__device__
/* Fill row[1..n_row] with values taken from a Pascal-triangle-style table.
 * Assumes n_row < 100 and that `row` has at least n_row+1 ints of space
 * (the caller passes a device-heap scratch buffer).
 * NOTE(review): row[0] is never written here, yet the caller reads index 0
 * — an off-by-one inherited from the question's code; verify intended use. */
void Pascal_Triangle(int n_row, int *row) {
int a[100][100];
int i, j;
// Zero-initialize the work table: the `== 0` break tests and the untouched
// cells copied out below rely on zeros, but the original left `a`
// uninitialized (undefined behavior / garbage results).
for (i = 0; i < 100; i++)
for (j = 0; j < 100; j++)
a[i][j] = 0;
//first row and first column have the same value = 1
for (i = 1; i <= n_row; i++) {
a[i][1] = a[1][i] = 1;
}
//Generate the full Triangle
for (i = 2; i <= n_row; i++) {
for (j = 2; j <= n_row - i; j++) {
if (a[i - 1][j] == 0 || a[i][j - 1] == 0) {
break;
}
a[i][j] = a[i - 1][j] + a[i][j - 1];
}
}
// Copy the requested column out to the caller's buffer.
for (i = 1; i <= n_row; i++) {
row[i] = a[i][n_row-1];
}
}
__device__
// Evaluate a polynomial-like sum of the given order at x, using binomial
// coefficients generated by Pascal_Triangle into caller-provided scratch
// storage. `my_storage` is assumed to hold at least order+1 ints — the
// caller allocates MY_LIM (5) per thread.
double Legendre_poly(int order, double x, int *my_storage)
{
int n,k;
double val=0;
int *binomials = my_storage;
// Trap if the caller's device-heap allocation failed (new returns NULL).
if (binomials == NULL) assert(0);
for(n=order; n>=0; n--)
{
// NOTE(review): Pascal_Triangle writes binomials[1..n] only, but the loop
// below reads binomials[0] — binomials[0] is read uninitialized; confirm
// the intended indexing against the original question's code.
Pascal_Triangle(n, binomials); /*Here are the problems*/
for(k=0; k<=n; k++)
// NOTE(review): both factors use pow(x-1, ...); a Legendre expansion
// would conventionally involve (x-1) and (x+1) — confirm the formula.
val += binomials[k]*pow(x-1,n-k)*pow(x-1,k);
}
return val;
}
__device__ __host__
/* Gaussian test integrand exp(-alpha * x^2): the function being expanded
   on a basis of Legendre polynomials. Callable from host and device. */
double f(double alpha,double x)
{
double x_squared = x * x;
return exp(-alpha * x_squared);
}
/*Kernel that computes the expansion by quadratures*/
__global__ void Span(int n, double alpha, double a, double b, double *coefficients)
{
/*
Parameters:
n: Total number of expansion coeficients
a: Upper integration limit
b: Lower integration limit
d_droots[]: roots for the quadrature
d_dweight[]: weights for the quadrature
coefficients[]: allocate N expansion coefficients.
*/
double c1 = (b - a) / 2, c2 = (b + a) / 2, sum = 0;
int dummy;
int i = blockIdx.x*blockDim.x + threadIdx.x;
if (i < n)
{
#define MY_LIM 5
int *thr_storage = new int[MY_LIM];
if (thr_storage == NULL) assert(0);
coefficients[i] = 0.0;
for (dummy = 0; dummy < MY_LIM; dummy++)
coefficients[i] += d_dweight[dummy] * f(alpha,c1 * d_droot[dummy] + c2)*Legendre_poly(dummy,c1 * d_droot[dummy] + c2, thr_storage)*c1;
delete thr_storage;
}
}
int main(void)
{
/* Reserve a 1 GiB device heap so the per-thread `new` in Span succeeds. */
cudaDeviceSetLimit(cudaLimitMallocHeapSize, (1048576ULL*1024));
int N = 1<<23;   /* number of quadratures computed in parallel */
int N_nodes = 5; /* 5-point Gauss-Legendre quadrature */
double *droot, *dweight, *dresult, *d_dresult, *d_droot_temp, *d_dweight_temp;
/*double version in host*/
droot =(double*)malloc(N_nodes*sizeof(double));
dweight =(double*)malloc(N_nodes*sizeof(double));
dresult =(double*)malloc(N*sizeof(double)); /*will receive the results of N quadratures*/
if (droot == NULL || dweight == NULL || dresult == NULL) {
fprintf(stderr, "host allocation failure\n");
return 1;
}
/*double version in device*/
cudaMalloc(&d_droot_temp, N_nodes*sizeof(double));
cudaMalloc(&d_dweight_temp, N_nodes*sizeof(double));
cudaMalloc(&d_dresult, N*sizeof(double)); /*results for N quadratures will be contained here*/
/*double version of the roots and weights*/
droot[0] = 0.90618;
droot[1] = 0.538469;
droot[2] = 0.0;
droot[3] = -0.538469;
droot[4] = -0.90618;
dweight[0] = 0.236927;
dweight[1] = 0.478629;
dweight[2] = 0.568889;
dweight[3] = 0.478629;
dweight[4] = 0.236927;
/*double copy host-> device*/
cudaMemcpy(d_droot_temp, droot, N_nodes*sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(d_dweight_temp, dweight, N_nodes*sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(d_droot, &d_droot_temp, sizeof(double *));
cudaMemcpyToSymbol(d_dweight, &d_dweight_temp, sizeof(double *));
// Perform the expansion
Span<<<(N+255)/256, 256>>>(N,1.0, -3.0, 3.0, d_dresult);
/* Kernel launches return no status directly: check for launch-config
   errors now; in-kernel faults surface at the blocking memcpy below. */
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(err));
return 1;
}
err = cudaMemcpy(dresult, d_dresult, N*sizeof(double), cudaMemcpyDeviceToHost);
if (err != cudaSuccess) {
fprintf(stderr, "device->host copy failed: %s\n", cudaGetErrorString(err));
return 1;
}
cudaFree(d_dresult);
cudaFree(d_droot_temp);
cudaFree(d_dweight_temp);
/* Release host allocations (the original leaked them). */
free(dresult);
free(dweight);
free(droot);
return 0;
}
这段代码有几个优点：
- 它可以在更小的设备堆预留空间下运行
- 它比您的代码尝试执行的大量分配要快得多
EDIT：
除了使用 assert，你还可以这样做：
/*Kernel that computes the expansion by quadratures*/
__global__ void Span(int n, double alpha, double a, double b, double *coefficients)
{
/*
Parameters:
n: Total number of expansion coefficients
alpha: parameter of the integrand f
a: Upper integration limit
b: Lower integration limit
d_droot[]: roots for the quadrature (device globals)
d_dweight[]: weights for the quadrature (device globals)
coefficients[]: output array of n expansion coefficients.
*/
// Map [a,b] onto the canonical quadrature interval [-1,1].
double c1 = (b - a) / 2, c2 = (b + a) / 2;
int dummy;
int i = blockIdx.x*blockDim.x + threadIdx.x;
if (i < n)
{
#define MY_LIM 5
// One device-heap allocation per thread, reused across all iterations.
int *thr_storage = new int[MY_LIM];
// Report allocation failure instead of trapping; the original string
// literal was broken (`\"` left it unterminated — intended `\n`).
if (thr_storage == NULL) printf("allocation failure!\n");
else {
coefficients[i] = 0.0;
for (dummy = 0; dummy < MY_LIM; dummy++)
coefficients[i] += d_dweight[dummy] * f(alpha,c1 * d_droot[dummy] + c2)*Legendre_poly(dummy,c1 * d_droot[dummy] + c2, thr_storage)*c1;
delete [] thr_storage; // was plain `delete`: new[] must pair with delete[]
}
}
}