0

If one wants to copy arrays from host to device, one does cudaMalloc and cudaMemcpy. But to lessen the hassle one can just use cudaMallocManaged, without the former two calls, and life was never simpler. The code looks like this (more or less):

// Kernel: element-wise Celsius -> Kelvin conversion over N floats.
// Assumes a 1-D launch with at least N total threads; N is presumably a
// file-scope constant not shown in this snippet -- TODO confirm.
__global__ void convert(float kelvin[], float celsius[])  //can pass arrays in kernel
{
     int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i<N)   // bounds guard: surplus threads in the last block do nothing
    kelvin[i]=celsius[i]+273.15;
    // NOTE(review): 273.15 is a double literal, forcing a float->double
    // promotion; 273.15f would keep the arithmetic in single precision.
}

int main()
{
    // NOTE(review): both malloc'd blocks are leaked immediately -- the
    // cudaMallocManaged calls below overwrite the pointers with new
    // managed allocations. Managed memory REPLACES host allocation;
    // these two malloc calls should simply be removed.
    float *celsius =(float *)malloc(N*sizeof(float));
    float *kelvin =(float *)malloc(N*sizeof(float));
    // Managed allocations are accessible from both host and device,
    // so no explicit cudaMemcpy is needed before/after the kernel.
    // NOTE(review): return values are unchecked; on failure the kernel
    // would later fault with an illegal-address error.
    cudaMallocManaged(&celsius, N*sizeof(float));
    cudaMallocManaged(&kelvin, N*sizeof(float));

// init celsius here

dim3 blocksPerGrid(1,1,1); //use only one block
// NOTE(review): a single block caps N at the per-block thread limit
// (1024 on current GPUs); a ceil-div grid would generalize this.
dim3 threadsPerBlock(N,1,1); //use N threads in the block
convert<<<blocksPerGrid, threadsPerBlock>>>(kelvin,celsius);
// Sync makes the managed results visible to the host; its return value
// (and cudaGetLastError after the launch) should also be checked.
cudaDeviceSynchronize();

//Doing stuff with the output here

// NOTE(review): the managed allocations are never cudaFree'd.
return 0;
}

The previous example seems clear to me. But how does one do cudaMallocManaged for a 2D or 3D array? I've been trying

// Kernel: element-wise matrix addition C = A + B for N x N matrices.
// NOTE(review): a parameter declared float A[N][N] decays to
// float (*)[N] (pointer to row of N floats), NOT to float** -- which is
// exactly why the float** arguments in main below are rejected by the
// compiler (second quoted error).
__global__ void MatAdd(float A[N][N], float B[N][N], float C[N][N])
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
int j = blockIdx.y * blockDim.y + threadIdx.y;
if (i < N && j < N)   // guard against threads outside the matrix
C[i][j] = A[i][j] + B[i][j];
}

int main()
{   // I think, 2D arrays can be passed as pointer to pointers
    // NOTE(review): they cannot -- see the kernel comment above. Also
    // these malloc'd blocks are leaked: cudaMallocManaged overwrites
    // the pointers. A contiguous float* of N*N elements (cast to
    // float (*)[N] at the call site) is the workable approach.
    float **A = (float **)malloc(N*N*sizeof(float));
    float **B = (float **)malloc(N*N*sizeof(float));
    float **C = (float **)malloc(N*N*sizeof(float));
    cudaMallocManaged(&A, N*N*sizeof(float));
    cudaMallocManaged(&B, N*N*sizeof(float));
    cudaMallocManaged(&C, N*N*sizeof(float));


// NOTE(review): brace-initialization syntax is only valid in a
// declaration, not as an assignment to an existing object -- this is
// the quoted "too many initializer values" error. A[N][N] here also
// indexes one past the end of each dimension.
A[N][N]={{1,0,0},{0,1,0},{0,0,1}};
B[N][N]={{1,0,0},{0,1,0},{0,0,1}};
dim3 threadsPerBlock(16, 16);
// NOTE(review): with N = 3, N/16 truncates to 0, so this would be an
// invalid (zero-block) launch even if the type errors were fixed.
dim3 numBlocks(N / threadsPerBlock.x, N / threadsPerBlock.y);
MatAdd<<<numBlocks, threadsPerBlock>>>(A, B, C);
// NOTE(review): no cudaDeviceSynchronize() before reading results.
//outputs and all

}

But it shows the following errors:

matrix_add.cu(22): error: too many initializer values

matrix_add.cu(25): error: argument of type "float **" is incompatible with parameter of type "float (*)[3]"

Your help is highly appreciated.

Galilean
  • 217
  • 1
  • 11
  • "2D arrays" can't be passed as pointer to pointers. And why is everything allocated twice? – talonmies Jul 12 '18 at 05:57
  • @talonmies , as I have allocated memory in CPU I'll have to allocate it in cuda device also. Here the cudaMallocManaged allocates same memory in GPU. At least I think so – Galilean Jul 12 '18 at 06:00
  • You think wrongly. The malloc calls are unnecessary and incorrect, as is literally everything else in your code. I recommend you re-read everything in the answer to your last question, because you clearly didn't understand what you were told there. – talonmies Jul 12 '18 at 06:04

1 Answers1

0

You got a lot wrong in your attempt, so much that it was faster to write a working version than list out all the individual problems in the code in your question. So here is a working version of what it appears you were trying to do:

#include <algorithm>
#include <cstdlib>
#include <iostream>

const int N = 3;

// Element-wise matrix addition C = A + B for N x N matrices.
// Expects a 2-D launch whose (x, y) thread grid covers at least N x N
// threads; out-of-range threads exit immediately.
__global__ void MatAdd(float A[][N], float B[][N], float C[][N])
{
    int row = blockIdx.x * blockDim.x + threadIdx.x;
    int col = blockIdx.y * blockDim.y + threadIdx.y;
    if (row >= N || col >= N)
        return;                     // thread lies outside the matrix
    C[row][col] = A[row][col] + B[row][col];
}

int main()
{
    // Fail fast with a readable message if any CUDA runtime call errors out.
    // (The original checked no return codes; an earlier sticky error would
    // have made every later call fail mysteriously.)
    auto check = [](cudaError_t status, const char* what) {
        if (status != cudaSuccess) {
            std::cerr << what << " failed: "
                      << cudaGetErrorString(status) << std::endl;
            std::exit(EXIT_FAILURE);
        }
    };

    // Managed memory is reachable from both host and device, so no separate
    // host allocation or explicit cudaMemcpy is required.
    float* A; check(cudaMallocManaged(&A, N*N*sizeof(float)), "cudaMallocManaged(A)");
    float* B; check(cudaMallocManaged(&B, N*N*sizeof(float)), "cudaMallocManaged(B)");
    float* C; check(cudaMallocManaged(&C, N*N*sizeof(float)), "cudaMallocManaged(C)");

    // Brace-initialization only works in a declaration, so stage the values
    // in true 2-D arrays and copy them into the flat managed buffers.
    const float A_vals[N][N]={{1,0,0},{0,1,0},{0,0,1}};
    const float B_vals[N][N]={{1,0,0},{0,1,0},{0,0,1}};
    // View the linear managed buffer through a pointer-to-row-of-N so the
    // host can use the same [i][j] syntax as the kernel.
    float (*C_vals)[N] = reinterpret_cast<float (*)[N]>(C);

    std::copy(&A_vals[0][0], &A_vals[0][0] + N*N, A);
    std::copy(&B_vals[0][0], &B_vals[0][0] + N*N, B);

    // One 16x16 block comfortably covers the 3x3 problem; the kernel's
    // bounds check discards the surplus threads.
    dim3 threadsPerBlock(16, 16);
    dim3 numBlocks(1, 1);
    MatAdd<<<numBlocks, threadsPerBlock>>>( reinterpret_cast<float (*)[N]>(A),
                                            reinterpret_cast<float (*)[N]>(B),
                                            C_vals );
    // A kernel launch returns no status: catch launch-configuration errors
    // via cudaGetLastError, and asynchronous execution errors at the sync
    // (which also makes the managed results visible to the host).
    check(cudaGetLastError(), "MatAdd launch");
    check(cudaDeviceSynchronize(), "cudaDeviceSynchronize");

    for(int i=0; i<N; i++) {
        for(int j=0; j<N; j++) {
            std::cout << C_vals[i][j] << "  ";
        }
        std::cout << std::endl;
    }

    // Release the managed allocations -- the original leaked all three.
    check(cudaFree(A), "cudaFree(A)");
    check(cudaFree(B), "cudaFree(B)");
    check(cudaFree(C), "cudaFree(C)");

    return 0;
}

Some important points:

  1. Managed memory allocation replaces standard host memory allocation and produces memory which is directly accessible on both the host and the device.
  2. All arrays decay to a pointer when passed as arguments to a function by value. That decay is not recursive. See here for more details.
  3. You can (and will need to) cast in order to use the [][] access syntax on linear memory allocated dynamically at runtime (this applies to malloc, new, or any of the CUDA host memory allocation APIs. See here for more details).
  4. Initialization syntax and assignment syntax for arrays are not interchangeable.

All I can suggest is that you study it thoroughly until you understand how it works.

talonmies
  • 67,081
  • 33
  • 170
  • 244
  • Sir, Thanks for your elaborated answer regarding this question. I surely don't have much of a knowledge when it comes to the memory allocation part of the program. I am a Physics major student and a python programmer so my path doesn't cross with the memory elements. This summer I picked up CUDA for my simulation project and did some huge mistakes along the way. Nevertheless, there is always a learning curve. – Galilean Jul 12 '18 at 09:44