I'm working on a project where I need my CUDA device to make computations on a struct containing pointers.
typedef struct StructA {
int* arr;
} StructA;
When I allocate memory for the struct and then copy it to the device, it will only copy the struct and not the content of the pointer. Right now I'm working around this by allocating the pointer first, then set the host struct to use that new pointer (which resides on the GPU). The following code sample describes this approach using the struct from above:
#define N 10
int main() {
int h_arr[N] = {1,2,3,4,5,6,7,8,9,10};
StructA *h_a = (StructA*)malloc(sizeof(StructA));
StructA *d_a;
int *d_arr;
// 1. Allocate device struct.
cudaMalloc((void**) &d_a, sizeof(StructA));
// 2. Allocate device pointer.
cudaMalloc((void**) &(d_arr), sizeof(int)*N);
// 3. Copy pointer content from host to device.
cudaMemcpy(d_arr, h_arr, sizeof(int)*N, cudaMemcpyHostToDevice);
// 4. Point to device pointer in host struct.
h_a->arr = d_arr;
// 5. Copy struct from host to device.
cudaMemcpy(d_a, h_a, sizeof(StructA), cudaMemcpyHostToDevice);
// 6. Call kernel.
kernel<<<N,1>>>(d_a);
// 7. Copy struct from device to host.
cudaMemcpy(h_a, d_a, sizeof(StructA), cudaMemcpyDeviceToHost);
// 8. Copy pointer from device to host.
cudaMemcpy(h_arr, d_arr, sizeof(int)*N, cudaMemcpyDeviceToHost);
// 9. Point to host pointer in host struct.
h_a->arr = h_arr;
}
My question is: Is this the way to do it?
It seems like an awful lot of work, and I remind you that this is a very simple struct. If my struct contained a lot of pointers or structs with pointers themselves, the code for allocation and copy will be quite extensive and confusing.