I am using Cuda with C++ to do some parallel computing. Recently, I noticed something that I cannot understand and I didn't find informations about it when looking for it. In my code, one line which is very seldom exectued (but needed) slows down the program even when it is not executed at all. Here is some code to make it more clear:
The class I created:
class Foo
{
void myFunction(Foo *listFoo);
//some other functions that I need
...
int myAttribute;
//some other attributes that I need
...
}
The definition of myFunction:
void Foo::myFunction(Foo *listFoo)
{
//do some computations on the listFoo
if( condition seldom verified )
{ myAttribute = myAttribute + 1; }
}
The global function:
__global__ void compute(Foo *listFoo, int numberOfFoo)
{
int i = threadIdx.x + blockIdx.x * blockDim.x;
if( i < numberOfFoo)
{ listFoo[i].myFunction(listFoo); }
}
The host code:
compute<<<(numberOfFoo + 511)/512, 512>>> (listFoo, numberOfFoo)
The line slowing down everything is myAttribute = myAttribute + 1. Even when it is executed 0 times, the code is really slow compared to when the line is put in the comment. I tried to replace this line with a simple printf. The result is the same, the line is never executed but it slows down everything.
If you have any suggestion on the reason and on eventually how to solve this problem, it would be very much appreciated. My level in programing is not so advanced, so please use relatively easy explanations.
Thanks a lot
First Edit: few people requested the code, so here it is! I reduced it to 700 lines, I know it is still very long but not much would work if I keep removing some parts of it. It compiles without problems for me. All you have to do is press enter, wait few seconds and the time taken will be shown in the command window.
It is in the function findContactwithGrain() that the problem occurs. The line addContact(grainContact) is slowing down everything. On my computer, if this line is active, one computation takes around 3.5 sec. If I put it in comment, it takes 0.07 sec. That's a huge difference for one line that is never executed.
Hope this helps to understand the problem
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <fstream> // to read and write files
#include <stdio.h>
#include <iostream>
#include <time.h>
#include <string>
#include <sstream>
#define n 200
using namespace std;
int global_totalNumberBlock = 0;
int global_totalNumberGrain = 0;
//tell the compiler that those classes exist
class Vec3d2;
class Block;
class Grain;
class Contact;
class Analysis;
class Vec3d2
{
public:
__host__ __device__ Vec3d2(void);
__host__ __device__ Vec3d2(double x_value, double y_value, double z_value);
__host__ __device__ ~Vec3d2(void);
__host__ __device__ double dot(Vec3d2 a) const;
__host__ __device__ Vec3d2 cross(Vec3d2 a) const;
__host__ __device__ double norm() const;
__host__ __device__ void normalize();
// to be able to use cout easily
__host__ __device__ friend ostream & operator <<(ostream &s,const Vec3d2 &vec)
{
s << vec.x << endl;
s << vec.y << endl;
s << vec.z << endl;
return s;
}
//to be able to use brackets
__host__ __device__ double operator [](int i) const
{
if( i == 0)
{
return x;
}
else if( i == 1)
{
return y;
}
else if( i == 2)
{
return z;
}
else
{
cout << "ERROR IN USING VEC3D2" << endl;
system("PAUSE");
}
}
__host__ __device__ double & operator [](int i)
{
if( i == 0)
{
return x;
}
else if( i == 1)
{
return y;
}
else if( i == 2)
{
return z;
}
else
{
cout << "ERROR IN USING VEC3D2" << endl;
system("PAUSE");
}
}
//attributes
double x, y, z;
};
//Class Vec3d2 functions and operators
Vec3d2::Vec3d2()
{
x = 0;
y = 0;
z = 0;
}
Vec3d2::Vec3d2(double x_value, double y_value, double z_value)
{
x = x_value;
y = y_value;
z = z_value;
}
Vec3d2::~Vec3d2()
{
}
double Vec3d2::dot(Vec3d2 a) const
{
return x*a.x + y*a.y + z*a.z;
}
Vec3d2 Vec3d2::cross(Vec3d2 a) const
{
Vec3d2 result( y*a.z - z*a.y, x*a.z - z*a.x, x*a.y - y*a.x);
return result;
}
double Vec3d2::norm() const
{
return sqrt((double) x*x + y*y + z*z);
}
void Vec3d2::normalize()
{
double norm = this->norm();
if (norm > 0)
{
x = x/norm;
y = y/norm;
z = z/norm;
}
else //the vector has a null norm so nothing to do
{
}
}
__host__ __device__ Vec3d2 operator+(Vec3d2 const& a, Vec3d2 const& b)
{
return Vec3d2(a.x + b.x, a.y + b.y, a.z + b.z);
}
__host__ __device__ Vec3d2 operator-(Vec3d2 const& a, Vec3d2 const& b)
{
return Vec3d2(a.x - b.x, a.y - b.y, a.z - b.z);
}
__host__ __device__ Vec3d2 operator*(Vec3d2 const& a, double const& b)
{
return Vec3d2(b*a.x, b*a.y, b*a.z);
}
__host__ __device__ Vec3d2 operator*(double const& b, Vec3d2 const& a)
{
return Vec3d2(b*a.x, b*a.y, b*a.z);
}
__host__ __device__ Vec3d2 operator/(Vec3d2 const& a, double const& b)
{
return Vec3d2(a.x/b, a.y/b, a.z/b);
}
__host__ __device__ Vec3d2 operator/(double const& b, Vec3d2 const& a)
{
return Vec3d2(a.x/b, a.y/b, a.z/b);
}
__host__ __device__ bool operator==(Vec3d2 const& a, Vec3d2 const& b)
{
if(a.x == b.x && a.y == b.y && a.z == b.z)
{
return true;
}
else
{
return false;
}
}
__host__ __device__ bool operator!=(Vec3d2 const& a, Vec3d2 const& b)
{
if( a.x != b.x || a.y != b.y || a.z != b.z)
{
return true;
}
else
{
return false;
}
}
class Contact
{
public:
__host__ __device__ Contact(void);
//__host__ __device__ Contact(Contact const& ContactToCopy);
__host__ __device__ ~Contact(void);
__host__ __device__ void setContact(Grain &grain1, Grain &grain2, double overlap_value);
};
class Block
{
public:
__host__ Block(void);
__host__ Block(Block const& BlockToCopy);
__host__ __device__ ~Block(void);
__host__ __device__ Contact* getContactList() const;
__host__ __device__ Contact** getContactListPtr();
__host__ __device__ int getMaxNumberContact() const;
__host__ __device__ int getNumberContact() const;
__host__ __device__ void setContactList(Contact *ptr);
__host__ __device__ void addContact(Contact contact_value);
__host__ __device__ void clearContactList();// empty the contactList
__host__ __device__ void deleteBlockData(); //clear the memory taken by the contactList
__host__ __device__ Block& operator=(Block const& BlockToCopy);
protected:
int Id; //unique Id number for each entity double mass;
int totalNumberBlock; //same value for each block, cannot use static attribute because of cuda
Contact *contactList;
int numberContact, old_numberContact; //because there is no way to find it from the pointer contactList
int maxNumberContact; //maximum number of contact per block, we have to choose this
};
class Grain: public Block
{
public:
__host__ Grain(void);
__host__ Grain(Grain const& grainToCopy);
__host__ Grain(Vec3d2 position_value, double radius_value, double mass_value);
__host__ __device__ ~Grain(void);
__host__ __device__ Vec3d2 getPositionVec() const;
__host__ __device__ Vec3d2* getPosition() const;
__host__ __device__ Vec3d2** getPositionPtr();
__host__ __device__ int getTotalNumberGrain() const;
__host__ void setTotalNumberGrain();
__host__ __device__ void setTotalNumberGrain(int number);
__host__ __device__ void setPosition(Vec3d2 *ptr);
__host__ __device__ void setPositionVec(Vec3d2 position_value);
__host__ __device__ void deleteGrainData();
__host__ __device__ void findContactwithGrain(Grain *grainList);
__host__ __device__ Grain& operator=(Grain const& grainToCopy);
__host__ __device__ friend ostream & operator <<(ostream &s,const Grain &grain)
{
s <<"position is" << endl;
s << *grain.position << endl;
s <<"grain number is" << endl;
s << grain.number << endl;
s <<"radius is" << endl;
s << grain.radius << endl;
s <<"mass is" << endl;
return s;
}
private:
Vec3d2 *position;
int totalNumberGrain;
int number; //different from Id defined in class Block because a wall could have the same number as a grain
double radius;
};
class Analysis
{
public:
Analysis(void);
Analysis(Grain *grainList);
~Analysis(void);
Grain* getGrainList();
void copyToDevice();
void copyToHost();
void runAnalysis();
private:
//should contain grainList, wallList and their equivalent for the device
//should contain an array of pointers for each attribute being a pointer in grain and wall and their equivalent in the device
int totalNumberGrain, totalNumberWall;
Grain *grainList, *d_grainList;
//for grain data
Contact **grain_contactList, **d_grain_contactList;
Vec3d2 **grain_position, **d_grain_position;
};
//class Contact functions
Contact::Contact(void)
{
}
Contact::~Contact(void)
{
}
void Contact::setContact(Grain &grain1, Grain &grain2, double overlap_value)//we are in grain1 and contact with grain2
{
}
//class Block functions
Block::Block(void)
{
Id = global_totalNumberBlock;
numberContact = 0;
old_numberContact = 0;
//contact list settings
maxNumberContact = 30;
contactList = new Contact[maxNumberContact];
//increment of block number
global_totalNumberBlock = global_totalNumberBlock + 1;
}
Block::~Block(void)
{
delete[] contactList;
//cout << "CAREFUL, YOU ARE DESTROYING A BLOCK" << endl;//because we should never erase a block
//system("PAUSE");
totalNumberBlock = totalNumberBlock - 1;
}
Block::Block(Block const& BlockToCopy)
{
Id = BlockToCopy.Id;
numberContact = BlockToCopy.numberContact;
old_numberContact = BlockToCopy.old_numberContact;
maxNumberContact = BlockToCopy.maxNumberContact;
contactList = new Contact[maxNumberContact];
for(int i =0; i <numberContact; i++)
{
contactList[i] = BlockToCopy.contactList[i];
}
}
Contact* Block::getContactList() const
{
return contactList;
}
Contact** Block::getContactListPtr()
{
return &contactList;
}
int Block::getMaxNumberContact() const
{
return maxNumberContact;
}
int Block::getNumberContact() const
{
return numberContact;
}
void Block::setContactList(Contact *ptr)
{
//no "delete contactList" here because this is executed after cuda. The contactList is pointing to nothing and deleteing it will cause an error
contactList = ptr;
}
void Block::addContact(Contact contact_value)
{
if(numberContact < maxNumberContact)
{
contactList[numberContact] = contact_value;
numberContact = numberContact + 1;
}
else //find a way to throw an error because the list is too small for all the contacts
{
printf("TOO MANY CONTACTS ON ONE GRAIN");
}
}
void Block::clearContactList()
{
//delete[] contactList;
//contactList = new Contact[maxNumberContact];
if(numberContact > 0)
{
numberContact = 0;
}
}
void Block::deleteBlockData()
{
delete[] contactList;
}
__host__ __device__ Block& Block::operator=(Block const& BlockToCopy)
{
if(this != &BlockToCopy) //to check we are not doing a = a
{
Id = BlockToCopy.Id;
numberContact = BlockToCopy.numberContact;
old_numberContact = BlockToCopy.old_numberContact;
maxNumberContact = BlockToCopy.maxNumberContact;
delete[] contactList;
contactList = new Contact[maxNumberContact];
for(int i =0; i <numberContact; i++)
{
contactList[i] = BlockToCopy.contactList[i];
}
}
return *this;
}
//class Grain functions
Grain::Grain(void)
{
number = global_totalNumberGrain;
global_totalNumberGrain = global_totalNumberGrain + 1;
totalNumberGrain = -1;//safety
//initialize Vec3d2
position = new Vec3d2;
}
Grain::Grain(Grain const& grainToCopy)
{
cout <<"COPY CONSTRUCTOR OF GRAIN IS NOT DONE YET"<<endl;
system("PAUSE");
//totalNumberGrain = grainToCopy.totalNumberGrain;
//radius = grainToCopy.radius;
//diameter = grainToCopy.diameter;
//volume = grainToCopy.volume;
//inertia = grainToCopy.inertia;
//position = new Vec3d2;
//old_position = new Vec3d2;
//old_velocity = new Vec3d2;
//old_acceleration = new Vec3d2;
//old_angularVelocity = new Vec3d2;
//old_angularAcceleration = new Vec3d2;
//gravityForce = new Vec3d2;
//*position = *grainToCopy.position;
//*old_position = *grainToCopy.old_position;
//*old_velocity = *grainToCopy.old_velocity;
//*old_acceleration = *grainToCopy.old_acceleration;
//*old_angularVelocity = *grainToCopy.old_angularVelocity;
//*old_angularAcceleration = *grainToCopy.old_angularAcceleration;
//*gravityForce = *grainToCopy.gravityForce;
}
Grain::Grain(Vec3d2 position_value, double radius_value,double mass_value)//, number(totalNumberGrain)
{
number = global_totalNumberGrain;
global_totalNumberGrain = global_totalNumberGrain + 1;
totalNumberGrain = -1;//safety
radius = radius_value;
//initialize all the Vec3d2 parameters
position = new Vec3d2;
*position = position_value;
}
Grain::~Grain(void)
{
//cout << "CAREFUL, YOU ARE DESTROYING A GRAIN" << endl;//because we should never erase a block
//system("PAUSE");
totalNumberGrain = totalNumberGrain - 1;
delete position;
}
Vec3d2 Grain::getPositionVec() const
{
return *position;
}
Vec3d2* Grain::getPosition() const
{
return position;
}
Vec3d2** Grain::getPositionPtr()
{
return &position;
}
int Grain::getTotalNumberGrain() const
{
return totalNumberGrain;
}
void Grain::setTotalNumberGrain()
{
totalNumberGrain = global_totalNumberGrain;
}
void Grain::setTotalNumberGrain(int number)
{
totalNumberGrain = number;
}
void Grain::setPosition(Vec3d2 *ptr)
{
position = ptr;
}
void Grain::setPositionVec(Vec3d2 position_value)
{
*position = position_value;
}
void Grain::deleteGrainData()
{
delete position;
}
void Grain::findContactwithGrain(Grain *grainList)
{
for(int m = 0; m < n; m++)
{
double length;
length = (*position - (*grainList[m].position)).norm();
if( length < radius + grainList[m].radius)
{
if( number != grainList[m].number) //faster than number != sortedGrainList[m]
{
Vec3d2 relativePosition = *position - (*grainList[m].position) ;
double overlap = radius + grainList[m].radius - relativePosition.norm();
//define the contact
Contact grainContact;
grainContact.setContact(*this, grainList[m], overlap);
addContact(grainContact); //IF YOU PUT THIS LINE IN COMMENT, EVERYTHING GOES A LOT FASTER
}
}
}
}
__host__ __device__ Grain& Grain::operator=(Grain const& grainToCopy)
{
if(this != &grainToCopy)
{
Block::operator=(grainToCopy); //this lines call the operator = defined for Block. So it copies the block attributes of the first grain into the second grain
//totalNumberGrain = grainToCopy.totalNumberGrain;
radius = grainToCopy.radius;
*position = *grainToCopy.position;
}
return *this;
}
//class Analysis functions
Analysis::Analysis(void)
{
}
Analysis::Analysis(Grain *grainList_value)
{
totalNumberGrain = grainList_value[0].getTotalNumberGrain();
grainList = new Grain[totalNumberGrain];
//copy grains
for(int i = 0; i < totalNumberGrain; i++)
{
grainList[i] = grainList_value[i];
grainList[i].setTotalNumberGrain(grainList_value[i].getTotalNumberGrain());
}
}
Analysis::~Analysis(void)
{
delete[] grainList;
//a lot more delete should be made here
}
Grain* Analysis::getGrainList()
{
return grainList;
}
void Analysis::copyToDevice()
{
//declare device grainList and wallList and copy the values
cudaMalloc(&d_grainList, totalNumberGrain*sizeof(Grain));
cudaMemcpy(d_grainList, grainList, totalNumberGrain*sizeof(Grain), cudaMemcpyHostToDevice);
////declare device list of pointer to pass pointer values of grain
d_grain_contactList = new Contact*[totalNumberGrain];
d_grain_position = new Vec3d2*[totalNumberGrain];
for(int i = 0; i < totalNumberGrain; i++)
{
cudaMalloc(&d_grain_contactList[i], grainList[i].getMaxNumberContact()*sizeof(Contact));
cudaMalloc(&d_grain_position[i], sizeof(Vec3d2));
}
//copy pointers and values for grains
for(int i = 0; i < totalNumberGrain; i++)
{
//pointers
cudaMemcpy(d_grainList[i].getContactListPtr(), &d_grain_contactList[i], sizeof(Contact*), cudaMemcpyHostToDevice);
cudaMemcpy(d_grainList[i].getPositionPtr(), &d_grain_position[i], sizeof(Vec3d2*), cudaMemcpyHostToDevice);
//values
cudaMemcpy(d_grain_contactList[i], grainList[i].getContactList(), grainList[i].getMaxNumberContact()*sizeof(Contact), cudaMemcpyHostToDevice);
cudaMemcpy(d_grain_position[i], grainList[i].getPosition(), sizeof(Vec3d2), cudaMemcpyHostToDevice);
}
}
void Analysis::copyToHost()
{
//delete the pointer value or it will create a memory leak
for(int i = 0; i < totalNumberGrain; i++)
{
grainList[i].deleteBlockData();
grainList[i].deleteGrainData();
}
//copy non pointer value
cudaMemcpy(grainList, d_grainList, totalNumberGrain*sizeof(Grain),cudaMemcpyDeviceToHost);
//copy pointer values for grains
grain_contactList = new Contact*[totalNumberGrain];
grain_position = new Vec3d2*[totalNumberGrain];
for(int i = 0; i < totalNumberGrain; i++)
{
grain_contactList[i] = new Contact[grainList[i].getMaxNumberContact()];
grain_position[i] = new Vec3d2;
grainList[i].setContactList(grain_contactList[i]);
grainList[i].setPosition(grain_position[i]);
cudaMemcpy(grain_contactList[i], d_grain_contactList[i], grainList[i].getMaxNumberContact()*sizeof(Contact), cudaMemcpyDeviceToHost);
cudaMemcpy(grain_position[i], d_grain_position[i], sizeof(Vec3d2), cudaMemcpyDeviceToHost);
}
}
__global__ void compute( Grain *g)
{
int i = threadIdx.x + blockIdx.x * blockDim.x;
//__syncthreads();
if( i < n )
{
g[i].findContactwithGrain(g);
}
}
void Analysis::runAnalysis()
{
for(int i = 0; i < 3; i ++)
{
clock_t begin = clock();
for(int j = 0; j < 10000; j++)
{
compute<<<(n + 511)/512, 512>>>(d_grainList);
}
clock_t end = clock();
cout << (double)(end-begin)/CLOCKS_PER_SEC << endl;
system("PAUSE");
}
}
int main(void)
{
//grain
Vec3d2 position1; position1[0] = 0;position1[1] = 0;position1[2] = 0;
double radius1 = 1;
////cuda
cout << "PRESS ENTER TO START" << endl;
system("PAUSE");
clock_t begin = clock();
Grain *g, *d_g;
g = new Grain[n];
for(int i = 0; i<n; i++)
{
g[i].setTotalNumberGrain();
}
Grain g1(position1, radius1, 0.1);
for(int i = 0; i <n; i++)
{
g[i] = g1;
g[i].setPositionVec(Vec3d2(3*i+1.5, 1.5, 0));
}
Analysis a(g);
a.copyToDevice();
a.runAnalysis();
clock_t end = clock();
cout << (double)(end-begin)/CLOCKS_PER_SEC << endl;
return 0;
}