0

I have this functions, the result is correct but the compiler don't vectorize this. How can I achive that the compiler vectorize this and how can I optimize this codes?

void LongNumSet( char *L, unsigned N, char digit )
{
    for (int i = 0; i < N; ++i){
      L[i] = digit;
   }
}

void LongNumCopy( char *Vin, char *Vout, unsigned N )
{
  for ( int i=0; i< N; ++i )
  {
    Vout[i] = Vin[i];
  }
}

char LongNumAddition( char *__restrict Vin1, char * __restrict Vin2, char * __restrict Vout, unsigned N )
{
  char CARRY = 0,R,aux;

  Vin1 = (char*)__builtin_assume_aligned (Vin1, 1);
  Vin2 = (char*)__builtin_assume_aligned (Vin2, 1);


  for ( int i=0; i< N; ++i )
  {

    char R = Vin1[i] + Vin2[i] + CARRY;
    aux = R <= 9;
    Vout[i] = (aux) ? R:R-ten;

    CARRY = (aux) ? 0:1;
  }
  return CARRY;
}
char LongNumAddDigit( char *V, char digit, unsigned N )
{
  int i=0;
  char R = V[0] + digit;

  if ( R < ten){
    V[0] = R;
    return 0;
  }
  V[0] = R-ten;

  // add carry, maybe iteratively for all digits
  char CARRY = 1;
  i = 1;

  while ( CARRY && i < N )
  {

    if ( V[i] < 9 )
    {
     V[i]++;
     CARRY = 0;
    }
    else
    {
      V[i] = 0;
      i++;  // CARRY remains set to 1
    }
  }
  return CARRY;
}

I use the comand gcc -O3 -ffast-math -msse -funroll-all-loops -ftree-vectorizer-verbose=25 -lm -g $1 -o ${2}.O3 and I executate the program in 55 s. This is all of code:

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

// Variable used to generate pseudo-random numbers
unsigned int seed;
unsigned int temp;
unsigned int var1 = 214013;
unsigned int var2 = 2531011;
#define val13 13
#define ten 10
// Function to generate pseudo-random numbers
inline int myRandom() {
  temp = var1*seed;
  seed = temp + var2;
  return (seed>>val13);
}


void LongNumInit( char *L, unsigned N )
{
  for ( int i=0; i<N;++i )
  {
    L[i] = myRandom() % ten;  // digito decimal
  }
}

void LongNumPrint( char *L, unsigned N, char *Name )
{
  printf("%s:", Name);
  for ( int i=N; i>0; i-- )
  {
    printf("%d", L[i-1]);
  }
  printf("\n");
}


void LongNumSet( char *L, unsigned N, char digit )
{
    for (int i = 0; i < N; ++i){
      L[i] = digit;
   }
}


void LongNumCopy( char *Vin, char *Vout, unsigned N )
{
  for ( int i=0; i< N; ++i )
  {
    Vout[i] = Vin[i];
  }
}

char LongNumAddition( char *__restrict Vin1, char * __restrict Vin2, char * __restrict Vout, unsigned N )
{
  char CARRY = 0,R,aux;

  Vin1 = (char*)__builtin_assume_aligned (Vin1, 1);
  Vin2 = (char*)__builtin_assume_aligned (Vin2, 1);
  for ( int i=0; i< N; ++i )
  {

    char R = Vin1[i] + Vin2[i] + CARRY;
    aux = R <= 9;
    Vout[i] = (aux) ? R:R-ten;

    CARRY = (aux) ? 0:1;
  }
  return CARRY;
}


char LongNumAddDigit( char *V, char digit, unsigned N )
{
  int i=0;
  char R = V[0] + digit;

  if ( R < ten){
    V[0] = R;
    return 0;
  }
  V[0] = R-ten;

  // add carry, maybe iteratively for all digits
  char CARRY = 1;
  i = 1;

  while ( CARRY && i < N )
  {

    if ( V[i] < 9 )
    {
     V[i]++;
     CARRY = 0;
    }
    else
    {
      V[i] = 0;
      i++;  // CARRY remains set to 1
    }
  }
  return CARRY;
}


char LongNumHorizAdd( char *Vin, char *Vout, unsigned N )
{
  char CARRY = 0;
  LongNumSet ( Vout, N, 0 );
  for ( int i=0; i< N; ++i )
  {
    LongNumAddDigit ( Vout, Vin[i], N );

  }
  return 0; // CARRY can never be set
}
char LongNumConstMult( char *V, unsigned N, char digit )
{
  char CARRY = 0;
  char ja = 0;

  for ( int i=0; i< N; ++i )
  {
    char aux = V[i] * digit;
    char R = aux + CARRY;
    CARRY = ((u_int32_t)R * (u_int32_t)0xCCCD) >> 19;
    ja = (CARRY << 3) + 2*CARRY;
    R -= ja;
    V[i] = R;
  }
  return CARRY; // may be from 0 to 9
}
void LongNumMultiply( char *Vin1, char *Vin2, char *VoutH, char *VoutL,  unsigned N )
{

  // Create Temporal Long Integer with double size
  unsigned char *TEMP = (unsigned char*) calloc(2*N,sizeof(unsigned char));
  unsigned char *RES = (unsigned char*) calloc( 2*N,sizeof(unsigned char) );

  LongNumSet  ( RES, 2*N, 0 );    // Set RES to 0

  for ( int i=0; i<N; ++i )
  {
    LongNumSet  ( TEMP, 2*N, 0 );            // Set TEMP to 0
    LongNumCopy ( Vin1, TEMP+i, N );         // Copy Vin1 -> TEMP, with offset i
    LongNumConstMult( TEMP, 2*N, Vin2[i] );  // TEMP * Vin2[i] -> TEMP
    LongNumAddition ( TEMP, RES, RES, 2*N ); // TEMP + RES -> RES
  }

  // Result goes to VoutH-VoutL
  LongNumCopy ( RES,   VoutL, N );  // Copy RES   -> VoutL
  LongNumCopy ( RES+N, VoutH, N );  // Copy RES+N -> VoutH
}
int main (int argc, char **argv)
{
  int i, sum1, sum2, sum3, N=10000, Rep=50;

  seed = 12345;

  // obtain parameters at run time
  if (argc>1) { N    = atoi(argv[1]); }
  if (argc>2) { Rep  = atoi(argv[2]); }

  printf("Challenge #3: Vector size is %d. Repeat %d times\n", N, Rep);

  // Create Long Nums
  unsigned char *V1= (unsigned char*) malloc( N*sizeof(unsigned char) );
  unsigned char *V2= (unsigned char*) malloc( N*sizeof(unsigned char) );
  unsigned char *V3= (unsigned char*) malloc( N*sizeof(unsigned char) );
  unsigned char *V4= (unsigned char*) malloc( N*sizeof(unsigned char) );

  LongNumInit ( V1, N ); LongNumInit ( V2, N ); LongNumInit ( V3, N );

  // Repeat
  for (i=0; i<Rep; i++)
  {
    LongNumAddition ( V1, V2, V4, N );
    LongNumMultiply ( V3, V4, V2, V1, N );
    LongNumHorizAdd ( V1, V2, N );
    LongNumAddDigit ( V3, V2[0], N );
  }

  // Print last 32 digits of Long Numbers
  LongNumPrint( V1, 32, "V1" );
  LongNumPrint( V2, 32, "V2" );
  LongNumPrint( V3, 32, "V3" );
  LongNumPrint( V4, 32, "V4" );

  free(V1); free(V2); free(V3); free(V4);
  return 0;
}
Peter Cordes
  • 245,674
  • 35
  • 423
  • 606
  • you could check: https://stackoverflow.com/questions/33758993/why-doesnt-gcc-show-vectorization-information/33760780 – Nicolae Natea Apr 05 '20 at 17:03
  • Thank you, however this only shows me that parts do not vectorize does not tell me him because. For example in functions LongNumSet, LogNumAddition, LongNumAddDigit doesn't vectorize but I don't know what transformations I can perform so that the compiler can vectorize these functions. The errors obtained are as follows: couldn't vectorize loop, not vectorized: control flow in loop, not vectorized: control flow in loop... @NicolaeNatea – Desiree Pasion Rodriguez Sanch Apr 05 '20 at 17:33
  • 1
    Bigint addition and multiplication are notorious for being [annoying to vectorize](https://stackoverflow.com/q/8866973/555045). – harold Apr 05 '20 at 20:04
  • 2
    longnumset is not vectorized because the loop is replaced with a call to memset, even better. With `__restrict`, longnumcopy would become memcpy. – Marc Glisse Apr 05 '20 at 20:08
  • 2
    I don't see the point of manually vectorizing this. If you are doing it for performance, then `char` is the first thing to fix (you need to fix it anyway because you don't know if char is signed or not). – Marc Glisse Apr 05 '20 at 20:12

1 Answers1

1

Acording to your usage, instead of LongNumSet you could create and use LongNumClear(not much improvement). Below are some other potential rewrites of some of your functions. I think you should notice some improvements. For me it's around 44%. I also changed the type from char to unsigned.

#include <string.h>

void LongNumClear(uint8_t *L, size_t N) {
   memset (L, 0, N);
}

void LongNumCopy(const uint8_t *Vin, uint8_t *Vout, size_t N) {
   memcpy(Vout, Vin, N);
}

uint8_t LongNumAddition(uint8_t * Vin1, uint8_t * Vin2, uint8_t * Vout, size_t N) {
  uint8_t carry = 0;

  for (size_t i=0; i < N; ++i) {
    Vout[i] = Vin1[i] + Vin2[i] + carry;

    carry = (Vout[i] > 9);

    if (carry) {
        Vout[i] -= ten;
    }
  }

  return carry;
}

uint8_t LongNumAddDigit(uint8_t *V, uint8_t digit, size_t N) {
  size_t i=0;

  V[0] += digit;

  if (V[0] < ten) {
    return 0;
  }

  V[0] -= ten;

  while ((++i < N) && (V[i] >= 9)) {
      V[i] = 0;
  }

  if ((i != N) && (V[i] < 9)) {
      V[i]++;
      return 0;
  }

  return 1;
}

uint8_t LongNumConstMult(uint8_t *V, size_t N, uint8_t digit) {
  uint8_t carry = 0;

  for (size_t i = 0; i < N; ++i ) {
    V[i] = V[i] * digit + carry;
    carry = ((u_int32_t)V[i] * (u_int32_t)0xCCCD) >> 19; // divide by 10
    V[i] -= ((carry << 3) + (carry << 1));
  }

  return carry;
}
Nicolae Natea
  • 1,087
  • 8
  • 12