I am trying to convert a loop I have into a SSE intrinsics. I seem to have made fairly good progress, and by that I mean It's in the correct direction however I appear to have done some of the translation wrong somewhere as I am not getting the same "correct" answer which results from the non-sse code.
My original loop which I unrolled by a factor of 4 looks like this:
int unroll_n = (N/4)*4;
for (int j = 0; j < unroll_n; j++) {
for (int i = 0; i < unroll_n; i+=4) {
float rx = x[j] - x[i];
float ry = y[j] - y[i];
float rz = z[j] - z[i];
float r2 = rx*rx + ry*ry + rz*rz + eps;
float r2inv = 1.0f / sqrt(r2);
float r6inv = r2inv * r2inv * r2inv;
float s = m[j] * r6inv;
ax[i] += s * rx;
ay[i] += s * ry;
az[i] += s * rz;
//u
rx = x[j] - x[i+1];
ry = y[j] - y[i+1];
rz = z[j] - z[i+1];
r2 = rx*rx + ry*ry + rz*rz + eps;
r2inv = 1.0f / sqrt(r2);
r6inv = r2inv * r2inv * r2inv;
s = m[j] * r6inv;
ax[i+1] += s * rx;
ay[i+1] += s * ry;
az[i+1] += s * rz;
//unroll i 3
rx = x[j] - x[i+2];
ry = y[j] - y[i+2];
rz = z[j] - z[i+2];
r2 = rx*rx + ry*ry + rz*rz + eps;
r2inv = 1.0f / sqrt(r2);
r6inv = r2inv * r2inv * r2inv;
s = m[j] * r6inv;
ax[i+2] += s * rx;
ay[i+2] += s * ry;
az[i+2] += s * rz;
//unroll i 4
rx = x[j] - x[i+3];
ry = y[j] - y[i+3];
rz = z[j] - z[i+3];
r2 = rx*rx + ry*ry + rz*rz + eps;
r2inv = 1.0f / sqrt(r2);
r6inv = r2inv * r2inv * r2inv;
s = m[j] * r6inv;
ax[i+3] += s * rx;
ay[i+3] += s * ry;
az[i+3] += s * rz;
}
}
I essentially then went line by line for the top section and converted it into SSE intrinsics. The code is below. I'm not totally sure if the top three lines are needed however I understand that my data needs to be 16bit aligned for this to work correctly and optimally.
float *x = malloc(sizeof(float) * N);
float *y = malloc(sizeof(float) * N);
float *z = malloc(sizeof(float) * N);
for (int j = 0; j < N; j++) {
for (int i = 0; i < N; i+=4) {
__m128 xj_v = _mm_set1_ps(x[j]);
__m128 xi_v = _mm_load_ps(&x[i]);
__m128 rx_v = _mm_sub_ps(xj_v, xi_v);
__m128 yj_v = _mm_set1_ps(y[j]);
__m128 yi_v = _mm_load_ps(&y[i]);
__m128 ry_v = _mm_sub_ps(yj_v, yi_v);
__m128 zj_v = _mm_set1_ps(z[j]);
__m128 zi_v = _mm_load_ps(&z[i]);
__m128 rz_v = _mm_sub_ps(zj_v, zi_v);
__m128 r2_v = _mm_mul_ps(rx_v, rx_v) + _mm_mul_ps(ry_v, ry_v) + _mm_mul_ps(rz_v, rz_v) + _mm_set1_ps(eps);
__m128 r2inv_v = _mm_div_ps(_mm_set1_ps(1.0f),_mm_sqrt_ps(r2_v));
__m128 r6inv_1v = _mm_mul_ps(r2inv_v, r2inv_v);
__m128 r6inv_v = _mm_mul_ps(r6inv_1v, r2inv_v);
__m128 mj_v = _mm_set1_ps(m[j]);
__m128 s_v = _mm_mul_ps(mj_v, r6inv_v);
__m128 axi_v = _mm_load_ps(&ax[i]);
__m128 ayi_v = _mm_load_ps(&ay[i]);
__m128 azi_v = _mm_load_ps(&az[i]);
__m128 srx_v = _mm_mul_ps(s_v, rx_v);
__m128 sry_v = _mm_mul_ps(s_v, ry_v);
__m128 srz_v = _mm_mul_ps(s_v, rz_v);
axi_v = _mm_add_ps(axi_v, srx_v);
ayi_v = _mm_add_ps(ayi_v, srx_v);
azi_v = _mm_add_ps(azi_v, srx_v);
_mm_store_ps(ax, axi_v);
_mm_store_ps(ay, ayi_v);
_mm_store_ps(az, azi_v);
}
}
I feel the main idea is correct however there is an/some error(s) somewhere as the resulting answer is incorrect.