I have the following code:
/*
 * Approximate element-wise division: c[i] = a[i] * (1 / b[i]).
 *
 * Uses vrcpps (_mm256_rcp_ps), which returns a reciprocal accurate to
 * roughly 12 bits (relative error <= 1.5 * 2^-12) — do not use where
 * full single-precision accuracy is required.
 *
 * NOTE(perf): for very large n (e.g. 256M elements) this loop streams
 * ~12 bytes per element through DRAM, so it is memory-bandwidth-bound;
 * the cheaper reciprocal cannot run faster than the vdivps version
 * because both are limited by memory, not by the divide unit.
 *
 * Fix vs. original: the original loop advanced by 8 unconditionally and
 * read/wrote past the arrays whenever n was not a multiple of 8 (UB).
 */
void division_approximate(float a[], float b[], float c[], int n) {
    int i = 0;
    /* Main vectorized loop: 8 floats per iteration. */
    for (; i + 8 <= n; i += 8) {
        __m256 b_val = _mm256_loadu_ps(b + i);
        b_val = _mm256_rcp_ps(b_val);          /* ~12-bit reciprocal */
        __m256 a_val = _mm256_loadu_ps(a + i);
        a_val = _mm256_mul_ps(a_val, b_val);
        _mm256_storeu_ps(c + i, a_val);
    }
    /* Scalar tail for the remaining 0..7 elements, using the same
     * rcp approximation so results match the vector path's accuracy. */
    for (; i < n; i++) {
        float r = _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ss(b[i])));
        c[i] = a[i] * r;
    }
}
/*
 * Exact element-wise division: c[i] = a[i] / b[i], vectorized with
 * AVX vdivps (full single-precision IEEE result, unlike vrcpps).
 *
 * NOTE(perf): for large arrays this is memory-bandwidth-bound (three
 * 4*n-byte streams), so its runtime matches the rcp-based variant.
 *
 * Fix vs. original: the original loop advanced by 8 unconditionally and
 * read/wrote past the arrays whenever n was not a multiple of 8 (UB).
 */
void division(float a[], float b[], float c[], int n) {
    int i = 0;
    /* Main vectorized loop: 8 floats per iteration. */
    for (; i + 8 <= n; i += 8) {
        __m256 b_val = _mm256_loadu_ps(b + i);
        __m256 a_val = _mm256_loadu_ps(a + i);
        a_val = _mm256_div_ps(a_val, b_val);
        _mm256_storeu_ps(c + i, a_val);
    }
    /* Scalar tail for the remaining 0..7 elements. */
    for (; i < n; i++) {
        c[i] = a[i] / b[i];
    }
}
I would expect that division_approximate
is faster than division
, but both functions take almost the same time on my AMD Ryzen 7 4800H. I don't understand why; I would expect that division_approximate
is significantly faster. This issue reproduces with both GCC and Clang. Compiled with -O3 -march=core-avx2
.
UPDATE
Here is the source code generated by GCC 9.3 for both loops:
division
│ >0x555555555c38 <division+88> vmovups 0x0(%r13,%rax,4),%ymm3 │
│ 0x555555555c3f <division+95> vdivps (%r14,%rax,4),%ymm3,%ymm0 │
│ 0x555555555c45 <division+101> vmovups %ymm0,(%rbx,%rax,4) │
│ 0x555555555c4a <division+106> add $0x8,%rax │
│ 0x555555555c4e <division+110> cmp %eax,%r12d │
│ 0x555555555c51 <division+113> jg 0x555555555c38 <division+88> │
division_approximate
│ >0x555555555b38 <division_approximate+88> vrcpps (%r14,%rax,4),%ymm0 │
│ 0x555555555b3e <division_approximate+94> vmulps 0x0(%r13,%rax,4),%ymm0,%ymm0 │
│ 0x555555555b45 <division_approximate+101> vmovups %ymm0,(%rbx,%rax,4) │
│ 0x555555555b4a <division_approximate+106> add $0x8,%rax │
│ 0x555555555b4e <division_approximate+110> cmp %eax,%r12d │
│ 0x555555555b51 <division_approximate+113> jg 0x555555555b38 <division_approximate+88> │
Both codes take almost exactly the same amount of time to execute (318 ms vs 319 ms) for n = 256 * 1024 * 1024
.