
Continuing from my first question, I am trying to optimize a memory hotspot found by profiling a 64-bit C program with VTune.

In particular, I'd like to find the fastest way to test if a 128-byte block of memory contains all zeros. You may assume any desired memory alignment for the memory block; I used 64-byte alignment.

I am using a PC with an Intel Ivy Bridge Core i7-3770 processor, 32 GB of memory, and the free version of the Microsoft VS2010 C compiler.

My first attempt was:

const char* bytevecM;    // 4 GB block of memory, 64-byte aligned
size_t* psz;             // size_t is 64-bits
// ...
// "m7 & 0xffffff80" selects the 128 byte block to test for all zeros
psz = (size_t*)&bytevecM[(unsigned int)m7 & 0xffffff80];
if (psz[0]  == 0 && psz[1]  == 0
&&  psz[2]  == 0 && psz[3]  == 0
&&  psz[4]  == 0 && psz[5]  == 0
&&  psz[6]  == 0 && psz[7]  == 0
&&  psz[8]  == 0 && psz[9]  == 0
&&  psz[10] == 0 && psz[11] == 0
&&  psz[12] == 0 && psz[13] == 0
&&  psz[14] == 0 && psz[15] == 0) continue;
// ...

VTune profiling of the corresponding assembly follows:

cmp    qword ptr [rax],      0x0       0.171s
jnz    0x14000222                     42.426s
cmp    qword ptr [rax+0x8],  0x0       0.498s
jnz    0x14000222                      0.358s
cmp    qword ptr [rax+0x10], 0x0       0.124s
jnz    0x14000222                      0.031s
cmp    qword ptr [rax+0x18], 0x0       0.171s
jnz    0x14000222                      0.031s
cmp    qword ptr [rax+0x20], 0x0       0.233s
jnz    0x14000222                      0.560s
cmp    qword ptr [rax+0x28], 0x0       0.498s
jnz    0x14000222                      0.358s
cmp    qword ptr [rax+0x30], 0x0       0.140s
jnz    0x14000222
cmp    qword ptr [rax+0x38], 0x0       0.124s
jnz    0x14000222
cmp    qword ptr [rax+0x40], 0x0       0.156s
jnz    0x14000222                      2.550s
cmp    qword ptr [rax+0x48], 0x0       0.109s
jnz    0x14000222                      0.124s
cmp    qword ptr [rax+0x50], 0x0       0.078s
jnz    0x14000222                      0.016s
cmp    qword ptr [rax+0x58], 0x0       0.078s
jnz    0x14000222                      0.062s
cmp    qword ptr [rax+0x60], 0x0       0.093s
jnz    0x14000222                      0.467s
cmp    qword ptr [rax+0x68], 0x0       0.047s
jnz    0x14000222                      0.016s
cmp    qword ptr [rax+0x70], 0x0       0.109s
jnz    0x14000222                      0.047s
cmp    qword ptr [rax+0x78], 0x0       0.093s
jnz    0x14000222                      0.016s

I was able to improve on that via Intel intrinsics:

const char* bytevecM;                        // 4 GB block of memory
__m128i* psz;                                // __m128i is 128-bits
__m128i one = _mm_set1_epi32(0xffffffff);    // all bits one
// ...
psz = (__m128i*)&bytevecM[(unsigned int)m7 & 0xffffff80];
if (_mm_testz_si128(psz[0], one) && _mm_testz_si128(psz[1], one)
&&  _mm_testz_si128(psz[2], one) && _mm_testz_si128(psz[3], one)
&&  _mm_testz_si128(psz[4], one) && _mm_testz_si128(psz[5], one)
&&  _mm_testz_si128(psz[6], one) && _mm_testz_si128(psz[7], one)) continue;
// ...

VTune profiling of the corresponding assembly follows:

movdqa xmm0, xmmword ptr [rax]         0.218s
ptest  xmm0, xmm2                     35.425s
jnz    0x14000ddd                      0.700s
movdqa xmm0, xmmword ptr [rax+0x10]    0.124s
ptest  xmm0, xmm2                      0.078s
jnz    0x14000ddd                      0.218s
movdqa xmm0, xmmword ptr [rax+0x20]    0.155s
ptest  xmm0, xmm2                      0.498s
jnz    0x14000ddd                      0.296s
movdqa xmm0, xmmword ptr [rax+0x30]    0.187s
ptest  xmm0, xmm2                      0.031s
jnz    0x14000ddd
movdqa xmm0, xmmword ptr [rax+0x40]    0.093s
ptest  xmm0, xmm2                      2.162s
jnz    0x14000ddd                      0.280s
movdqa xmm0, xmmword ptr [rax+0x50]    0.109s
ptest  xmm0, xmm2                      0.031s
jnz    0x14000ddd                      0.124s
movdqa xmm0, xmmword ptr [rax+0x60]    0.109s
ptest  xmm0, xmm2                      0.404s
jnz    0x14000ddd                      0.124s
movdqa xmm0, xmmword ptr [rax+0x70]    0.093s
ptest  xmm0, xmm2                      0.078s
jnz    0x14000ddd                      0.016s

As you can see, there are fewer assembly instructions, and this version also proved to be faster in timing tests.

Since I am quite weak in the area of Intel SSE/AVX instructions, I welcome advice on how they might be better employed to speed up this code.

Though I scoured the hundreds of available intrinsics, I may have missed the ideal ones. In particular, I was unable to effectively employ _mm_cmpeq_epi64(); I looked for a "not equal" version of this intrinsic (which seems better suited to this problem) but came up dry. Though the code below "works":

if (_mm_testz_si128(
        _mm_andnot_si128(_mm_cmpeq_epi64(psz[7],
            _mm_andnot_si128(_mm_cmpeq_epi64(psz[6],
                _mm_andnot_si128(_mm_cmpeq_epi64(psz[5],
                    _mm_andnot_si128(_mm_cmpeq_epi64(psz[4],
                        _mm_andnot_si128(_mm_cmpeq_epi64(psz[3],
                            _mm_andnot_si128(_mm_cmpeq_epi64(psz[2],
                                _mm_andnot_si128(_mm_cmpeq_epi64(psz[1],
                                    _mm_andnot_si128(_mm_cmpeq_epi64(psz[0], zero), one)),
                                one)),
                            one)),
                        one)),
                    one)),
                one)),
            one)),
        one),
    one)) continue;

it is borderline unreadable and (unsurprisingly) proved to be way slower than the two versions given above. I feel sure there must be a more elegant way to employ _mm_cmpeq_epi64() and welcome advice on how that might be achieved.
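
One direction I have not benchmarked (so treat it only as a sketch) would be to OR the eight 16-byte chunks into a single accumulator first, so that a single _mm_cmpeq_epi64() against zero and one movemask replace the nested and-nots above. Like _mm_testz_si128(), _mm_cmpeq_epi64() needs SSE4.1.

__m128i zero = _mm_setzero_si128();
// OR-reduce the 128-byte block down to one 128-bit vector
__m128i acc  = _mm_or_si128(
                   _mm_or_si128(_mm_or_si128(psz[0], psz[1]), _mm_or_si128(psz[2], psz[3])),
                   _mm_or_si128(_mm_or_si128(psz[4], psz[5]), _mm_or_si128(psz[6], psz[7])));
__m128i eq   = _mm_cmpeq_epi64(acc, zero);       // all-ones in each 64-bit lane that equals zero
if (_mm_movemask_epi8(eq) == 0xFFFF) continue;   // both lanes zero => whole 128-byte block is zero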

In addition to using intrinsics from C, raw Intel assembly language solutions to this problem are also welcome.

  • "the fastest way to test if a 128-byte block of memory contains all zeros" You didn't say whether you expected the test to turn out true or false. If you expect it to turn out true, you could use a single conditional branch instead of plenty (compute the bitwise or then test). – Pascal Cuoq Mar 02 '13 at 07:54
  • Just for fun, could you try replacing `&&` with `&` there to avoid short-circuiting? – Sergey Kalinichenko Mar 02 '13 at 07:55
  • @PascalCuoq: Judging from the hitrate of his first `jnz` I'd say it mostly fails. – Ben Jackson Mar 02 '13 at 08:10
  • @BenJackson Right. And if it usually fails at the first test there is little opportunity to speed up anything. – Pascal Cuoq Mar 02 '13 at 08:15
  • I have often found the VTune profiling results confusing. The 128-byte memory block is all zeros around 98 times out of 100. That is, finding a non-zero block is a rare event. That is why this approach is already providing a significant speed up, allowing me to skip the more expensive inner loop in the common case of an all zeros 128-byte memory block. – eyepopslikeamosquito Mar 02 '13 at 08:31
  • Is the 128 bytes already in cache or still in main memory? If in main memory, you are likely to spend more time stalling on the cache load than processing. – rlb Mar 02 '13 at 11:05
  • @rlb Given the (4GB) size of the table, the 128 bytes will almost certainly not be in cache. As described in my original question (one week ago, no answers), this is clearly a "data locality" problem. Unfortunately, I've so far been unable to find an algorithm with significantly better data locality. That said, reading a 128-byte chunk up front improves data locality over the original and has produced significant savings by bypassing the inner loop in most cases (i.e. when the 128-byte block contains all zeros). To go further I need to find a new algorithm with better data locality. – eyepopslikeamosquito Mar 02 '13 at 12:07
  • Do you have to be 100% sure, or just pretty sure? You could store a checksum when writing the block. Then when you want to test, you only have to look at the checksum. (Just trying to present an idea "outside the box") – Josh Petitt Mar 02 '13 at 12:50
  • Also, since it appears you are aggressively optimizing, would any of the 64-bit chunks be more likely to have non-zero values than others? (Based on how they are used?) If so, check those chunks first. – Josh Petitt Mar 02 '13 at 12:55
  • The problem, as others have pointed out, is that the 128-byte data you are checking is missing the data cache and going to DRAM, which is slow. VTune is telling you this: jnz 0x14000222 42.426s Those 42.4 seconds accounted to the JNZ instruction are really a stall caused by the previous load from memory... the processor is sitting around doing nothing for 42 seconds total over the time you profiled the program...waiting on DRAM. – amdn Mar 02 '13 at 15:36
  • Oops, after writing a test program to tally all blocks, it seems that a block is all zeros 36 times out of a 100, not 98 out of 100 as reported previously. Sorry 'bout that. The excellent advice by amdn about cache misses was still valid though and I've successfully applied it by adding 4 MB bitmaps to improve data locality and so significantly reduce cache misses. – eyepopslikeamosquito Mar 09 '13 at 22:42
  • If you're only writing each array element once and then checking all the elements for zeros, wouldn't it be quicker to AND the value being written to the element with a static variable every time, and then when you need to check for all zeros you just check whether this static variable is 0? You would never need to "loop" (or use intrinsics) over the whole array. – user997112 Jun 29 '14 at 01:46

6 Answers


The main problem, as others have pointed out, is that the 128-byte data you are checking is missing the data cache and/or the TLB and going to DRAM, which is slow. VTune is telling you this

cmp    qword ptr [rax],      0x0       0.171s
jnz    0x14000222                     42.426s

You have another, smaller, hotspot half-way down

cmp    qword ptr [rax+0x40], 0x0       0.156s
jnz    0x14000222                      2.550s

Those 42.4 + 2.5 seconds attributed to the JNZ instructions are really a stall caused by the previous load from memory... the processor is sitting around doing nothing for 45 seconds total over the time you profiled the program... waiting on DRAM.

You might ask what the 2nd hotspot half-way down is all about. Well, you are accessing 128 bytes and cache lines are 64 bytes; the CPU started prefetching for you as soon as it read the first 64 bytes... but you didn't do enough work with the first 64 bytes to fully overlap the latency of going to memory.

The memory bandwidth of Ivy Bridge is very high (it depends on your system, but I'm guessing over 10 GB/sec). Your block of memory is 4 GB; you should be able to zip through it in less than a second if you access it sequentially and let the CPU prefetch data ahead for you.

My guess is you are thwarting the CPU data prefetcher by accessing the 128-byte blocks in a non-contiguous fashion.

Change your access pattern to be sequential and you'll be surprised how much faster it runs. You can then worry about the next level of optimization, which will be making sure the branch prediction works well.

Another thing to consider is TLB misses. Those are costly, especially in a 64-bit system. Rather than using 4 KB pages, consider using 2 MB huge pages. See this link for Windows support for these: Large-Page Support (Windows)
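
A minimal sketch of a large-page allocation on Windows follows (assumptions: the account holds the "Lock pages in memory" privilege that MEM_LARGE_PAGES requires, and the helper name is just for illustration):

#include <windows.h>

char* alloc_table_large_pages(size_t bytes)
{
    SIZE_T page = GetLargePageMinimum();          /* typically 2 MB on x64; 0 if unsupported */
    SIZE_T rounded;
    if (page == 0) return NULL;
    rounded = (bytes + page - 1) & ~(page - 1);   /* size must be a multiple of the large-page size */
    return (char*)VirtualAlloc(NULL, rounded,
                               MEM_RESERVE | MEM_COMMIT | MEM_LARGE_PAGES,
                               PAGE_READWRITE);
}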

If you must access the 4 GB of data in a somewhat random fashion, but you know the sequence of m7 values (your index) ahead of time, then you can pipeline the memory fetching explicitly ahead of its use (the prefetch needs to be issued several hundred CPU cycles before the data is used to be effective).
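
A minimal sketch of that explicit pipelining with the software-prefetch intrinsic (m7_future is a placeholder for an index value you have computed several iterations before it is used):

#include <xmmintrin.h>
/* ... inside the loop, well before the block is actually tested ... */
const char* future = &bytevecM[(unsigned int)m7_future & 0xffffff80];
_mm_prefetch(future,      _MM_HINT_T0);   /* first 64-byte cache line of the block */
_mm_prefetch(future + 64, _MM_HINT_T0);   /* second 64-byte cache line */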

Here are some links that might be helpful in general on the subject of memory optimizations

What Every Programmer Should Know About Memory by Ulrich Drepper

http://www.akkadia.org/drepper/cpumemory.pdf

Machine Architecture: Things Your Programming Language Never Told You, by Herb Sutter

http://www.gotw.ca/publications/concurrency-ddj.htm

http://nwcpp.org/static/talks/2007/Machine_Architecture_-_NWCPP.pdf

http://video.google.com/videoplay?docid=-4714369049736584770#

amdn

Sorry for the answer post, I don't have enough reputation for comments.
What happens if you use the following as a test?

if( (psz[0]  | psz[1]  | psz[2]  | psz[3]  |
     psz[4]  | psz[5]  | psz[6]  | psz[7]  |
     psz[8]  | psz[9]  | psz[10] | psz[11] |
     psz[12] | psz[13] | psz[14] | psz[15] ) == 0) continue;

Unfortunately, I don't have a 64-bit system on which to compile it, and I am unfamiliar with what exactly the compiler does with C code, but it would seem to me that a bitwise OR would be faster than individual == comparisons. I also don't know what Intel intrinsics are, but it may be possible to optimize the above code in a similar manner to what you have already done.
I hope my answer helps.
Mmarss

At 98% of 128-byte blocks being all zero, you're averaging less than one nonzero byte per 4K page. With an array that sparse, have you tried storing it as a sparse array? You'll save huge swathes of memory and the attendant cache-miss delays; I wouldn't be surprised if a plain std::map turns out faster.
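
A cheaper middle ground, sketched here on the assumption that every store into the table goes through a small helper, is a summary bitmap with one bit per 128-byte block (4 GB / 128 B = 32M blocks, i.e. 4 MB of bitmap) that you test instead of scanning the block:

#include <stddef.h>

static unsigned char blockmap[1u << 22];   /* 2^25 blocks / 8 bits per byte = 4 MB */

/* Call whenever a non-zero byte is stored at this offset into the table. */
static void mark_nonzero(size_t offset)
{
    size_t block = offset >> 7;            /* which 128-byte block */
    blockmap[block >> 3] |= (unsigned char)(1u << (block & 7));
}

/* Replaces the 128-byte scan; stays conservatively "non-zero" if a block is later zeroed again. */
static int block_is_zero(size_t offset)
{
    size_t block = offset >> 7;
    return (blockmap[block >> 3] & (1u << (block & 7))) == 0;
}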

jthill

Have you considered the Intel string scan instructions? These tend to have very high data rates, and the processor knows the data access is sequential.

     mov      rdi, <blockaddress>
     cld
     xor      rax, rax
     mov      rcx, 128/8
     repe     scasq
     jne      ...

This won't help the problem of your data not being in cache. You might fix that by using Intel's prefetch instruction if you know which chunk you want to consider well in advance. See http://software.intel.com/en-us/articles/use-software-data-prefetch-on-32-bit-intel-architecture

[EDIT: code updated to fix minor hiccups pointed out in the comments]

Ira Baxter
  • For x86-64, I think it should be `mov rdi, `; `cld`; `xor eax,eax`; `mov rcx,128/8`; `repe scasq` ... – nrz Mar 03 '13 at 00:49
  • Well, yes, I suppose so. While we're at it, xor rax, rax ..., too. – Ira Baxter Mar 03 '13 at 03:14
  • `xor eax,eax` would be shorter (no prefix) and achieve the same result (due to zero-extension). Same for `mov ecx,128/8`. – Brendan Apr 30 '17 at 22:18

Thanks for the excellent tips received so far.

I felt confident that the Mmarss "mega or" approach would improve performance because it generated fewer assembly language instructions. However, when I ran my benchmark program, it took 163 seconds, versus 150 seconds for my original clunky && solution and 145 seconds for my original clunky Intel intrinsics solution (both described in my original post).

For completeness, here is the C code I used for the "mega or" approach:

if ((psz[0]  | psz[1]  | psz[2]  | psz[3]
|    psz[4]  | psz[5]  | psz[6]  | psz[7]
|    psz[8]  | psz[9]  | psz[10] | psz[11]
|    psz[12] | psz[13] | psz[14] | psz[15]) == 0) continue;

The VTune assembly was:

mov    rax, qword ptr [rcx+0x78]    0.155s
or     rax, qword ptr [rcx+0x70]   80.972s
or     rax, qword ptr [rcx+0x68]    1.292s
or     rax, qword ptr [rcx+0x60]    0.311s
or     rax, qword ptr [rcx+0x58]    0.249s
or     rax, qword ptr [rcx+0x50]    1.229s
or     rax, qword ptr [rcx+0x48]    0.187s
or     rax, qword ptr [rcx+0x40]    0.233s
or     rax, qword ptr [rcx+0x38]    0.218s
or     rax, qword ptr [rcx+0x30]    1.742s
or     rax, qword ptr [rcx+0x28]    0.529s
or     rax, qword ptr [rcx+0x20]    0.233s
or     rax, qword ptr [rcx+0x18]    0.187s
or     rax, qword ptr [rcx+0x10]    1.244s
or     rax, qword ptr [rcx+0x8]     0.155s
or     rax, qword ptr [rcx]         0.124s
jz     0x1400070b9                  0.342s

I then tried translating the "mega or" idea to Intel intrinsics via:

__m128i tt7;
// ...
tt7 = _mm_or_si128(_mm_or_si128(_mm_or_si128(psz[0], psz[1]),
      _mm_or_si128(psz[2], psz[3])),
      _mm_or_si128(_mm_or_si128(psz[4], psz[5]),
      _mm_or_si128(psz[6], psz[7])));
if ( (tt7.m128i_i64[0] | tt7.m128i_i64[1]) == 0) continue;

though that also turned out to be slower, taking 155 seconds. Its VTune assembly was:

movdqa xmm2, xmmword ptr [rax]         0.047s
movdqa xmm0, xmmword ptr [rax+0x20]   75.461s
movdqa xmm1, xmmword ptr [rax+0x40]    2.567s
por    xmm0, xmmword ptr [rax+0x30]    1.867s
por    xmm2, xmmword ptr [rax+0x10]    0.078s
por    xmm1, xmmword ptr [rax+0x50]    0.047s
por    xmm2, xmm0                      0.684s
movdqa xmm0, xmmword ptr [rax+0x60]    0.093s
por    xmm0, xmmword ptr [rax+0x70]    1.214s
por    xmm1, xmm0                      0.420s
por    xmm2, xmm1                      0.109s
movdqa xmmword ptr tt7$[rsp], xmm2     0.140s
mov    rax, qword ptr [rsp+0x28]       0.233s
or     rax, qword ptr [rsp+0x20]       1.027s
jz     0x1400070e2                     0.498s

The Intel intrinsics approach above is pretty crude. Suggestions for improving it are welcome.
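
One tweak I have not timed yet would be to test the ORed vector directly with _mm_testz_si128() instead of storing it and re-loading the two halves:

if (_mm_testz_si128(tt7, tt7)) continue;   // ZF is set when tt7 is all zeros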

This shows yet again how important it is to measure. Almost every time I've guessed which would be faster I've been wrong. That said, so long as you carefully measure each change, you can't get any worse; you can only improve. Though I've gone backwards (as above) more often than forwards, over the past week I've been able to reduce the running time of the little test program from 221 seconds down to 145. Given that the real program will be running for months, that will save days.

  • Your compiler is stupidly compiling `a | b | c | d | …` as a long dependency chain. Force it to generate instructions with more fine-grain parallelism and check the generated assembly. Also the optimal pattern may depend on the number of available registers, but it should not hurt to do at least `(a | b) | (c | d) | …` – Pascal Cuoq Mar 02 '13 at 12:32
  • Note: The dependency chain in this case doesn't matter. Intel CPUs use logical register mapping internally. The compiler could "stupidly" use rax for the whole process and it would still yield the same result on an i7 (well, up to about 32 cases .. 16 is well below that). The only dependency is the Jump (jz) at the end, which requires everything to have been collected regardless before the result can be processed. – jstine Mar 02 '13 at 16:32
  • Ok, after further consideration and noticing that you stated that the input data is actually mostly zero, I have to side with others and say you've hit the RAM/Cache locality issue. The `OR` versions here are unexpectedly slower because they insist on reading from memory out of natural order -- eg, x64 is starting at +0x78 and working backwards, and the xmm one is reading 0x00, 0x20, 0x40, and then goes back to 0x30, etc. Since your bottleneck is almost entirely memory, using natural-order 128 bit reads is very important for max memory throughput. – jstine Mar 02 '13 at 16:43
  • That said, I think `OR` versions should be more efficient *if* you could convince the compiler to read memory in-order. It is rather frustrating, really, that it's not. – jstine Mar 02 '13 at 16:45
  • @jstine “The dependency chain in this case doesn't matter” Yes it does. The processor renames registers, yes, but that does not allow executing more than one instruction per cycle for the sequence of `or rax,…` shown in this answer. It could be executing these three at a time with a better sequence of instructions (not that this matters if memory bandwidth is the limiting factor). – Pascal Cuoq Mar 03 '13 at 22:15
  • @jstine: You can convince the compiler to read mostly in-order; use `__m128i v[8] = { ptr[0], ptr[1], ... }; v[0] = _mm_or_si128(v[0], v[1]); v[2] = _mm_or_si128(v[2], v[3]); ...` i.e. "map reduce" from `8 -> 4 -> 2 -> 1` (try it and check the resulting code). But as already said, if you're memory bound that hardly matters. – FrankH. Mar 11 '13 at 19:33

Suggestion: align your array to 128B, so the spatial prefetcher will always want to fill the correct cache-line to make a 128B pair of cache lines. Intel optimization manual, page 2-30 (pg 60 of the PDF), describing Sandybridge/Ivybridge:

Spatial Prefetcher: This prefetcher strives to complete every cache line fetched to the L2 cache with the pair line that completes it to a 128-byte aligned chunk.

With your array only aligned to 64B, reading 128B can touch two pairs of cache lines, leading the L2 spatial prefetcher to issue more loads for data you'll probably never use.
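
A minimal sketch of requesting that alignment on the asker's toolchain (MSVC's _aligned_malloc; _mm_malloc would do the same job):

#include <malloc.h>

/* 4 GB table, 128-byte aligned so each 128-byte block is exactly one pair of cache lines. */
char *bytevecM = (char*)_aligned_malloc((size_t)4 * 1024 * 1024 * 1024, 128);
/* ... */
_aligned_free(bytevecM);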


Your answer has the right idea: OR the block together with vectors, then test that for all-zero. Using a single branch is probably better than branching separately on every 8 bytes.

But your strategy for testing a vector sucks: don't store it and then scalar load+OR both halves. This is a perfect use-case for SSE4 PTEST, which lets us avoid the usual pcmpeqb / pmovmskb:

ptest   xmm0,xmm0      ; 2 uops, and Agner Fog lists it as 1c latency for SnB/IvB, but this is probably bogus.  2c is more likely
jz    vector_is_all_zero
; 3 uops, but shorter latency and smaller code-size than pmovmskb

Normally branches predict well, and latency to generate their flag inputs isn't important. But in this case, the main bottleneck is branch mispredicts. So it's probably worth it to spend more uops (if necessary) to reduce latency.


I'm not sure whether it's better to test the first cache line before loading the second cache line, in case you find a non-zero byte without suffering the second cache miss. The spatial-prefetcher can't get the second cache line loaded instantly, so probably try an early-out before loading the second 64B cache line, unless that leads to a lot of extra branch mispredicts.

So I might do:

int allzero_128B(const char *buf)
{
    const __m128i *buf128 = (const __m128i*)buf;  // dereferencing produces 128b aligned-load instructions

    __m128i or0 = _mm_or_si128(buf128[0], buf128[2]);
    __m128i or2 = _mm_or_si128(buf128[1], buf128[3]);
    __m128i first64 = _mm_or_si128(or0, or2);
    // A chain of load + 3 OR instructions would be fewer fused-domain uops
    //  than load+or, load+or, or(xmm,xmm).  But resolving the branch faster is probably the most important thing.

    if (!_mm_testz_si128(first64, first64))   // early-out if the first cache line has a non-zero bit
        return 0;

    __m128i or4 = _mm_or_si128(buf128[4], buf128[6]);
    __m128i or6 = _mm_or_si128(buf128[5], buf128[7]);
    __m128i second64 = _mm_or_si128(or4, or6);

    return _mm_testz_si128(second64, second64);   // 1 if the whole 128-byte block is zero
}

On IvyBridge, there's not much if anything to gain from using 256b AVX ops. Vector-FP 256b VORPS ymm does twice as much work per uop, but only runs on port5. (POR xmm runs on p015). 256b loads are done as two 128b halves, but they are still only 1 uop.

I don't see a way to use a single CMPEQPS to check a 256b vector for all-zero. +0.0 compares equal to -0.0, so a 1-bit in the sign-bit position would go undetected in a compare against zero. I don't think any of the CMPPS predicates help, either, since none of them implement compares that treat -0.0 differently from +0.0. (See SIMD instructions for floating point equality comparison (with NaN == NaN) for more about FP-equality).

; First 32B arrives in L1D (and load buffers) on cycle n
vmovaps  ymm0,   [rdi+64]              ; ready on cycle n+1  (256b loads take 2 cycles)
vorps    ymm0,   ymm0, [rdi+96]        ; ready on cycle n+3  (the load uop is executing on cycles n+1 and n+2)
vextractf128 xmm1, ymm0, 1           ; 2c latency on IvB, 3c on Haswell
                                     ; xmm1 ready on cycle n+5
vpor     xmm0,   xmm0, xmm1          ; ready on n+6 (should be no bypass delay for a shuffle (vextractf128) -> integer booleans)
vptest   xmm0,   xmm0
jz   second_cacheline_all_zero

No, that's not better than

; First 32B of the cache-line arrives in L1D on cycle n (IvB has a 32B data path from L2->L1)
vmovaps  xmm0,   [rdi+64]              ; result ready on cycle n
vmovaps  xmm1,   [rdi+64 + 16]         ; result ready on cycle n  (data should be forwarded to outstanding load buffers, I think?)
vpor     xmm0,   xmm0, [rdi+64 + 32]   ; ready on cycle n+1
vpor     xmm1,   xmm1, [rdi+64 + 48]   ; ready on cycle n+1, assuming the load uops get their data the cycle after the first pair.
vpor     xmm0,   xmm1                  ; ready on cycle n+2
vptest   xmm0,   xmm0
jz   second_cacheline_all_zero

With AVX2, 256b ops would make sense, including VPTEST ymm,ymm.
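
A sketch of what that could look like (needs AVX2 for the 256b integer OR, so not applicable to the asker's IvyBridge; assumes 32B-aligned data):

#include <immintrin.h>

int allzero_128B_avx2(const char *buf)
{
    __m256i a = _mm256_load_si256((const __m256i*)(buf +  0));
    __m256i b = _mm256_load_si256((const __m256i*)(buf + 32));
    __m256i c = _mm256_load_si256((const __m256i*)(buf + 64));
    __m256i d = _mm256_load_si256((const __m256i*)(buf + 96));
    __m256i or_all = _mm256_or_si256(_mm256_or_si256(a, b), _mm256_or_si256(c, d));
    return _mm256_testz_si256(or_all, or_all);   /* VPTEST ymm,ymm: 1 if all 128 bytes are zero */
}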

Peter Cordes