How can I force emscripten/em++/llvm to load constants from .rodata and/or perform better SIMD optimization?

Question

I'm an active author and maintainer of SSIM.js and jest-image-snapshot. Currently, I'm working to optimize our image processing implementations to leverage WebAssembly where it can provide a performance improvement.

Right now, I'm noticing that the generated code contains unnecessary instructions, both in the LLVM assembly (WebAssembly text?) output and in the actual machine code printed by Node.js (--print-wasm-code). Of particular note, it does super weird stuff when loading constants. For instance, look at the array named multiplierArray (loaded into multiplier) or the constant rounder in the three sections of code below. With GCC, multiplier would be stored in the .rodata section of the assembly to be loaded once or converted to an integer, and rounder would be inlined with a movd or movq. Here it seems to be inserting the values on each round of the loop. It's also doing some stuff with vpblendw that I'm totally clueless about.

How do I fix this?

// Per-lane 16-bit multipliers applied to widened R,G,B,A bytes: two groups of
// four, one group per pixel in each 8-lane vector. The 4th weight (1) multiplies
// the byte that the shuffle below replaces with 0x80, i.e. it adds 128 before the
// final >> 8.
// NOTE(review): 77/150/29 look like luma weights scaled by 256 -- confirm.
// NOTE(review): the second group uses 160 where the first uses 150; this may be a
// typo, but the generated code shown below contains the same value -- confirm
// before changing.
alignas(64) const static uint16_t multiplierArray[8]= {77,150,29,1,77,160,29,1};
// Converts 16-byte vectors of 4-byte pixels in inputDataBuffer into one byte per
// pixel, writing the result back into the same buffer (pOutputPtr aliases
// pInputPtr). Each loop iteration consumes four input vectors and stores one
// output vector.
//
// inputDataBuffer: buffer read and overwritten in place.
//                  NOTE(review): presumably RGBA8 pixels, per the function name
//                  and the comments below -- confirm against the caller.
// length:          buffer size in bytes; the visible loop only runs while a full
//                  64-byte group (4 vectors) remains.
// Returns 0 in the visible portion of the function (the tail is abbreviated).
extern "C"
int rgba2y(void* inputDataBuffer, ptrdiff_t length) {
        typedef __u8x16 v8x16;
        typedef __u16x8 v16x8;
        v8x16* pInputPtr = (v8x16*) inputDataBuffer;
        // NOTE(review): pInputPtrEnd is not used in the visible code.
        v8x16* pInputPtrEnd = (v8x16*)((uint8_t*)inputDataBuffer + length);
        v8x16* pOutputPtr = (v8x16*) inputDataBuffer;
        // Low 32 bits are 0x80808080, upper lanes are zero; only byte lane 0
        // (0x80) is selected by the shuffles below (index 16).
        __m128i rounder = _mm_cvtsi32_si128(0x80808080);
        // NOTE(review): `zero` is read before it is initialized (x ^= x idiom);
        // the intent is an all-zero vector -- consider an explicit zero initializer.
        v8x16 zero;
        zero ^= zero;
        // Load all eight 16-bit weights as one 128-bit vector.
        __m128i multiplier = *((__m128i*)multiplierArray);
//      v16x8 multiplier = wasm_i64x2_splat(0x1001d0096004d);
        unsigned i = 0;
        // i counts 16-byte vectors and advances by 4 per iteration.
        // NOTE(review): the left side is unsigned (size_t) while length is a signed
        // ptrdiff_t, so a negative length would be converted to a large unsigned value.
        // NOTE(review): inputs are indexed with i/4 + k, so successive iterations
        // read vectors 0..3, then 1..4, ... (overlapping) while the output index
        // i/4 advances by one -- verify whether pInputPtr[i + k] was intended.
        for (; (i+4)*sizeof(__m128i)<= length; i+= 4) {
                // Keep bytes 0-2 of every 4-byte pixel and replace byte 3 with
                // lane 0 of rounder (0x80).
                v8x16 iv0 = wasm_v8x16_shuffle(pInputPtr[i/4],rounder,0,1,2,16,4,5,6,16,8,9,10,16,12,13,14,16);
                v8x16 iv1 = wasm_v8x16_shuffle(pInputPtr[i/4+1],rounder,0,1,2,16,4,5,6,16,8,9,10,16,12,13,14,16);
                v8x16 iv2 = wasm_v8x16_shuffle(pInputPtr[i/4+2],rounder,0,1,2,16,4,5,6,16,8,9,10,16,12,13,14,16);
                v8x16 iv3 = wasm_v8x16_shuffle(pInputPtr[i/4+3],rounder,0,1,2,16,4,5,6,16,8,9,10,16,12,13,14,16);
                // Widen each byte to 16 bits (unpack with zero), multiply by the
                // weights, then add adjacent lanes: each pixel becomes two partial
                // sums (first two channels, last two channels).
                // rg ba rg ba rg ba rg ba rg ba rg ba rg ba
                __m128i rg0 = _mm_hadd_epi16(_mm_mullo_epi16(_mm_unpacklo_epi8((__m128i)iv0, (__m128i)zero),(__m128i)multiplier), _mm_mullo_epi16(_mm_unpackhi_epi8((__m128i)iv0,(__m128i)zero),(__m128i)multiplier));
                __m128i rg1 = _mm_hadd_epi16(_mm_mullo_epi16(_mm_unpacklo_epi8((__m128i)iv1,(__m128i)zero),(__m128i)multiplier), _mm_mullo_epi16(_mm_unpackhi_epi8((__m128i)iv1,(__m128i)zero),(__m128i)multiplier));
                __m128i rg2 = _mm_hadd_epi16(_mm_mullo_epi16(_mm_unpacklo_epi8((__m128i)iv2,(__m128i)zero),(__m128i)multiplier), _mm_mullo_epi16(_mm_unpackhi_epi8((__m128i)iv2,(__m128i)zero),(__m128i)multiplier));
                __m128i rg3 = _mm_hadd_epi16(_mm_mullo_epi16(_mm_unpacklo_epi8((__m128i)iv3,(__m128i)zero),(__m128i)multiplier), _mm_mullo_epi16(_mm_unpackhi_epi8((__m128i)iv3,(__m128i)zero),(__m128i)multiplier));
                // Second horizontal add merges the two partial sums into one 16-bit
                // total per pixel (8 pixels per vector); the shift by 8 scales it
                // back to a byte-sized value.
                // rgba rgba rgba rgba rgba rgba rgba rgba
                __m128i rgba0 = wasm_u16x8_shr(_mm_hadd_epi16(rg0,rg1), 8);
                __m128i rgba1 = wasm_u16x8_shr(_mm_hadd_epi16(rg2,rg3), 8);
                // Narrow the 2 x 8 16-bit results to 16 bytes and store them in place.
                pOutputPtr[i/4] = wasm_u8x16_narrow_i16x8(rgba0,rgba1);
        }
        // abbreviated...
        return 0;
}

The llvm assembly is:

    .section    .text.rgba2y,"",@
    .hidden rgba2y                          # -- Begin function rgba2y
    .globl  rgba2y
    .type   rgba2y,@function
rgba2y:                                 # @rgba2y
.Lfunc_begin0:
    .loc    2 56 0                          # rgb2y-sample.cpp:56:0
    .functype   rgba2y (i32, i32) -> (i32)
    .local      i32, i32, v128, v128, v128, v128, v128, v128
# %bb.0:                                # %entry
    #DEBUG_VALUE: rgba2y:length <- %4
    #DEBUG_VALUE: rgba2y:pInputPtrEnd <- undef
    #DEBUG_VALUE: rgba2y:i <- 0
    #DEBUG_VALUE: rgba2y:inputDataBuffer <- %3
    #DEBUG_VALUE: rgba2y:pInputPtr <- %3
    #DEBUG_VALUE: rgba2y:pOutputPtr <- %3
    #DEBUG_VALUE: rgba2y:rounder <- undef
    #DEBUG_VALUE: rgba2y:zero <- undef
    #DEBUG_VALUE: rgba2y:multiplier <- undef
    block
.Ltmp0:
    .loc    2 68 30 prologue_end            # rgb2y-sample.cpp:68:30
    local.get   1
    i32.const   64
    i32.lt_u
.Ltmp1:
    .loc    2 68 2 is_stmt 0                # rgb2y-sample.cpp:68:2
    br_if       0                               # 0: down to label0
.Ltmp2:
# %bb.1:
    .loc    2 0 2                           # rgb2y-sample.cpp:0:2
    i32.const   0
    local.set   2
    i32.const   4
    local.set   3
.LBB0_2:                                # %for.body
                                        # =>This Inner Loop Header: Depth=1
    loop                                        # label1:
.Ltmp3:
    #DEBUG_VALUE: rgba2y:i <- %101
    #DEBUG_VALUE: rgba0 <- undef
    #DEBUG_VALUE: rgba1 <- undef
    .loc    2 69 15 is_stmt 1               # rgb2y-sample.cpp:69:15
    local.get   0
    local.get   2
    i32.const   2
    i32.shl
    i32.add
    local.tee   2
    local.get   2
    v128.load   0
    i32.const   0
    i8x16.splat
    local.tee   4
    i32.const   -128
    i8x16.replace_lane  0
    i32.const   -128
    i8x16.replace_lane  1
    i32.const   -128
    i8x16.replace_lane  2
    i32.const   -128
    i8x16.replace_lane  3
    local.tee   5
    v8x16.shuffle   0, 1, 2, 16, 4, 5, 6, 16, 8, 9, 10, 16, 12, 13, 14, 16
.Ltmp4:
    .loc    2 74 48                         # rgb2y-sample.cpp:74:48
    local.tee   6
.Ltmp5:
    #DEBUG_VALUE: iv0 <- undef
    #DEBUG_VALUE: iv0 <- %153
    local.get   4
    v8x16.shuffle   0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23
    i32.const   77
    .loc    2 74 32 is_stmt 0               # rgb2y-sample.cpp:74:32
    i16x8.splat
    i32.const   150
    i16x8.replace_lane  1
    i32.const   29
    i16x8.replace_lane  2
    i32.const   1
    i16x8.replace_lane  3
    i32.const   160
    i16x8.replace_lane  5
    i32.const   29
    i16x8.replace_lane  6
    i32.const   1
    i16x8.replace_lane  7
    local.tee   7
    i16x8.mul
    .loc    2 74 133                        # rgb2y-sample.cpp:74:133
    local.tee   8
    local.get   6
    local.get   4
    v8x16.shuffle   8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31
    .loc    2 74 117                        # rgb2y-sample.cpp:74:117
    local.get   7
    i16x8.mul
    .loc    2 74 17                         # rgb2y-sample.cpp:74:17
    local.tee   6
    v8x16.shuffle   2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31
    local.get   8
    local.get   6
    v8x16.shuffle   0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29
    i16x8.add
.Ltmp6:
    .loc    2 0 17                          # rgb2y-sample.cpp:0:17
    local.tee   6
.Ltmp7:
    #DEBUG_VALUE: rg0 <- undef
    #DEBUG_VALUE: rg0 <- %153
    .loc    2 70 15 is_stmt 1               # rgb2y-sample.cpp:70:15
    local.get   2
    i32.const   16
    i32.add
    v128.load   0
    local.get   5
    v8x16.shuffle   0, 1, 2, 16, 4, 5, 6, 16, 8, 9, 10, 16, 12, 13, 14, 16
.Ltmp8:
    .loc    2 75 62                         # rgb2y-sample.cpp:75:62
    local.tee   8
.Ltmp9:
    #DEBUG_VALUE: iv1 <- undef
    #DEBUG_VALUE: iv1 <- %157
    local.get   4
    v8x16.shuffle   0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23
    .loc    2 75 46 is_stmt 0               # rgb2y-sample.cpp:75:46
    local.get   7
    i16x8.mul
    .loc    2 75 146                        # rgb2y-sample.cpp:75:146
    local.tee   9
    local.get   8
    local.get   4
    v8x16.shuffle   8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31
    .loc    2 75 130                        # rgb2y-sample.cpp:75:130
    local.get   7
    i16x8.mul
    .loc    2 75 31                         # rgb2y-sample.cpp:75:31
    local.tee   8
    v8x16.shuffle   2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31
    local.get   9
    local.get   8
    v8x16.shuffle   0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29
    i16x8.add
.Ltmp10:
    .loc    2 79 33 is_stmt 1               # rgb2y-sample.cpp:79:33
    local.tee   8
.Ltmp11:
    #DEBUG_VALUE: rg1 <- undef
    #DEBUG_VALUE: rg1 <- %157
    v8x16.shuffle   2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31
    local.get   6
    local.get   8
    v8x16.shuffle   0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29
    i16x8.add
    i32.const   8
    .loc    2 79 18 is_stmt 0               # rgb2y-sample.cpp:79:18
    i16x8.shr_u
    .loc    2 71 15 is_stmt 1               # rgb2y-sample.cpp:71:15
    local.get   2
    i32.const   32
    i32.add
    v128.load   0
    local.get   5
    v8x16.shuffle   0, 1, 2, 16, 4, 5, 6, 16, 8, 9, 10, 16, 12, 13, 14, 16
.Ltmp12:
    .loc    2 76 62                         # rgb2y-sample.cpp:76:62
    local.tee   6
.Ltmp13:
    #DEBUG_VALUE: iv2 <- undef
    #DEBUG_VALUE: iv2 <- %153
    local.get   4
    v8x16.shuffle   0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23
    .loc    2 76 46 is_stmt 0               # rgb2y-sample.cpp:76:46
    local.get   7
    i16x8.mul
    .loc    2 76 146                        # rgb2y-sample.cpp:76:146
    local.tee   8
    local.get   6
    local.get   4
    v8x16.shuffle   8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31
    .loc    2 76 130                        # rgb2y-sample.cpp:76:130
    local.get   7
    i16x8.mul
    .loc    2 76 31                         # rgb2y-sample.cpp:76:31
    local.tee   6
    v8x16.shuffle   2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31
    local.get   8
    local.get   6
    v8x16.shuffle   0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29
    i16x8.add
.Ltmp14:
    .loc    2 0 31                          # rgb2y-sample.cpp:0:31
    local.tee   6
.Ltmp15:
    #DEBUG_VALUE: rg2 <- undef
    #DEBUG_VALUE: rg2 <- %153
    .loc    2 72 15 is_stmt 1               # rgb2y-sample.cpp:72:15
    local.get   2
    i32.const   48
    i32.add
    v128.load   0
    local.get   5
    v8x16.shuffle   0, 1, 2, 16, 4, 5, 6, 16, 8, 9, 10, 16, 12, 13, 14, 16
.Ltmp16:
    .loc    2 77 62                         # rgb2y-sample.cpp:77:62
    local.tee   5
.Ltmp17:
    #DEBUG_VALUE: iv3 <- undef
    #DEBUG_VALUE: iv3 <- %98
    local.get   4
    v8x16.shuffle   0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23
    .loc    2 77 46 is_stmt 0               # rgb2y-sample.cpp:77:46
    local.get   7
    i16x8.mul
    .loc    2 77 146                        # rgb2y-sample.cpp:77:146
    local.tee   8
    local.get   5
    local.get   4
    v8x16.shuffle   8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31
    .loc    2 77 130                        # rgb2y-sample.cpp:77:130
    local.get   7
    i16x8.mul
    .loc    2 77 31                         # rgb2y-sample.cpp:77:31
    local.tee   4
    v8x16.shuffle   2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31
    local.get   8
    local.get   4
    v8x16.shuffle   0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29
    i16x8.add
.Ltmp18:
    .loc    2 80 33 is_stmt 1               # rgb2y-sample.cpp:80:33
    local.tee   4
.Ltmp19:
    #DEBUG_VALUE: rg3 <- undef
    #DEBUG_VALUE: rg3 <- %93
    v8x16.shuffle   2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31
    local.get   6
    local.get   4
    v8x16.shuffle   0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29
    i16x8.add
    i32.const   8
    .loc    2 80 18 is_stmt 0               # rgb2y-sample.cpp:80:18
    i16x8.shr_u
    .loc    2 81 21 is_stmt 1               # rgb2y-sample.cpp:81:21
    i8x16.narrow_i16x8_u
    .loc    2 81 19 is_stmt 0               # rgb2y-sample.cpp:81:19
    v128.store  0
.Ltmp20:
    #DEBUG_VALUE: rgba2y:i <- %170
    .loc    2 0 19                          # rgb2y-sample.cpp:0:19
    local.get   3
    local.tee   3
    local.set   2
.Ltmp21:
    .loc    2 68 11 is_stmt 1               # rgb2y-sample.cpp:68:11
    local.get   3
    i32.const   4
    i32.add
    local.tee   3
    i32.const   4
    .loc    2 68 14 is_stmt 0               # rgb2y-sample.cpp:68:14
    i32.shl
    .loc    2 68 30                         # rgb2y-sample.cpp:68:30
    local.get   1
    i32.le_u
.Ltmp22:
    .loc    2 68 2                          # rgb2y-sample.cpp:68:2
    br_if       0                               # 0: up to label1
.Ltmp23:
.LBB0_3:                                # %for.end
    end_loop
    end_block                               # label0:
    i32.const   0
.Ltmp24:
    .loc    2 84 2 is_stmt 1                # rgb2y-sample.cpp:84:2
                                        # fallthrough-return
    end_function
.Ltmp25:
.Lfunc_end0:
    .size   rgba2y, .Lfunc_end0-rgba2y
                                        # -- End function

Assembler / Assembly:

--- WebAssembly code ---
index: 2
kind: wasm function
compiler: TurboFan
Body (size = 1088 = 1086 + 2 padding)
Instructions (size = 1064)
0xa5976359180     0  55             push rbp
0xa5976359181     1  4889e5         REX.W movq rbp,rsp
0xa5976359184     4  6a0a           push 0xa
0xa5976359186     6  56             push rsi
0xa5976359187     7  4883ec58       REX.W subq rsp,0x58
0xa597635918b     b  488b5e17       REX.W movq rbx,[rsi+0x17]
0xa597635918f     f  83fa40         cmpl rdx,0x40
0xa5976359192    12  0f8307000000   jnc 0xa597635919f  <+0x1f>
0xa5976359198    18  33c9           xorl rcx,rcx
0xa597635919a    1a  e990030000     jmp 0xa597635952f  <+0x3af>
0xa597635919f    1f  b94d000000     movl rcx,0x4d
0xa59763591a4    24  c5f96ec1       vmovd xmm0,rcx
0xa59763591a8    28  c5fb70c000     vpshuflw xmm0,xmm0,0x0
0xa59763591ad    2d  c5f970c000     vpshufd xmm0,xmm0,0x0
0xa59763591b2    32  33c9           xorl rcx,rcx
0xa59763591b4    34  c5f96ec9       vmovd xmm1,rcx
0xa59763591b8    38  c4410057ff     vxorps xmm15,xmm15,xmm15
0xa59763591bd    3d  c4c27100cf     vpshufb xmm1,xmm1,xmm15
0xa59763591c2    42  bf96000000     movl rdi,0x96
0xa59763591c7    47  c5f9c4c701     vpinsrw xmm0,xmm0,rdi,0x1
0xa59763591cc    4c  bf80ffffff     movl rdi,0xffffff80
0xa59763591d1    51  c5f928d1       vmovapd xmm2,xmm1
0xa59763591d5    55  c4e36920d700   vpinsrb xmm2,xmm2,dil,0x0
0xa59763591db    5b  41b81d000000   movl r8,0x1d
0xa59763591e1    61  c4c179c4c002   vpinsrw xmm0,xmm0,r8,0x2
0xa59763591e7    67  c4e36920d701   vpinsrb xmm2,xmm2,dil,0x1
0xa59763591ed    6d  41b901000000   movl r9,0x1
0xa59763591f3    73  c4c179c4c103   vpinsrw xmm0,xmm0,r9,0x3
0xa59763591f9    79  c4e36920d702   vpinsrb xmm2,xmm2,dil,0x2
0xa59763591ff    7f  41bba0000000   movl r11,0xa0
0xa5976359205    85  c4c179c4c305   vpinsrw xmm0,xmm0,r11,0x5
0xa597635920b    8b  c4e36920d703   vpinsrb xmm2,xmm2,dil,0x3
0xa5976359211    91  c4c179c4c006   vpinsrw xmm0,xmm0,r8,0x6
0xa5976359217    97  c4c179c4c107   vpinsrw xmm0,xmm0,r9,0x7
0xa597635921d    9d  488bf9         REX.W movq rdi,rcx
0xa5976359220    a0  41b804000000   movl r8,0x4
0xa5976359226    a6  e90b000000     jmp 0xa5976359236  <+0xb6>
0xa597635922b    ab  0f1f440000     nop
0xa5976359230    b0  498bf8         REX.W movq rdi,r8
0xa5976359233    b3  4d8bc1         REX.W movq r8,r9
0xa5976359236    b6  4c8b4e2f       REX.W movq r9,[rsi+0x2f]
0xa597635923a    ba  493b21         REX.W cmpq rsp,[r9]
0xa597635923d    bd  0f86f4020000   jna 0xa5976359537  <+0x3b7>
0xa5976359243    c3  458d4804       leal r9,[r8+0x4]
0xa5976359247    c7  4d8bd9         REX.W movq r11,r9
0xa597635924a    ca  41c1e304       shll r11, 4
0xa597635924e    ce  8d3cb8         leal rdi,[rax+rdi*4]
0xa5976359251    d1  c5fa6f1c3b     vmovdqu xmm3,[rbx+rdi*1]
0xa5976359256    d6  c5fa6f641f10   vmovdqu xmm4,[rdi+rbx*1+0x10]
0xa597635925c    dc  c5fa6f6c1f20   vmovdqu xmm5,[rdi+rbx*1+0x20]
0xa5976359262    e2  c5fa6f741f30   vmovdqu xmm6,[rdi+rbx*1+0x30]
0xa5976359268    e8  c57810fe       vmovups xmm15,xmm6
0xa597635926c    ec  49ba0001028004050680 REX.W movq r10,0x8006050480020100
0xa5976359276    f6  c441f96ec2     vmovq xmm8,r10
0xa597635927b    fb  49ba08090a800c0d0e80 REX.W movq r10,0x800e0d0c800a0908
0xa5976359285   105  c443b922c201   vpinsrq xmm8,xmm8,r10,0x1
0xa597635928b   10b  c4420100f8     vpshufb xmm15,xmm15,xmm8
0xa5976359290   110  0f10fa         movups xmm7,xmm2
0xa5976359293   113  49ba8080800080808000 REX.W movq r10,0x80808000808080
0xa597635929d   11d  c441f96ec2     vmovq xmm8,r10
0xa59763592a2   122  4c8b15ecffffff REX.W movq r10,[rip+0xffffffec]
0xa59763592a9   129  c443b922c201   vpinsrq xmm8,xmm8,r10,0x1
0xa59763592af   12f  c4c24100f8     vpshufb xmm7,xmm7,xmm8
0xa59763592b4   134  c4c141ebff     vpor xmm7,xmm7,xmm15
0xa59763592b9   139  c57810fd       vmovups xmm15,xmm5
0xa59763592bd   13d  4c8b15aaffffff REX.W movq r10,[rip+0xffffffaa]
0xa59763592c4   144  c441f96ec2     vmovq xmm8,r10
0xa59763592c9   149  4c8b15adffffff REX.W movq r10,[rip+0xffffffad]
0xa59763592d0   150  c443b922c201   vpinsrq xmm8,xmm8,r10,0x1
0xa59763592d6   156  c4420100f8     vpshufb xmm15,xmm15,xmm8
0xa59763592db   15b  0f10f2         movups xmm6,xmm2
0xa59763592de   15e  4c8b15b0ffffff REX.W movq r10,[rip+0xffffffb0]
0xa59763592e5   165  c441f96ec2     vmovq xmm8,r10
0xa59763592ea   16a  4c8b15a4ffffff REX.W movq r10,[rip+0xffffffa4]
0xa59763592f1   171  c443b922c201   vpinsrq xmm8,xmm8,r10,0x1
0xa59763592f7   177  c4c24900f0     vpshufb xmm6,xmm6,xmm8
0xa59763592fc   17c  c4c149ebf7     vpor xmm6,xmm6,xmm15
0xa5976359301   181  c57810fc       vmovups xmm15,xmm4
0xa5976359305   185  4c8b1562ffffff REX.W movq r10,[rip+0xffffff62]
0xa597635930c   18c  c441f96ec2     vmovq xmm8,r10
0xa5976359311   191  4c8b1565ffffff REX.W movq r10,[rip+0xffffff65]
0xa5976359318   198  c443b922c201   vpinsrq xmm8,xmm8,r10,0x1
0xa597635931e   19e  c4420100f8     vpshufb xmm15,xmm15,xmm8
0xa5976359323   1a3  0f10ea         movups xmm5,xmm2
0xa5976359326   1a6  4c8b1568ffffff REX.W movq r10,[rip+0xffffff68]
0xa597635932d   1ad  c441f96ec2     vmovq xmm8,r10
0xa5976359332   1b2  4c8b155cffffff REX.W movq r10,[rip+0xffffff5c]
0xa5976359339   1b9  c443b922c201   vpinsrq xmm8,xmm8,r10,0x1
0xa597635933f   1bf  c4c25100e8     vpshufb xmm5,xmm5,xmm8
0xa5976359344   1c4  c4c151ebef     vpor xmm5,xmm5,xmm15
0xa5976359349   1c9  c57810fb       vmovups xmm15,xmm3
0xa597635934d   1cd  4c8b151affffff REX.W movq r10,[rip+0xffffff1a]
0xa5976359354   1d4  c441f96ec2     vmovq xmm8,r10
0xa5976359359   1d9  4c8b151dffffff REX.W movq r10,[rip+0xffffff1d]
0xa5976359360   1e0  c443b922c201   vpinsrq xmm8,xmm8,r10,0x1
0xa5976359366   1e6  c4420100f8     vpshufb xmm15,xmm15,xmm8
0xa597635936b   1eb  0f10e2         movups xmm4,xmm2
0xa597635936e   1ee  4c8b1520ffffff REX.W movq r10,[rip+0xffffff20]
0xa5976359375   1f5  c441f96ec2     vmovq xmm8,r10
0xa597635937a   1fa  4c8b1514ffffff REX.W movq r10,[rip+0xffffff14]
0xa5976359381   201  c443b922c201   vpinsrq xmm8,xmm8,r10,0x1
0xa5976359387   207  c4c25900e0     vpshufb xmm4,xmm4,xmm8
0xa597635938c   20c  c4c159ebe7     vpor xmm4,xmm4,xmm15
0xa5976359391   211  c5f928df       vmovapd xmm3,xmm7
0xa5976359395   215  c5e168d9       vpunpckhbw xmm3,xmm3,xmm1
0xa5976359399   219  c5c160f9       vpunpcklbw xmm7,xmm7,xmm1
0xa597635939d   21d  c57928c6       vmovapd xmm8,xmm6
0xa59763593a1   221  c53968c1       vpunpckhbw xmm8,xmm8,xmm1
0xa59763593a5   225  c5c960f1       vpunpcklbw xmm6,xmm6,xmm1
0xa59763593a9   229  c57928cd       vmovapd xmm9,xmm5
0xa59763593ad   22d  c53168c9       vpunpckhbw xmm9,xmm9,xmm1
0xa59763593b1   231  c5d160e9       vpunpcklbw xmm5,xmm5,xmm1
0xa59763593b5   235  c57928d4       vmovapd xmm10,xmm4
0xa59763593b9   239  c52968d1       vpunpckhbw xmm10,xmm10,xmm1
0xa59763593bd   23d  c5d960e1       vpunpcklbw xmm4,xmm4,xmm1
0xa59763593c1   241  c5e1d5d8       vpmullw xmm3,xmm3,xmm0
0xa59763593c5   245  c5c1d5f8       vpmullw xmm7,xmm7,xmm0
0xa59763593c9   249  c539d5c0       vpmullw xmm8,xmm8,xmm0
0xa59763593cd   24d  c5c9d5f0       vpmullw xmm6,xmm6,xmm0
0xa59763593d1   251  c531d5c8       vpmullw xmm9,xmm9,xmm0
0xa59763593d5   255  c5d1d5e8       vpmullw xmm5,xmm5,xmm0
0xa59763593d9   259  c529d5d0       vpmullw xmm10,xmm10,xmm0
0xa59763593dd   25d  c5d9d5e0       vpmullw xmm4,xmm4,xmm0
0xa59763593e1   261  c57928df       vmovapd xmm11,xmm7
0xa59763593e5   265  c44101efff     vpxor xmm15,xmm15,xmm15
0xa59763593ea   26a  c463010efb55   vpblendw xmm15,xmm15,xmm3,0x55
0xa59763593f0   270  c443210edfaa   vpblendw xmm11,xmm11,xmm15,0xaa
0xa59763593f6   276  c442212bdf     vpackusdw xmm11,xmm11,xmm15
0xa59763593fb   27b  c57810fb       vmovups xmm15,xmm3
0xa59763593ff   27f  c4c10172d710   vpsrld xmm15,xmm15,16
0xa5976359405   285  c5c172d710     vpsrld xmm7,xmm7,16
0xa597635940a   28a  c4c2412bff     vpackusdw xmm7,xmm7,xmm15
0xa597635940f   28f  c5f928de       vmovapd xmm3,xmm6
0xa5976359413   293  c44101efff     vpxor xmm15,xmm15,xmm15
0xa5976359418   298  c443010ef855   vpblendw xmm15,xmm15,xmm8,0x55
0xa597635941e   29e  c4c3610edfaa   vpblendw xmm3,xmm3,xmm15,0xaa
0xa5976359424   2a4  c4c2612bdf     vpackusdw xmm3,xmm3,xmm15
0xa5976359429   2a9  c4417810f8     vmovups xmm15,xmm8
0xa597635942e   2ae  c4c10172d710   vpsrld xmm15,xmm15,16
0xa5976359434   2b4  c5c972d610     vpsrld xmm6,xmm6,16
0xa5976359439   2b9  c4c2492bf7     vpackusdw xmm6,xmm6,xmm15
0xa597635943e   2be  c57928c5       vmovapd xmm8,xmm5
0xa5976359442   2c2  c44101efff     vpxor xmm15,xmm15,xmm15
0xa5976359447   2c7  c443010ef955   vpblendw xmm15,xmm15,xmm9,0x55
0xa597635944d   2cd  c443390ec7aa   vpblendw xmm8,xmm8,xmm15,0xaa
0xa5976359453   2d3  c442392bc7     vpackusdw xmm8,xmm8,xmm15
0xa5976359458   2d8  c4417810f9     vmovups xmm15,xmm9
0xa597635945d   2dd  c4c10172d710   vpsrld xmm15,xmm15,16
0xa5976359463   2e3  c5d172d510     vpsrld xmm5,xmm5,16
0xa5976359468   2e8  c4c2512bef     vpackusdw xmm5,xmm5,xmm15
0xa597635946d   2ed  c57928cc       vmovapd xmm9,xmm4
0xa5976359471   2f1  c44101efff     vpxor xmm15,xmm15,xmm15
0xa5976359476   2f6  c443010efa55   vpblendw xmm15,xmm15,xmm10,0x55
0xa597635947c   2fc  c443310ecfaa   vpblendw xmm9,xmm9,xmm15,0xaa
0xa5976359482   302  c442312bcf     vpackusdw xmm9,xmm9,xmm15
0xa5976359487   307  c4417810fa     vmovups xmm15,xmm10
0xa597635948c   30c  c4c10172d710   vpsrld xmm15,xmm15,16
0xa5976359492   312  c5d972d410     vpsrld xmm4,xmm4,16
0xa5976359497   317  c4c2592be7     vpackusdw xmm4,xmm4,xmm15
0xa597635949c   31c  c4c141fdfb     vpaddw xmm7,xmm7,xmm11
0xa59763594a1   321  c5c9fdf3       vpaddw xmm6,xmm6,xmm3
0xa59763594a5   325  c4c151fde8     vpaddw xmm5,xmm5,xmm8
0xa59763594aa   32a  c4c159fde1     vpaddw xmm4,xmm4,xmm9
0xa59763594af   32f  c5f928de       vmovapd xmm3,xmm6
0xa59763594b3   333  c44101efff     vpxor xmm15,xmm15,xmm15
0xa59763594b8   338  c463010eff55   vpblendw xmm15,xmm15,xmm7,0x55
0xa59763594be   33e  c4c3610edfaa   vpblendw xmm3,xmm3,xmm15,0xaa
0xa59763594c4   344  c4c2612bdf     vpackusdw xmm3,xmm3,xmm15
0xa59763594c9   349  c57810ff       vmovups xmm15,xmm7
0xa59763594cd   34d  c4c10172d710   vpsrld xmm15,xmm15,16
0xa59763594d3   353  c5c972d610     vpsrld xmm6,xmm6,16
0xa59763594d8   358  c4c2492bf7     vpackusdw xmm6,xmm6,xmm15
0xa59763594dd   35d  c5f928fc       vmovapd xmm7,xmm4
0xa59763594e1   361  c44101efff     vpxor xmm15,xmm15,xmm15
0xa59763594e6   366  c463010efd55   vpblendw xmm15,xmm15,xmm5,0x55
0xa59763594ec   36c  c4c3410effaa   vpblendw xmm7,xmm7,xmm15,0xaa
0xa59763594f2   372  c4c2412bff     vpackusdw xmm7,xmm7,xmm15
0xa59763594f7   377  c57810fd       vmovups xmm15,xmm5
0xa59763594fb   37b  c4c10172d710   vpsrld xmm15,xmm15,16
0xa5976359501   381  c5d972d410     vpsrld xmm4,xmm4,16
0xa5976359506   386  c4c2592be7     vpackusdw xmm4,xmm4,xmm15
0xa597635950b   38b  c5c9fdf3       vpaddw xmm6,xmm6,xmm3
0xa597635950f   38f  c5d9fde7       vpaddw xmm4,xmm4,xmm7
0xa5976359513   393  c5c971d608     vpsrlw xmm6,xmm6,8
0xa5976359518   398  c5d971d408     vpsrlw xmm4,xmm4,8
0xa597635951d   39d  c5d967e6       vpackuswb xmm4,xmm4,xmm6
0xa5976359521   3a1  c5fa7f243b     vmovdqu [rbx+rdi*1],xmm4
0xa5976359526   3a6  443bda         cmpl r11,rdx
0xa5976359529   3a9  0f8601fdffff   jna 0xa5976359230  <+0xb0>
0xa597635952f   3af  488bc1         REX.W movq rax,rcx
0xa5976359532   3b2  488be5         REX.W movq rsp,rbp
0xa5976359535   3b5  5d             pop rbp
0xa5976359536   3b6  c3             retl
0xa5976359537   3b7  488955e8       REX.W movq [rbp-0x18],rdx
0xa597635953b   3bb  48895de0       REX.W movq [rbp-0x20],rbx
0xa597635953f   3bf  c5f81145d0     vmovups [rbp-0x30],xmm0
0xa5976359544   3c4  c5f8114dc0     vmovups [rbp-0x40],xmm1
0xa5976359549   3c9  c5f81155b0     vmovups [rbp-0x50],xmm2
0xa597635954e   3ce  488945a8       REX.W movq [rbp-0x58],rax
0xa5976359552   3d2  48897da0       REX.W movq [rbp-0x60],rdi
0xa5976359556   3d6  4c894598       REX.W movq [rbp-0x68],r8
0xa597635955a   3da  e8615dffff     call 0xa597634f2c0       ;; wasm stub: WasmStackGuard
0xa597635955f   3df  33c9           xorl rcx,rcx
0xa5976359561   3e1  488b55e8       REX.W movq rdx,[rbp-0x18]
0xa5976359565   3e5  488b5de0       REX.W movq rbx,[rbp-0x20]
0xa5976359569   3e9  c5f81045d0     vmovups xmm0,[rbp-0x30]
0xa597635956e   3ee  c5f8104dc0     vmovups xmm1,[rbp-0x40]
0xa5976359573   3f3  c5f81055b0     vmovups xmm2,[rbp-0x50]
0xa5976359578   3f8  488b45a8       REX.W movq rax,[rbp-0x58]
0xa597635957c   3fc  488b7da0       REX.W movq rdi,[rbp-0x60]
0xa5976359580   400  4c8b4598       REX.W movq r8,[rbp-0x68]
0xa5976359584   404  488b75f0       REX.W movq rsi,[rbp-0x10]
0xa5976359588   408  e9b6fcffff     jmp 0xa5976359243  <+0xc3>
0xa597635958d   40d  e8fe5affff     call 0xa597634f090       ;; wasm stub: ThrowWasmTrapMemOutOfBounds
0xa5976359592   412  e8f95affff     call 0xa597634f090       ;; wasm stub: ThrowWasmTrapMemOutOfBounds
0xa5976359597   417  e8f45affff     call 0xa597634f090       ;; wasm stub: ThrowWasmTrapMemOutOfBounds
0xa597635959c   41c  e8ef5affff     call 0xa597634f090       ;; wasm stub: ThrowWasmTrapMemOutOfBounds
0xa59763595a1   421  e8ea5affff     call 0xa597634f090       ;; wasm stub: ThrowWasmTrapMemOutOfBounds
0xa59763595a6   426  90             nop
0xa59763595a7   427  90             nop

Protected instructions:
 pc offset  land pad
       3a1       40d
        e2       412
        dc       417
        d6       41c
        d1       421

Source positions:
 pc offset  position
        d1        43
        d6       239
        dc       416
        e2       545
       3a1       722
       3b7        29
       40d       722
       412       545
       417       416
       41c       239
       421        43

Safepoints (size = 22)
0xa5a7635917fffffffff  000000000000000 (sp -> fp)

RelocInfo (size = 8)
0xa597635955b  wasm stub call
0xa597635958e  wasm stub call
0xa5976359593  wasm stub call
0xa5976359598  wasm stub call
0xa597635959d  wasm stub call
0xa59763595a2  wasm stub call

--- End code ---

Yeah, that asm output looks like a disaster, including the part where it builds vector constants from `mov`-immediate and `vpinsrw` / `vpinsrb` instead of using 4 or 8 byte chunks. A normal ahead-of-time build with GCC or clang would put the vector constants in `.rodata` (not data). — Peter Cordes, Sep 13 '20 at 21:08
Thanks for the .rodata comment. I've fixed the comment above. Did you notice how it's doing floating point instructions even though there are no floating point operations? Shouldn't that be using `movdqa` instead? — Dan Weber, Sep 13 '20 at 21:15
BTW, your intrinsics could maybe be more efficient: I think you're basically hsumming some integer multiply results. You can't quite use SSSE3 `pmaddubsw` because you need some multipliers greater than 128 and the other side is variable and unpacked with zero. So neither side can be safely treated as signed 8-bit. But I think you can use SSE2 `pmaddwd` (_mm_madd_epi16) to multiply and hsum to 32-bit in one step. If you need to pack back down to 16-bit unsigned, though, `_mm_packus_epi32` needs SSE4.1, and has saturation instead of wrapping. So maybe that's a showstopper. — Peter Cordes, Sep 13 '20 at 21:15
It's actually unsigned 8 bit data. Except there was no instruction that would allow me to do it as part of wasm simd. — Dan Weber, Sep 13 '20 at 21:18
Oh and trust me, I looked for maddubs when implementing this. It would have been my preferred approach. — Dan Weber, Sep 13 '20 at 21:19
`vmovaps` / `vmovapd` for register copying is fine on modern CPUs between integer SIMD instructions, especially with mov-elimination. It doesn't actually do any floating-point things to your data. But `movaps` would cause a bypass delay on Nehalem because FP and integer-SIMD domains. V8's instruction choices seem to be on acid; choosing `vmovapd` sometimes, even a non-VEX `movups xmm7,xmm2` between two AVX1 instructions somewhere else. Yes, `vmovdqa` would be what you'd expect. — Peter Cordes, Sep 13 '20 at 21:20
Let us [continue this discussion in chat](https://chat.stackoverflow.com/rooms/221423/discussion-between-dan-weber-and-peter-cordes). — Dan Weber, Sep 13 '20 at 21:24
Yeah, I see you unpacking it with zeros so I know it's unsigned; that was my point about `pmaddubsw` not being usable because you need to treat both inputs as unsigned (because some of the multiplier values are too large for signed 8-bit), but that instruction treats 1 input as signed, the other as unsigned. Hmm, maybe it is possible to multiply twice with 2 constants that add up to the right total, and vertical add. e.g. `{77,150/2,29,1,77,160/2,29,1}` and `{0,150/2,0,0,0,160/2,0,0}`. That would save a bunch of shuffles for unpacking, and still be the same amount of multiplies I think. — Peter Cordes, Sep 13 '20 at 21:26
I'm not concerned with overflow. Wouldn't it make more sense to just create an haddu? I could do the same shuffles, and do an unsigned add. — Dan Weber, Sep 13 '20 at 21:37
`hadd` doesn't saturate, it's just normal binary addition that wraps on overflow. So it works for unsigned and 2's complement. The point of my suggestion was that `hadd` costs 3 uops: 2 shuffles and a vertical add, but `packssdw` or `packusdw` are single-uop 2-input shuffles. `pmaddubsw` or `pmaddwd` are single-uop instructions handled by a single ALU. The throughput bottleneck in your code is probably the shuffle port on Intel CPUs (1 shuffle uop per clock starting with Haswell, until Ice Lake added another vector shuffle port that can handle some). — Peter Cordes, Sep 13 '20 at 21:48
Also, is your `wasm_v8x16_shuffle` just zeroing every 4th byte, the alpha channel? (shuffle control = 16? I don't know what that means; that's not how `_mm_shuffle_epi8` works but I guess this is something else). That could be done more efficiently with an AND. Anyway, this is far off topic from the actual question about V8 making a total mess of your intrinsics. No idea about that, and possibly a showstopper. — Peter Cordes, Sep 13 '20 at 21:50
The Rounder is replacing every 4th byte with 128. I'm going to think about that `packssdw` suggestion. I'm not sure if I'll be able to use it, but I can certainly try if it's worthwhile. — Dan Weber, Sep 13 '20 at 21:54
Oh, that's probably what `vpblendw` is doing; replacing words (uint16_t) *after* unpacking with zero. If you instead zero it and simply `_mm_add_epi16` something to the multiply result, that could achieve the same thing. Especially if you delay that add until after a bunch of hadd steps so it's only one add per 4 input vectors. — Peter Cordes, Sep 13 '20 at 22:28
I've added a ticket for emscripten [here](https://github.com/emscripten-core/emscripten/issues/12224) for the .rodata issue. Hopefully this gets resolved upstream. As an ASM question regarding shuffles -- does pshufd suffer the same slow down you're suggesting for pshufb? — Dan Weber, Sep 15 '20 at 18:21
I'm not suggesting any particular slowdown for `vpshufb`, just that it can run on fewer ports than `vpand`. Both are single uop with 1c latency, but `vpand` can run on any of the 3 vector ALU ports, shuffles only on one. On modern Intel and AMD, `vpshufb` and `vpshufd` have identical performance. https://uops.info/ — Peter Cordes, Sep 15 '20 at 18:26

score 2 · Answer 1 · answered Sep 15 '20 at 20:42

2

Copying my answer from the Emscripten issue:

The reason we don't use v128.const for this is that v128.const was only recently implemented in V8. To avoid breaking origin trial users, we can't update LLVM to emit v128.const until the relevant V8 patches roll into Chrome stable. I'm keeping an eye on this dashboard to determine when will be a good time to make this change. If you're using a more recent build of Chrome or some other execution environment that does support v128.const, you can try compiling your project with the -munimplemented-simd128 flag, which will enable v128.const in LLVM (but might also introduce other changes that you don't want). Once v128.const is widely available, it will be better for LLVM to use v128.const than to load vectors from memory because that allows the engine to determine the best way to materialize vectors given the runtime platform.

It also might be worth considering porting performance-sensitive parts of your code to use the WebAssembly intrinsics header directly rather than relying on emulated SSE. That would reduce a layer of impedance mismatch between your code and the underlying machine code.

Finally, if you notice suboptimal instruction selection anywhere, it would be helpful if you could file LLVM bugs (if it's on the code -> wasm side) or V8 bugs (if it's on the wasm -> native side) about the specific issues you see. That kind of feedback is extremely valuable to us.

answered Sep 15 '20 at 20:42

tlively

116
3

1

Do you need someone to file bugs for the vector-constant disaster we can see in the OP's example? Is it just expected that those `i16x8.replace_lane` WASM instructions get JITed into actual `vpinsrw` and `vpinsrb` instructions, instead of at least `mov reg, imm64` and `vmovq`? Maybe to work around that, the LLVM code that avoids `v128.const` could break constants down into 32 or 64-bit chunks instead of 8-bit, to hold V8's hand if it's not going to try to optimize a sequence of elements into a vector constant for you. – Peter Cordes Sep 15 '20 at 21:13
The C++ -> x86 asm in the question contains several different weird things, including that but also the odd choices of instruction for vector register copying. I'm not sure there are any actual slowdowns though, on modern x86 with mov-elimination. If you're going to mix AVX and legacy SSE encodings (with clean upper-128 lanes), `movups xmm, xmm` does save a byte of code size vs. `movdqa` but risks bypass latency between vec-int insns unless this is only when jitting for something with mov-elimination. AFAIK it is fine to use unaligned `movups` or `vmovups` between regs on CPUs with AVX. – Peter Cordes Sep 15 '20 at 21:18
1

@PeterCordes my comment on the ticket is -- if they have support for storing strings in .rodata, why not support loading the vectors from that in the meantime -- is it as optimal as a just in time determination? No. But it's gotta be better than this replace_lane stuff. – Dan Weber Sep 15 '20 at 22:48
1

It is expected that V8 translates replace_lane instructions literally to the corresponding native code without further pattern matching. In general for WebAssembly, as much codegen optimization as possible is expected to be done by toolchains rather than engines. Using 32-bit or 64-bit chunks is a good idea. I won't be able to get to this soon, but the relevant code is [here](https://github.com/llvm/llvm-project/blob/c193a689b475f91e63adb25dc5855f7a7f068c9a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp#L1451-L1610). The replace_lanes are added right at the end. – tlively Sep 16 '20 at 22:24

score 0 · Accepted Answer · answered Sep 25 '20 at 02:57

@PeterCordes

I've made some changes to the LLVM implementation and am testing it now. What do you think of the generated ASM code for the constants? I think it's MUCH better, but would like a second opinion.

--- WebAssembly code ---
index: 3
kind: wasm function
compiler: TurboFan
Body (size = 1280 = 1278 + 2 padding)
Instructions (size = 1256)
0xfbcf162d3c0     0  55             push rbp
0xfbcf162d3c1     1  4889e5         REX.W movq rbp,rsp
0xfbcf162d3c4     4  6a0a           push 0xa
0xfbcf162d3c6     6  56             push rsi
0xfbcf162d3c7     7  4883ec50       REX.W subq rsp,0x50
0xfbcf162d3cb     b  488b4e17       REX.W movq rcx,[rsi+0x17]
0xfbcf162d3cf     f  488bd8         REX.W movq rbx,rax
0xfbcf162d3d2    12  83fa40         cmpl rdx,0x40
0xfbcf162d3d5    15  0f8308000000   jnc 0xfbcf162d3e3  <+0x23>
0xfbcf162d3db    1b  4533c0         xorl r8,r8
0xfbcf162d3de    1e  e950030000     jmp 0xfbcf162d733  <+0x373>
0xfbcf162d3e3    23  48bf8080808080808080 REX.W movq rdi,0x8080808080808080
0xfbcf162d3ed    2d  c4e1f96ec7     vmovq xmm0,rdi
0xfbcf162d3f2    32  c5fb12c0       vmovddup xmm0,xmm0
0xfbcf162d3f6    36  48bf4d0096001d000100 REX.W movq rdi,0x1001d0096004d
0xfbcf162d400    40  c4e1f96ecf     vmovq xmm1,rdi
0xfbcf162d405    45  c5fb12c9       vmovddup xmm1,xmm1
0xfbcf162d409    49  bf04000000     movl rdi,0x4
0xfbcf162d40e    4e  4533c9         xorl r9,r9
0xfbcf162d411    51  4c8bc7         REX.W movq r8,rdi
0xfbcf162d414    54  e90d000000     jmp 0xfbcf162d426  <+0x66>
0xfbcf162d419    59  0f1f8000000000 nop
0xfbcf162d420    60  4d8bc8         REX.W movq r9,r8
0xfbcf162d423    63  4d8bc3         REX.W movq r8,r11
0xfbcf162d426    66  4c8b5e2f       REX.W movq r11,[rsi+0x2f]
0xfbcf162d42a    6a  493b23         REX.W cmpq rsp,[r11]
0xfbcf162d42d    6d  0f86aa030000   jna 0xfbcf162d7dd  <+0x41d>
0xfbcf162d433    73  458d5804       leal r11,[r8+0x4]
0xfbcf162d437    77  4d8be3         REX.W movq r12,r11
0xfbcf162d43a    7a  41c1e404       shll r12, 4
0xfbcf162d43e    7e  468d0c8b       leal r9,[rbx+r9*4]
0xfbcf162d442    82  c4a17a6f1409   vmovdqu xmm2,[rcx+r9*1]
0xfbcf162d448    88  c4c17a6f5c0910 vmovdqu xmm3,[r9+rcx*1+0x10]
0xfbcf162d44f    8f  c4c17a6f640920 vmovdqu xmm4,[r9+rcx*1+0x20]
0xfbcf162d456    96  c4c17a6f6c0930 vmovdqu xmm5,[r9+rcx*1+0x30]
0xfbcf162d45d    9d  c57810fd       vmovups xmm15,xmm5
0xfbcf162d461    a1  49ba0001028004050680 REX.W movq r10,0x8006050480020100
0xfbcf162d46b    ab  c4c1f96efa     vmovq xmm7,r10
0xfbcf162d470    b0  49ba08090a800c0d0e80 REX.W movq r10,0x800e0d0c800a0908
0xfbcf162d47a    ba  c4c3c122fa01   vpinsrq xmm7,xmm7,r10,0x1
0xfbcf162d480    c0  c4620100ff     vpshufb xmm15,xmm15,xmm7
0xfbcf162d485    c5  0f10f0         movups xmm6,xmm0
0xfbcf162d488    c8  49ba8080800080808000 REX.W movq r10,0x80808000808080
0xfbcf162d492    d2  c4c1f96efa     vmovq xmm7,r10
0xfbcf162d497    d7  4c8b15ecffffff REX.W movq r10,[rip+0xffffffec]
0xfbcf162d49e    de  c4c3c122fa01   vpinsrq xmm7,xmm7,r10,0x1
0xfbcf162d4a4    e4  c4e24900f7     vpshufb xmm6,xmm6,xmm7
0xfbcf162d4a9    e9  c4c149ebf7     vpor xmm6,xmm6,xmm15
0xfbcf162d4ae    ee  c57810fc       vmovups xmm15,xmm4
0xfbcf162d4b2    f2  4c8b15aaffffff REX.W movq r10,[rip+0xffffffaa]
0xfbcf162d4b9    f9  c4c1f96efa     vmovq xmm7,r10
0xfbcf162d4be    fe  4c8b15adffffff REX.W movq r10,[rip+0xffffffad]
0xfbcf162d4c5   105  c4c3c122fa01   vpinsrq xmm7,xmm7,r10,0x1
0xfbcf162d4cb   10b  c4620100ff     vpshufb xmm15,xmm15,xmm7
0xfbcf162d4d0   110  0f10e8         movups xmm5,xmm0
0xfbcf162d4d3   113  4c8b15b0ffffff REX.W movq r10,[rip+0xffffffb0]
0xfbcf162d4da   11a  c4c1f96efa     vmovq xmm7,r10
0xfbcf162d4df   11f  4c8b15a4ffffff REX.W movq r10,[rip+0xffffffa4]
0xfbcf162d4e6   126  c4c3c122fa01   vpinsrq xmm7,xmm7,r10,0x1
0xfbcf162d4ec   12c  c4e25100ef     vpshufb xmm5,xmm5,xmm7
0xfbcf162d4f1   131  c4c151ebef     vpor xmm5,xmm5,xmm15
0xfbcf162d4f6   136  c57810fb       vmovups xmm15,xmm3
0xfbcf162d4fa   13a  4c8b1562ffffff REX.W movq r10,[rip+0xffffff62]
0xfbcf162d501   141  c4c1f96efa     vmovq xmm7,r10
0xfbcf162d506   146  4c8b1565ffffff REX.W movq r10,[rip+0xffffff65]
0xfbcf162d50d   14d  c4c3c122fa01   vpinsrq xmm7,xmm7,r10,0x1
0xfbcf162d513   153  c4620100ff     vpshufb xmm15,xmm15,xmm7
0xfbcf162d518   158  0f10e0         movups xmm4,xmm0
0xfbcf162d51b   15b  4c8b1568ffffff REX.W movq r10,[rip+0xffffff68]
0xfbcf162d522   162  c4c1f96efa     vmovq xmm7,r10
0xfbcf162d527   167  4c8b155cffffff REX.W movq r10,[rip+0xffffff5c]
0xfbcf162d52e   16e  c4c3c122fa01   vpinsrq xmm7,xmm7,r10,0x1
0xfbcf162d534   174  c4e25900e7     vpshufb xmm4,xmm4,xmm7
0xfbcf162d539   179  c4c159ebe7     vpor xmm4,xmm4,xmm15
0xfbcf162d53e   17e  c57810fa       vmovups xmm15,xmm2
0xfbcf162d542   182  4c8b151affffff REX.W movq r10,[rip+0xffffff1a]
0xfbcf162d549   189  c4c1f96efa     vmovq xmm7,r10
0xfbcf162d54e   18e  4c8b151dffffff REX.W movq r10,[rip+0xffffff1d]
0xfbcf162d555   195  c4c3c122fa01   vpinsrq xmm7,xmm7,r10,0x1
0xfbcf162d55b   19b  c4620100ff     vpshufb xmm15,xmm15,xmm7
0xfbcf162d560   1a0  0f10d8         movups xmm3,xmm0
0xfbcf162d563   1a3  4c8b1520ffffff REX.W movq r10,[rip+0xffffff20]
0xfbcf162d56a   1aa  c4c1f96efa     vmovq xmm7,r10
0xfbcf162d56f   1af  4c8b1514ffffff REX.W movq r10,[rip+0xffffff14]
0xfbcf162d576   1b6  c4c3c122fa01   vpinsrq xmm7,xmm7,r10,0x1
0xfbcf162d57c   1bc  c4e26100df     vpshufb xmm3,xmm3,xmm7
0xfbcf162d581   1c1  c4c161ebdf     vpor xmm3,xmm3,xmm15
0xfbcf162d586   1c6  c4e3690fd608   vpalignr xmm2,xmm2,xmm6,0x8
0xfbcf162d58c   1cc  c4e27930d2     vpmovzxbw xmm2,xmm2
0xfbcf162d591   1d1  c4e27930f6     vpmovzxbw xmm6,xmm6
0xfbcf162d596   1d6  c4e3410ffd08   vpalignr xmm7,xmm7,xmm5,0x8
0xfbcf162d59c   1dc  c4e27930ff     vpmovzxbw xmm7,xmm7
0xfbcf162d5a1   1e1  c4e27930ed     vpmovzxbw xmm5,xmm5
0xfbcf162d5a6   1e6  c463390fc408   vpalignr xmm8,xmm8,xmm4,0x8
0xfbcf162d5ac   1ec  c4427930c0     vpmovzxbw xmm8,xmm8
0xfbcf162d5b1   1f1  c4e27930e4     vpmovzxbw xmm4,xmm4
0xfbcf162d5b6   1f6  c463310fcb08   vpalignr xmm9,xmm9,xmm3,0x8
0xfbcf162d5bc   1fc  c4427930c9     vpmovzxbw xmm9,xmm9
0xfbcf162d5c1   201  c4e27930db     vpmovzxbw xmm3,xmm3
0xfbcf162d5c6   206  c5e9d5d1       vpmullw xmm2,xmm2,xmm1
0xfbcf162d5ca   20a  c5c9d5f1       vpmullw xmm6,xmm6,xmm1
0xfbcf162d5ce   20e  c5c1d5f9       vpmullw xmm7,xmm7,xmm1
0xfbcf162d5d2   212  c5d1d5e9       vpmullw xmm5,xmm5,xmm1
0xfbcf162d5d6   216  c539d5c1       vpmullw xmm8,xmm8,xmm1
0xfbcf162d5da   21a  c5d9d5e1       vpmullw xmm4,xmm4,xmm1
0xfbcf162d5de   21e  c531d5c9       vpmullw xmm9,xmm9,xmm1
0xfbcf162d5e2   222  c5e1d5d9       vpmullw xmm3,xmm3,xmm1
0xfbcf162d5e6   226  c57928d6       vmovapd xmm10,xmm6
0xfbcf162d5ea   22a  c44101efff     vpxor xmm15,xmm15,xmm15
0xfbcf162d5ef   22f  c463010efa55   vpblendw xmm15,xmm15,xmm2,0x55
0xfbcf162d5f5   235  c443290ed7aa   vpblendw xmm10,xmm10,xmm15,0xaa
0xfbcf162d5fb   23b  c442292bd7     vpackusdw xmm10,xmm10,xmm15
0xfbcf162d600   240  c57810fa       vmovups xmm15,xmm2
0xfbcf162d604   244  c4c10172d710   vpsrld xmm15,xmm15,16
0xfbcf162d60a   24a  c5c972d610     vpsrld xmm6,xmm6,16
0xfbcf162d60f   24f  c4c2492bf7     vpackusdw xmm6,xmm6,xmm15
0xfbcf162d614   254  c5f928d5       vmovapd xmm2,xmm5
0xfbcf162d618   258  c44101efff     vpxor xmm15,xmm15,xmm15
0xfbcf162d61d   25d  c463010eff55   vpblendw xmm15,xmm15,xmm7,0x55
0xfbcf162d623   263  c4c3690ed7aa   vpblendw xmm2,xmm2,xmm15,0xaa
0xfbcf162d629   269  c4c2692bd7     vpackusdw xmm2,xmm2,xmm15
0xfbcf162d62e   26e  c57810ff       vmovups xmm15,xmm7
0xfbcf162d632   272  c4c10172d710   vpsrld xmm15,xmm15,16
0xfbcf162d638   278  c5d172d510     vpsrld xmm5,xmm5,16
0xfbcf162d63d   27d  c4c2512bef     vpackusdw xmm5,xmm5,xmm15
0xfbcf162d642   282  c5f928fc       vmovapd xmm7,xmm4
0xfbcf162d646   286  c44101efff     vpxor xmm15,xmm15,xmm15
0xfbcf162d64b   28b  c443010ef855   vpblendw xmm15,xmm15,xmm8,0x55
0xfbcf162d651   291  c4c3410effaa   vpblendw xmm7,xmm7,xmm15,0xaa
0xfbcf162d657   297  c4c2412bff     vpackusdw xmm7,xmm7,xmm15
0xfbcf162d65c   29c  c4417810f8     vmovups xmm15,xmm8
0xfbcf162d661   2a1  c4c10172d710   vpsrld xmm15,xmm15,16
0xfbcf162d667   2a7  c5d972d410     vpsrld xmm4,xmm4,16
0xfbcf162d66c   2ac  c4c2592be7     vpackusdw xmm4,xmm4,xmm15
0xfbcf162d671   2b1  c57928c3       vmovapd xmm8,xmm3
0xfbcf162d675   2b5  c44101efff     vpxor xmm15,xmm15,xmm15
0xfbcf162d67a   2ba  c443010ef955   vpblendw xmm15,xmm15,xmm9,0x55
0xfbcf162d680   2c0  c443390ec7aa   vpblendw xmm8,xmm8,xmm15,0xaa
0xfbcf162d686   2c6  c442392bc7     vpackusdw xmm8,xmm8,xmm15
0xfbcf162d68b   2cb  c4417810f9     vmovups xmm15,xmm9
0xfbcf162d690   2d0  c4c10172d710   vpsrld xmm15,xmm15,16
0xfbcf162d696   2d6  c5e172d310     vpsrld xmm3,xmm3,16
0xfbcf162d69b   2db  c4c2612bdf     vpackusdw xmm3,xmm3,xmm15
0xfbcf162d6a0   2e0  c4c149fdf2     vpaddw xmm6,xmm6,xmm10
0xfbcf162d6a5   2e5  c5d1fdea       vpaddw xmm5,xmm5,xmm2
0xfbcf162d6a9   2e9  c5d9fde7       vpaddw xmm4,xmm4,xmm7
0xfbcf162d6ad   2ed  c4c161fdd8     vpaddw xmm3,xmm3,xmm8
0xfbcf162d6b2   2f2  c5f928d5       vmovapd xmm2,xmm5
0xfbcf162d6b6   2f6  c44101efff     vpxor xmm15,xmm15,xmm15
0xfbcf162d6bb   2fb  c463010efe55   vpblendw xmm15,xmm15,xmm6,0x55
0xfbcf162d6c1   301  c4c3690ed7aa   vpblendw xmm2,xmm2,xmm15,0xaa
0xfbcf162d6c7   307  c4c2692bd7     vpackusdw xmm2,xmm2,xmm15
0xfbcf162d6cc   30c  c57810fe       vmovups xmm15,xmm6
0xfbcf162d6d0   310  c4c10172d710   vpsrld xmm15,xmm15,16
0xfbcf162d6d6   316  c5d172d510     vpsrld xmm5,xmm5,16
0xfbcf162d6db   31b  c4c2512bef     vpackusdw xmm5,xmm5,xmm15
0xfbcf162d6e0   320  c5f928f3       vmovapd xmm6,xmm3
0xfbcf162d6e4   324  c44101efff     vpxor xmm15,xmm15,xmm15
0xfbcf162d6e9   329  c463010efc55   vpblendw xmm15,xmm15,xmm4,0x55
0xfbcf162d6ef   32f  c4c3490ef7aa   vpblendw xmm6,xmm6,xmm15,0xaa
0xfbcf162d6f5   335  c4c2492bf7     vpackusdw xmm6,xmm6,xmm15
0xfbcf162d6fa   33a  c57810fc       vmovups xmm15,xmm4
0xfbcf162d6fe   33e  c4c10172d710   vpsrld xmm15,xmm15,16
0xfbcf162d704   344  c5e172d310     vpsrld xmm3,xmm3,16
0xfbcf162d709   349  c4c2612bdf     vpackusdw xmm3,xmm3,xmm15
0xfbcf162d70e   34e  c5d1fdea       vpaddw xmm5,xmm5,xmm2
0xfbcf162d712   352  c5e1fdde       vpaddw xmm3,xmm3,xmm6
0xfbcf162d716   356  c5d171d508     vpsrlw xmm5,xmm5,8
0xfbcf162d71b   35b  c5e171d308     vpsrlw xmm3,xmm3,8
0xfbcf162d720   360  c5e167dd       vpackuswb xmm3,xmm3,xmm5
0xfbcf162d724   364  c4a17a7f1c09   vmovdqu [rcx+r9*1],xmm3
0xfbcf162d72a   36a  443be2         cmpl r12,rdx
0xfbcf162d72d   36d  0f86edfcffff   jna 0xfbcf162d420  <+0x60>
0xfbcf162d733   373  33ff           xorl rdi,rdi
0xfbcf162d735   375  41b904000000   movl r9,0x4
0xfbcf162d73b   37b  4183f9ff       cmpl r9,0xff
0xfbcf162d73f   37f  0f84e7000000   jz 0xfbcf162d82c  <+0x46c>
0xfbcf162d745   385  41c1e004       shll r8, 4
0xfbcf162d749   389  488bc2         REX.W movq rax,rdx
0xfbcf162d74c   38c  99             cdql
0xfbcf162d74d   38d  41f7f9         idivl r9
0xfbcf162d750   390  428d1403       leal rdx,[rbx+r8*1]
0xfbcf162d754   394  03d8           addl rbx,rax
0xfbcf162d756   396  3bd3           cmpl rdx,rbx
0xfbcf162d758   398  0f8777000000   ja 0xfbcf162d7d5  <+0x415>
0xfbcf162d75e   39e  4c8bc7         REX.W movq r8,rdi
0xfbcf162d761   3a1  4c8bca         REX.W movq r9,rdx
0xfbcf162d764   3a4  e90d000000     jmp 0xfbcf162d776  <+0x3b6>
0xfbcf162d769   3a9  0f1f8000000000 nop
0xfbcf162d770   3b0  4d8bc3         REX.W movq r8,r11
0xfbcf162d773   3b3  4d89e1         REX.W movq r9,r12
0xfbcf162d776   3b6  4c8b5e2f       REX.W movq r11,[rsi+0x2f]
0xfbcf162d77a   3ba  493b23         REX.W cmpq rsp,[r11]
0xfbcf162d77d   3bd  0f86ba000000   jna 0xfbcf162d83d  <+0x47d>
0xfbcf162d783   3c3  458d5804       leal r11,[r8+0x4]
0xfbcf162d787   3c7  468d241a       leal r12,[rdx+r11*1]
0xfbcf162d78b   3cb  4d8bf0         REX.W movq r14,r8
0xfbcf162d78e   3ce  4183ce01       orl r14,0x1
0xfbcf162d792   3d2  458bc9         movl r9,r9
0xfbcf162d795   3d5  4403f2         addl r14,rdx
0xfbcf162d798   3d8  4183c802       orl r8,0x2
0xfbcf162d79c   3dc  460fb63c09     movzxbl r15,[rcx+r9*1]
0xfbcf162d7a1   3e1  4403c2         addl r8,rdx
0xfbcf162d7a4   3e4  460fb63431     movzxbl r14,[rcx+r14*1]
0xfbcf162d7a9   3e9  460fb60401     movzxbl r8,[rcx+r8*1]
0xfbcf162d7ae   3ee  4569f696000000 imull r14,r14,0x96
0xfbcf162d7b5   3f5  456bff4d       imull r15,r15,0x4d
0xfbcf162d7b9   3f9  456bc01d       imull r8,r8,0x1d
0xfbcf162d7bd   3fd  4503f7         addl r14,r15
0xfbcf162d7c0   400  478d843080000000 leal r8,[r8+r14*1+0x80]
0xfbcf162d7c8   408  41c1e808       shrl r8, 8
0xfbcf162d7cc   40c  46880409       movb [rcx+r9*1],r8l
0xfbcf162d7d0   410  443be3         cmpl r12,rbx
0xfbcf162d7d3   413  769b           jna 0xfbcf162d770  <+0x3b0>
0xfbcf162d7d5   415  488bc7         REX.W movq rax,rdi
0xfbcf162d7d8   418  488be5         REX.W movq rsp,rbp
0xfbcf162d7db   41b  5d             pop rbp
0xfbcf162d7dc   41c  c3             retl
0xfbcf162d7dd   41d  48894de0       REX.W movq [rbp-0x20],rcx
0xfbcf162d7e1   421  48895de8       REX.W movq [rbp-0x18],rbx
0xfbcf162d7e5   425  488955d8       REX.W movq [rbp-0x28],rdx
0xfbcf162d7e9   429  4c8945d0       REX.W movq [rbp-0x30],r8
0xfbcf162d7ed   42d  c5f8114db0     vmovups [rbp-0x50],xmm1
0xfbcf162d7f2   432  c5f81145a0     vmovups [rbp-0x60],xmm0
0xfbcf162d7f7   437  4c894dc8       REX.W movq [rbp-0x38],r9
0xfbcf162d7fb   43b  e8c04affff     call 0xfbcf16222c0       ;; wasm stub: WasmStackGuard
0xfbcf162d800   440  488b4de0       REX.W movq rcx,[rbp-0x20]
0xfbcf162d804   444  488b75f0       REX.W movq rsi,[rbp-0x10]
0xfbcf162d808   448  488b5de8       REX.W movq rbx,[rbp-0x18]
0xfbcf162d80c   44c  bf04000000     movl rdi,0x4
0xfbcf162d811   451  488b55d8       REX.W movq rdx,[rbp-0x28]
0xfbcf162d815   455  4c8b45d0       REX.W movq r8,[rbp-0x30]
0xfbcf162d819   459  c5f8104db0     vmovups xmm1,[rbp-0x50]
0xfbcf162d81e   45e  c5f81045a0     vmovups xmm0,[rbp-0x60]
0xfbcf162d823   463  4c8b4dc8       REX.W movq r9,[rbp-0x38]
0xfbcf162d827   467  e907fcffff     jmp 0xfbcf162d433  <+0x73>
0xfbcf162d82c   46c  81fa00000080   cmpl rdx,0x80000000
0xfbcf162d832   472  0f843d000000   jz 0xfbcf162d875  <+0x4b5>
0xfbcf162d838   478  e908ffffff     jmp 0xfbcf162d745  <+0x385>
0xfbcf162d83d   47d  48895de8       REX.W movq [rbp-0x18],rbx
0xfbcf162d841   481  48894de0       REX.W movq [rbp-0x20],rcx
0xfbcf162d845   485  488955d8       REX.W movq [rbp-0x28],rdx
0xfbcf162d849   489  4c8945d0       REX.W movq [rbp-0x30],r8
0xfbcf162d84d   48d  4c894dc8       REX.W movq [rbp-0x38],r9
0xfbcf162d851   491  e86a4affff     call 0xfbcf16222c0       ;; wasm stub: WasmStackGuard
0xfbcf162d856   496  33ff           xorl rdi,rdi
0xfbcf162d858   498  488b5de8       REX.W movq rbx,[rbp-0x18]
0xfbcf162d85c   49c  488b4de0       REX.W movq rcx,[rbp-0x20]
0xfbcf162d860   4a0  488b55d8       REX.W movq rdx,[rbp-0x28]
0xfbcf162d864   4a4  4c8b45d0       REX.W movq r8,[rbp-0x30]
0xfbcf162d868   4a8  4c8b4dc8       REX.W movq r9,[rbp-0x38]
0xfbcf162d86c   4ac  488b75f0       REX.W movq rsi,[rbp-0x10]
0xfbcf162d870   4b0  e90effffff     jmp 0xfbcf162d783  <+0x3c3>
0xfbcf162d875   4b5  e84648ffff     call 0xfbcf16220c0       ;; wasm stub: ThrowWasmTrapDivUnrepresentable
0xfbcf162d87a   4ba  e81148ffff     call 0xfbcf1622090       ;; wasm stub: ThrowWasmTrapMemOutOfBounds
0xfbcf162d87f   4bf  e80c48ffff     call 0xfbcf1622090       ;; wasm stub: ThrowWasmTrapMemOutOfBounds
0xfbcf162d884   4c4  e80748ffff     call 0xfbcf1622090       ;; wasm stub: ThrowWasmTrapMemOutOfBounds
0xfbcf162d889   4c9  e80248ffff     call 0xfbcf1622090       ;; wasm stub: ThrowWasmTrapMemOutOfBounds
0xfbcf162d88e   4ce  e8fd47ffff     call 0xfbcf1622090       ;; wasm stub: ThrowWasmTrapMemOutOfBounds
0xfbcf162d893   4d3  e8f847ffff     call 0xfbcf1622090       ;; wasm stub: ThrowWasmTrapMemOutOfBounds
0xfbcf162d898   4d8  e8f347ffff     call 0xfbcf1622090       ;; wasm stub: ThrowWasmTrapMemOutOfBounds
0xfbcf162d89d   4dd  e8ee47ffff     call 0xfbcf1622090       ;; wasm stub: ThrowWasmTrapMemOutOfBounds
0xfbcf162d8a2   4e2  e8e947ffff     call 0xfbcf1622090       ;; wasm stub: ThrowWasmTrapMemOutOfBounds
0xfbcf162d8a7   4e7  90             nop

Yes, that's much better for constant construction. Still not as good as broadcast-loading them from memory with a RIP-relative addressing mode or with a pointer arg pointing to some place where the JIT put the constants. Also, `0x8080808080808080` could be done with a 4-byte broadcast, avoiding a bulky `mov r64, imm64`. (Use AVX1 `vpshufd xmm0, xmm0, 0` or AVX2 `vpbroadcastd` instead of `vmovddup`). Also, `vmovddup` is an FP shuffle; a better choice to broadcast the low element of a *register* on some CPUs might be `vpunpcklqdq`. Runs on more ports on Sandybridge, and could avoid bypass latency — Peter Cordes, Sep 25 '20 at 03:39
Also for that `0x80...` bit-pattern specifically, it can be constructed on the fly in 3 instructions: `vpcmpeqd xmm0,xmm0` / `vpabsb xmm0,xmm0` / `vpslld xmm0, 7`. If you're not going to load it from memory, that's a slightly more efficient way to construct it than mov-imm64 / movq / vmovddup. [What are the best instruction sequences to generate vector constants on the fly?](https://stackoverflow.com/q/35085059) — Peter Cordes, Sep 25 '20 at 03:46
`movq r10,[rip+0xffffffec]` / `vpinsrq xmm7,xmm7,r10,0x1` look really silly. A memory-source `vpinsrq` is *much* better than a GP-integer register source. A register source costs an integer->xmm transfer uop (port 5) plus another port 5 uop to merge. But a memory source is a load uop + a port 5 uop. So the separate load into R10 is just pure downside in total uops, port 5 bottleneck, and code size. Also, `0xffffffec` is only `-20`, so it's loading data from a previous instruction, specifically the `0x80808000808080` that's already in xmm7 on that path of execution, so just `vpunpcklqdq`. — Peter Cordes, Sep 25 '20 at 03:55
Also, a `pshufb` shuffle-mask of `set1_epi64(0x80808000808080)` is just copying the low byte to bytes 4 and 12, and zeroing everything else. That seems weird, like something you could do more easily a different way. If constructing mask constants is this inefficient, you could `psllq xmm, 56` / `psrld xmm, 24` to shift the original byte 0 to byte 4, zeroing the rest of the low qword. Then replicate it with `punpcklqdq xmm, xmm`. But if you can create that mask constant non-horribly (e.g. with one RIP-relative `vmovddup` broadcast-load), `vpshufb` is better. — Peter Cordes, Sep 25 '20 at 04:04
I think you'll like this solution better. https://gist.github.com/omnisip/f7ad4f886b91911884840883639a427c — Dan Weber, Sep 25 '20 at 04:06
`movups xmm6,xmm0` is wasted: It's just setting up for `vpshufb xmm6,xmm6,xmm7` which should have been `vpshufb xmm6,xmm0,xmm7` instead. So yeah, this code is still full of obvious (and not-so-obvious) missed optimizations. — Peter Cordes, Sep 25 '20 at 04:06
Once I got rid of the shuffles at the beginning of the function, I was able to get rid of them altogether. I add the rounding numbers in 2 adds at the very end, and it removes all of the pshufb/pinsr nonsense. — Dan Weber, Sep 25 '20 at 04:11
`vpalignr xmm6,xmm6,xmm5,0x8` in that gist is reading XMM6 without having written it. I assume it's just picking a dummy register to read from? Don't introduce a potential false dependency. But you only care about the low half of the result (for `vpmovzxbw`) so actually just use a byte shift that shifts in zeros, `vpsrldq xmm6, xmm5, 8`. Or even better, unpack the high half of the original vector with zeros by zeroing a vector `vpxor xmm7, xmm7, xmm7` once for the whole function, then [`vpunpckhbw xmm6, xmm5, xmm7`](//www.felixcloutier.com/x86/punpckhbw:punpckhwd:punpckhdq:punpckhqdq) — Peter Cordes, Sep 25 '20 at 04:14
I'm working with what I have here -- https://github.com/WebAssembly/simd/blob/master/proposals/simd/SIMD.md -- but I might be able to do a load and extend if that's better than what it's currently doing... — Dan Weber, Sep 25 '20 at 04:20
@PeterCordes Here's one that directly unpacks from memory. https://gist.github.com/omnisip/9b6ee85a42230c3697c69e2ac6a0f1a4 I think it's a bit slower than the previous version believe it or not. — Dan Weber, Sep 25 '20 at 04:54
With a simple addressing mode (1 register), `vpmovzxbw xmm, [mem]` can micro-fuse into 1 uop. But with an indexed addressing mode, it un-laminates into 2 uops at issue/rename. But to take advantage of that, you'd have to increment 8 pointers separately. So separate load + `vpunpcklbw` / `vpunpckhbw` against zero is probably your best bet inside the loop. Also, there's a bunch of pointless looking integer work storing and reloading stuff, including RAX right before using it as a byte offset. — Peter Cordes, Sep 25 '20 at 05:09
Instructions like `vpblendw xmm15,xmm15,xmm9,0x55` are blending with a zeroed register. On Sandybridge-family including Skylake, `vpblendw` can only run on port 5, so that's bad for port 5 (shuffle port) pressure compared to an AND mask, keeping a couple more vector constants in registers, if you have enough regs. xmm1 appears unused inside the loop. Letting out-of-order exec + register renaming do its job would let you reuse some registers, e.g. by doing some of the `vpmovzxbw` loads later. — Peter Cordes, Sep 25 '20 at 05:14
It's part of a sequence like xor-zero xmm15, `vpblendw xmm15,xmm15,xmm7,0x55` / `vpblendw xmm9,xmm9,xmm15,0xaa` / `vpackusdw xmm9,xmm9,xmm15`. I think it's really just blending xmm7 and xmm9 together 2 different ways with zeros and then packing to feed a `vpaddw`. It's super weird that the 2nd blend uses zeros (I think) from the first, rather than both independently reading zeros from XMM15. And yes, that should either be two `vpand`s to feed the pack instruction, or find some way to use `phaddw` to combine word elements that way instead of zero odd/even and pack. — Peter Cordes, Sep 25 '20 at 05:24

How can I force emscripten/em++/llvm to load constants from .rodata and/or perform better SIMD optimization?

2 Answers2