GCC accessing thread local storage unnecessarily?
I'm running down a numeric difference between GCC and ICC, and I've noticed something puzzling. When compiling this function (which computes an approximate natural logarithm):
// convenience macros for defining constant vectors
#define MM256_ICONST(val) (__m256)_mm256_set1_epi32(val)
#define MM256_FCONST(val) _mm256_set1_ps(val)

__m256 _mm256_logf_app(__m256 xx) {
    // constants
    const __m256 mant_mask = MM256_ICONST(0x007FFFFF); // mantissa mask
    const __m256 exp_mask  = MM256_ICONST(0x3F800000); // 127 in exponent
    const __m256 mln2      = MM256_FCONST(M_LN2);      // log(2)
    // polynomial coefficients
    const __m256 a = MM256_FCONST(+3.529304993f);
    const __m256 b = MM256_FCONST(-2.461222105f);
    const __m256 c = MM256_FCONST(+1.130626167f);
    const __m256 d = MM256_FCONST(-0.288739945f);
    const __m256 e = MM256_FCONST(+3.110401639e-2f);
    const __m256 f = MM256_FCONST(-89.970756366f);
    // mask out anything <= 0
    __m256 invalid = _mm256_cmp_ps(xx, _mm256_setzero_ps(), _CMP_LE_OQ);
    // extract exponents
    __m256 exp = _mm256_cvtepi32_ps(
        avx2_mm256_srli_epi32((__m256i)xx, 23)
    );
    // clear exponent to 0 (+127)
    xx = _mm256_and_ps(xx, mant_mask);
    xx = _mm256_or_ps (xx, exp_mask);
    // Horner's rule to evaluate the polynomial
    __m256 ret = e;
    ret = _mm256_add_ps(d, _mm256_mul_ps(ret, xx));
    ret = _mm256_add_ps(c, _mm256_mul_ps(ret, xx));
    ret = _mm256_add_ps(b, _mm256_mul_ps(ret, xx));
    ret = _mm256_add_ps(a, _mm256_mul_ps(ret, xx));
    ret = _mm256_add_ps(f, _mm256_mul_ps(ret, xx));
    // add in exponent contribution
    ret = _mm256_add_ps(ret, _mm256_mul_ps(exp, mln2));
    return _mm256_or_ps(ret, invalid);
}
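(avx2_mm256_srli_epi32 is a helper from the question's codebase that isn't shown. A plausible AVX-only emulation, assumed here purely for reference and consistent with the pair of vpsrld xmm instructions in the disassembly below, might look like this:)

// Hypothetical sketch of the missing helper (not the question's actual code):
// emulate a 256-bit logical right shift of 32-bit lanes on AVX-only hardware
// by shifting the two 128-bit halves separately. Requires <immintrin.h>.
static inline __m256i avx2_mm256_srli_epi32(__m256i v, int count) {
    __m128i lo = _mm256_castsi256_si128(v);        // lower 128 bits
    __m128i hi = _mm256_extractf128_si256(v, 1);   // upper 128 bits
    lo = _mm_srli_epi32(lo, count);                // one vpsrld per half
    hi = _mm_srli_epi32(hi, count);
    return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
}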
It generates this assembly:
0x00000000006f0f10 <+0>: lea 0x8(%rsp),%r10
0x00000000006f0f15 <+5>: and $0xffffffffffffffe0,%rsp
0x00000000006f0f19 <+9>: pushq -0x8(%r10)
0x00000000006f0f1d <+13>: push %rbp
0x00000000006f0f1e <+14>: mov %rsp,%rbp
0x00000000006f0f21 <+17>: push %r10
0x00000000006f0f23 <+19>: sub $0x68,%rsp
0x00000000006f0f27 <+23>: vmovaps %ymm0,-0x70(%rbp)
0x00000000006f0f2c <+28>: vxorps %xmm0,%xmm0,%xmm0
0x00000000006f0f30 <+32>: vmovaps -0x70(%rbp),%ymm4
0x00000000006f0f35 <+37>: vmovdqa -0x70(%rbp),%xmm1
0x00000000006f0f3a <+42>: vandps 0x3dade(%rip),%ymm4,%ymm8
0x00000000006f0f42 <+50>: vcmple_oqps %ymm0,%ymm4,%ymm3
0x00000000006f0f47 <+55>: vmovdqa -0x60(%rbp),%xmm5
0x00000000006f0f4c <+60>: vpsrld $0x17,%xmm1,%xmm2
0x00000000006f0f51 <+65>: mov %fs:0x28,%rax
0x00000000006f0f5a <+74>: mov %rax,-0x18(%rbp)
0x00000000006f0f5e <+78>: xor %eax,%eax
0x00000000006f0f60 <+80>: vpsrld $0x17,%xmm5,%xmm6
0x00000000006f0f65 <+85>: vmovaps %xmm2,-0x50(%rbp)
0x00000000006f0f6a <+90>: mov -0x18(%rbp),%rax
0x00000000006f0f6e <+94>: xor %fs:0x28,%rax
0x00000000006f0f77 <+103>: vmovaps %xmm6,-0x40(%rbp)
0x00000000006f0f7c <+108>: vcvtdq2ps -0x50(%rbp),%ymm7
0x00000000006f0f81 <+113>: vorps 0x3dab7(%rip),%ymm8,%ymm9
0x00000000006f0f89 <+121>: vmulps 0x3db6f(%rip),%ymm7,%ymm1
0x00000000006f0f91 <+129>: vmulps 0x3dac7(%rip),%ymm9,%ymm10
0x00000000006f0f99 <+137>: vaddps 0x3db7f(%rip),%ymm1,%ymm5
0x00000000006f0fa1 <+145>: vaddps 0x3dad7(%rip),%ymm10,%ymm11
0x00000000006f0fa9 <+153>: vmulps %ymm9,%ymm11,%ymm12
0x00000000006f0fae <+158>: vaddps 0x3daea(%rip),%ymm12,%ymm13
0x00000000006f0fb6 <+166>: vmulps %ymm9,%ymm13,%ymm14
0x00000000006f0fbb <+171>: vaddps 0x3dafd(%rip),%ymm14,%ymm15
0x00000000006f0fc3 <+179>: vmulps %ymm9,%ymm15,%ymm0
0x00000000006f0fc8 <+184>: vaddps 0x3db10(%rip),%ymm0,%ymm4
0x00000000006f0fd0 <+192>: vmulps %ymm9,%ymm4,%ymm2
0x00000000006f0fd5 <+197>: vaddps %ymm5,%ymm2,%ymm6
0x00000000006f0fd9 <+201>: vorps %ymm3,%ymm6,%ymm0
0x00000000006f0fdd <+205>: jne 0x6f0fea <_mm256_logf_app(float __vector(8))+218>
0x00000000006f0fdf <+207>: add $0x68,%rsp
0x00000000006f0fe3 <+211>: pop %rax
0x00000000006f0fe4 <+212>: pop %rbp
0x00000000006f0fe5 <+213>: lea -0x8(%rax),%rsp
0x00000000006f0fe9 <+217>: retq
The puzzling thing is this piece:
0x00000000006f0f51 <+65>: mov %fs:0x28,%rax
0x00000000006f0f5a <+74>: mov %rax,-0x18(%rbp)
0x00000000006f0f5e <+78>: xor %eax,%eax
0x00000000006f0f60 <+80>: vpsrld $0x17,%xmm5,%xmm6
0x00000000006f0f65 <+85>: vmovaps %xmm2,-0x50(%rbp)
0x00000000006f0f6a <+90>: mov -0x18(%rbp),%rax
0x00000000006f0f6e <+94>: xor %fs:0x28,%rax
This looks to me like an access of thread-local storage (the fs segment register; I'm on x86-64 Linux). It pulls a value from TLS, spills it to the stack, then XORs it against the same TLS location again. Why is it doing this? ICC doesn't. This is GCC 7.3,
compiled with:
> g++ -std=c++11 -O3 -g3 -Wall -Wextra -fno-omit-frame-pointer -fopenmp -pthread -Idep/opt/include -Iinc/ -ffast-math -fPIC -mavx -funroll-loops -c lib/simd_avx.cc -o lib/simd_avx.o
Tags: gcc, assembly, simd
Asked Jan 2 at 23:56 by Sean McAllister (edited Jan 3 at 0:01)
That's the stack protector. Add -fno-stack-protector. It's not obvious to me why gcc decided to add it here, as I can't spot anything that normally triggers it. – Jester, Jan 3 at 0:04
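(For reference, the %fs:0x28 load/spill/XOR sequence is exactly the stack-protector canary pattern: on x86-64 glibc the stack guard lives at offset 0x28 in the thread control block, which is why it is read through %fs. A minimal sketch of a function that obviously triggers the instrumentation, not taken from the question's code:)

// Hypothetical example: with -fstack-protector-strong, any function with a
// local array gets the canary checks.
void fill(char *dst);   // assumed external function

int canary_demo(void) {
    char buf[16];        // local array => the protector instruments this function
    fill(buf);
    // Prologue: mov %fs:0x28,%rax and the canary is copied to a stack slot.
    // Epilogue: the saved copy is XORed against %fs:0x28 again; on a mismatch
    // the function jumps to __stack_chk_fail instead of returning.
    return buf[0];
}

Compiling that with g++ -O2 -fstack-protector-strong -S and searching the output for stack_chk shows the same prologue/epilogue pattern as in the disassembly above.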
Ah, excellent, thank you. The Intel version I'm fighting with is older, so it doesn't have stack protection. – Sean McAllister, Jan 3 at 0:05
The only thing inside the protected region is the vmovaps %xmm2,-0x50(%rbp), which is a store of a known size to an automatic variable allocated by the compiler ... should be safe. There is nothing in the function to warrant a protector unless you have -fstack-protector-all or -fstack-protector-strong enabled by default for some reason. – Jester, Jan 3 at 0:07
Yeah, I don't have it on at all as far as I know... – Sean McAllister, Jan 3 at 0:10
It looks like you compiled with optimization disabled; spilling xx to the stack on function entry is horrible. If you're looking at the asm for code with intrinsics, definitely use -O1 at least, preferably -O3. That might even optimize away the stack-protector stuff, because it can spend enough time optimizing to prove that this function doesn't need the stack protector code. And BTW, stack-protector isn't just a newer/older gcc thing. Your distro chose to configure the default setting to -fstack-protector-strong, but off is another possible setting for modern GCC. – Peter Cordes, Jan 3 at 0:34
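(One way to check whether a given toolchain enables the protector by default, sketched as an assumption about the setup rather than taken from the thread: feed it a trivially "protectable" function with no stack-protector flags on the command line and look for the check in the generated assembly.)

> echo 'void f(char*); void g(){ char b[16]; f(b); }' > ssp_probe.cc
> g++ -O2 -S -o - ssp_probe.cc | grep stack_chk

Any output (a reference to __stack_chk_fail) means the compiler was built with the protector on by default; the configure line printed by g++ -v will often include something like --enable-default-ssp on such builds. Either way, adding -fno-stack-protector to the question's command line overrides it.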