GCC accessing thread local storage unnecessarily?
I'm running down a numeric difference between GCC and ICC, and I've noticed something puzzling. When compiling this function (which computes an approximate natural logarithm):
// convenience macros for defining constant vectors
#define MM256_ICONST(val) (__m256)_mm256_set1_epi32(val)
#define MM256_FCONST(val) _mm256_set1_ps(val)

__m256 _mm256_logf_app(__m256 xx) {
    // constants
    const __m256 mant_mask = MM256_ICONST(0x007FFFFF); // mantissa mask
    const __m256 exp_mask  = MM256_ICONST(0x3F800000); // 127 in exponent
    const __m256 mln2      = MM256_FCONST(M_LN2);      // log(2)
    // polynomial coefficients
    const __m256 a = MM256_FCONST(+3.529304993f);
    const __m256 b = MM256_FCONST(-2.461222105f);
    const __m256 c = MM256_FCONST(+1.130626167f);
    const __m256 d = MM256_FCONST(-0.288739945f);
    const __m256 e = MM256_FCONST(+3.110401639e-2f);
    const __m256 f = MM256_FCONST(-89.970756366f);
    // mask out anything <= 0
    __m256 invalid = _mm256_cmp_ps(xx, _mm256_setzero_ps(), _CMP_LE_OQ);
    // extract exponents
    __m256 exp = _mm256_cvtepi32_ps(
        avx2_mm256_srli_epi32((__m256i)xx, 23)
    );
    // clear exponent to 0 (+127)
    xx = _mm256_and_ps(xx, mant_mask);
    xx = _mm256_or_ps (xx, exp_mask);
    // Horner's rule to evaluate the polynomial
    __m256 ret = e;
    ret = _mm256_add_ps(d, _mm256_mul_ps(ret, xx));
    ret = _mm256_add_ps(c, _mm256_mul_ps(ret, xx));
    ret = _mm256_add_ps(b, _mm256_mul_ps(ret, xx));
    ret = _mm256_add_ps(a, _mm256_mul_ps(ret, xx));
    ret = _mm256_add_ps(f, _mm256_mul_ps(ret, xx));
    // add in exponent contribution
    ret = _mm256_add_ps(ret, _mm256_mul_ps(exp, mln2));
    return _mm256_or_ps(ret, invalid);
}
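(avx2_mm256_srli_epi32 is a helper from the question's codebase that isn't shown. A plausible AVX-only emulation, assumed here purely for reference and consistent with the pair of vpsrld xmm instructions in the disassembly below, might look like this:)

// Hypothetical sketch of the missing helper (not the question's actual code):
// emulate a 256-bit logical right shift of 32-bit lanes on AVX-only hardware
// by shifting the two 128-bit halves separately. Requires <immintrin.h>.
static inline __m256i avx2_mm256_srli_epi32(__m256i v, int count) {
    __m128i lo = _mm256_castsi256_si128(v);        // lower 128 bits
    __m128i hi = _mm256_extractf128_si256(v, 1);   // upper 128 bits
    lo = _mm_srli_epi32(lo, count);                // one vpsrld per half
    hi = _mm_srli_epi32(hi, count);
    return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
}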
It generates this assembly:
0x00000000006f0f10 <+0>: lea 0x8(%rsp),%r10
0x00000000006f0f15 <+5>: and $0xffffffffffffffe0,%rsp
0x00000000006f0f19 <+9>: pushq -0x8(%r10)
0x00000000006f0f1d <+13>: push %rbp
0x00000000006f0f1e <+14>: mov %rsp,%rbp
0x00000000006f0f21 <+17>: push %r10
0x00000000006f0f23 <+19>: sub $0x68,%rsp
0x00000000006f0f27 <+23>: vmovaps %ymm0,-0x70(%rbp)
0x00000000006f0f2c <+28>: vxorps %xmm0,%xmm0,%xmm0
0x00000000006f0f30 <+32>: vmovaps -0x70(%rbp),%ymm4
0x00000000006f0f35 <+37>: vmovdqa -0x70(%rbp),%xmm1
0x00000000006f0f3a <+42>: vandps 0x3dade(%rip),%ymm4,%ymm8
0x00000000006f0f42 <+50>: vcmple_oqps %ymm0,%ymm4,%ymm3
0x00000000006f0f47 <+55>: vmovdqa -0x60(%rbp),%xmm5
0x00000000006f0f4c <+60>: vpsrld $0x17,%xmm1,%xmm2
0x00000000006f0f51 <+65>: mov %fs:0x28,%rax
0x00000000006f0f5a <+74>: mov %rax,-0x18(%rbp)
0x00000000006f0f5e <+78>: xor %eax,%eax
0x00000000006f0f60 <+80>: vpsrld $0x17,%xmm5,%xmm6
0x00000000006f0f65 <+85>: vmovaps %xmm2,-0x50(%rbp)
0x00000000006f0f6a <+90>: mov -0x18(%rbp),%rax
0x00000000006f0f6e <+94>: xor %fs:0x28,%rax
0x00000000006f0f77 <+103>: vmovaps %xmm6,-0x40(%rbp)
0x00000000006f0f7c <+108>: vcvtdq2ps -0x50(%rbp),%ymm7
0x00000000006f0f81 <+113>: vorps 0x3dab7(%rip),%ymm8,%ymm9
0x00000000006f0f89 <+121>: vmulps 0x3db6f(%rip),%ymm7,%ymm1
0x00000000006f0f91 <+129>: vmulps 0x3dac7(%rip),%ymm9,%ymm10
0x00000000006f0f99 <+137>: vaddps 0x3db7f(%rip),%ymm1,%ymm5
0x00000000006f0fa1 <+145>: vaddps 0x3dad7(%rip),%ymm10,%ymm11
0x00000000006f0fa9 <+153>: vmulps %ymm9,%ymm11,%ymm12
0x00000000006f0fae <+158>: vaddps 0x3daea(%rip),%ymm12,%ymm13
0x00000000006f0fb6 <+166>: vmulps %ymm9,%ymm13,%ymm14
0x00000000006f0fbb <+171>: vaddps 0x3dafd(%rip),%ymm14,%ymm15
0x00000000006f0fc3 <+179>: vmulps %ymm9,%ymm15,%ymm0
0x00000000006f0fc8 <+184>: vaddps 0x3db10(%rip),%ymm0,%ymm4
0x00000000006f0fd0 <+192>: vmulps %ymm9,%ymm4,%ymm2
0x00000000006f0fd5 <+197>: vaddps %ymm5,%ymm2,%ymm6
0x00000000006f0fd9 <+201>: vorps %ymm3,%ymm6,%ymm0
0x00000000006f0fdd <+205>: jne 0x6f0fea <_mm256_logf_app(float __vector(8))+218>
0x00000000006f0fdf <+207>: add $0x68,%rsp
0x00000000006f0fe3 <+211>: pop %rax
0x00000000006f0fe4 <+212>: pop %rbp
0x00000000006f0fe5 <+213>: lea -0x8(%rax),%rsp
0x00000000006f0fe9 <+217>: retq
The puzzling thing is this piece:
0x00000000006f0f51 <+65>: mov %fs:0x28,%rax
0x00000000006f0f5a <+74>: mov %rax,-0x18(%rbp)
0x00000000006f0f5e <+78>: xor %eax,%eax
0x00000000006f0f60 <+80>: vpsrld $0x17,%xmm5,%xmm6
0x00000000006f0f65 <+85>: vmovaps %xmm2,-0x50(%rbp)
0x00000000006f0f6a <+90>: mov -0x18(%rbp),%rax
0x00000000006f0f6e <+94>: xor %fs:0x28,%rax
This looks to me like an access of thread-local storage (the fs segment register; I'm on x86-64 Linux). It pulls a value from TLS, spills it to the stack, then XORs it against the same TLS location again. Why is it doing this? ICC doesn't. This is GCC 7.3,
compiled with:
> g++ -std=c++11 -O3 -g3 -Wall -Wextra -fno-omit-frame-pointer -fopenmp -pthread -Idep/opt/include -Iinc/ -ffast-math -fPIC -mavx -funroll-loops -c lib/simd_avx.cc -o lib/simd_avx.o
Tags: gcc, assembly, simd
Asked Jan 2 at 23:56 by Sean McAllister (edited Jan 3 at 0:01)
That's the stack protector. Add -fno-stack-protector. It's not obvious to me why gcc decided to add it here, as I can't spot anything that normally triggers it. – Jester, Jan 3 at 0:04
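(For reference, the %fs:0x28 load/spill/XOR sequence is exactly the stack-protector canary pattern: on x86-64 glibc the stack guard lives at offset 0x28 in the thread control block, which is why it is read through %fs. A minimal sketch of a function that obviously triggers the instrumentation, not taken from the question's code:)

// Hypothetical example: with -fstack-protector-strong, any function with a
// local array gets the canary checks.
void fill(char *dst);   // assumed external function

int canary_demo(void) {
    char buf[16];        // local array => the protector instruments this function
    fill(buf);
    // Prologue: mov %fs:0x28,%rax and the canary is copied to a stack slot.
    // Epilogue: the saved copy is XORed against %fs:0x28 again; on a mismatch
    // the function jumps to __stack_chk_fail instead of returning.
    return buf[0];
}

Compiling that with g++ -O2 -fstack-protector-strong -S and searching the output for stack_chk shows the same prologue/epilogue pattern as in the disassembly above.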
Ah, excellent, thank you. The Intel version I'm fighting with is older, so it doesn't have stack protection. – Sean McAllister, Jan 3 at 0:05
The only thing inside the protected region is the vmovaps %xmm2,-0x50(%rbp), which is a store of a known size to an automatic variable allocated by the compiler ... should be safe. There is nothing in the function to warrant a protector unless you have -fstack-protector-all or -fstack-protector-strong enabled by default for some reason. – Jester, Jan 3 at 0:07
Yeah, I don't have it on at all as far as I know... – Sean McAllister, Jan 3 at 0:10
It looks like you compiled with optimization disabled; spilling xx to the stack on function entry is horrible. If you're looking at the asm for code with intrinsics, definitely use -O1 at least, preferably -O3. That might even optimize away the stack-protector stuff, because it can spend enough time optimizing to prove that this function doesn't need the stack protector code. And BTW, stack-protector isn't just a newer/older gcc thing. Your distro chose to configure the default setting to -fstack-protector-strong, but off is another possible setting for modern GCC. – Peter Cordes, Jan 3 at 0:34
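(One way to check whether a given toolchain enables the protector by default, sketched as an assumption about the setup rather than taken from the thread: feed it a trivially "protectable" function with no stack-protector flags on the command line and look for the check in the generated assembly.)

> echo 'void f(char*); void g(){ char b[16]; f(b); }' > ssp_probe.cc
> g++ -O2 -S -o - ssp_probe.cc | grep stack_chk

Any output (a reference to __stack_chk_fail) means the compiler was built with the protector on by default; the configure line printed by g++ -v will often include something like --enable-default-ssp on such builds. Either way, adding -fno-stack-protector to the question's command line overrides it.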