Using % with SSE2?
Here's the code I'm trying to convert to SSE2:
double *pA = a;
double *pB = b[voiceIndex];
double *pC = c[voiceIndex];
double *left = audioLeft;
double *right = audioRight;
double phase = 0.0;
double bp0 = mNoteFrequency * mHostPitch;
for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex++) {
// some other code (that will use phase)
phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);
while (phase >= TWOPI) { phase -= TWOPI; }
}
Here's what I've achieved:
double *pA = a;
double *pB = b[voiceIndex];
double *pC = c[voiceIndex];
double *left = audioLeft;
double *right = audioRight;
double phase = 0.0;
double bp0 = mNoteFrequency * mHostPitch;
__m128d v_boundLower = _mm_set1_pd(0.0);
__m128d v_boundUpper = _mm_set1_pd(PI);
__m128d v_bp0 = _mm_set1_pd(bp0);
__m128d v_radiansPerSample = _mm_set1_pd(mRadiansPerSample);
__m128d v_phase = _mm_set1_pd(phase);
__m128d v_pB = _mm_load_pd(pB);
__m128d v_pC = _mm_load_pd(pC);
__m128d v_result = _mm_mul_pd(v_bp0, v_pB);
v_result = _mm_add_pd(v_result, v_pC);
v_result = _mm_mul_pd(v_result, v_radiansPerSample);
v_result = _mm_max_pd(v_result, v_boundLower);
v_result = _mm_min_pd(v_result, v_boundUpper);
for (int sampleIndex = 0; sampleIndex < roundintup8(blockSize); sampleIndex += 8, pB += 8, pC += 8) {
// some other code (that will use v_phase)
v_phase = _mm_add_pd(v_phase, v_result);
v_pB = _mm_load_pd(pB + 2);
v_pC = _mm_load_pd(pC + 2);
v_result = _mm_mul_pd(v_bp0, v_pB);
v_result = _mm_add_pd(v_result, v_pC);
v_result = _mm_mul_pd(v_result, v_radiansPerSample);
v_result = _mm_max_pd(v_result, v_boundLower);
v_result = _mm_min_pd(v_result, v_boundUpper);
v_phase = _mm_add_pd(v_phase, v_result);
v_pB = _mm_load_pd(pB + 4);
v_pC = _mm_load_pd(pC + 4);
v_result = _mm_mul_pd(v_bp0, v_pB);
v_result = _mm_add_pd(v_result, v_pC);
v_result = _mm_mul_pd(v_result, v_radiansPerSample);
v_result = _mm_max_pd(v_result, v_boundLower);
v_result = _mm_min_pd(v_result, v_boundUpper);
v_phase = _mm_add_pd(v_phase, v_result);
v_pB = _mm_load_pd(pB + 6);
v_pC = _mm_load_pd(pC + 6);
v_result = _mm_mul_pd(v_bp0, v_pB);
v_result = _mm_add_pd(v_result, v_pC);
v_result = _mm_mul_pd(v_result, v_radiansPerSample);
v_result = _mm_max_pd(v_result, v_boundLower);
v_result = _mm_min_pd(v_result, v_boundUpper);
v_phase = _mm_add_pd(v_phase, v_result);
v_pB = _mm_load_pd(pB + 8);
v_pC = _mm_load_pd(pC + 8);
v_result = _mm_mul_pd(v_bp0, v_pB);
v_result = _mm_add_pd(v_result, v_pC);
v_result = _mm_mul_pd(v_result, v_radiansPerSample);
v_result = _mm_max_pd(v_result, v_boundLower);
v_result = _mm_min_pd(v_result, v_boundUpper);
// ... fmod?
}
But I'm not really sure how to replace while (phase >= TWOPI) { phase -= TWOPI; }
(which is basically a classic fmod
in C++).
Any fancy intrinsics? Can't find any on this list.
Division + some sort of rocket bit-shifting?
c++ intrinsics sse2
|
show 3 more comments
Here's the code I'm trying to convert to SSE2:
double *pA = a;
double *pB = b[voiceIndex];
double *pC = c[voiceIndex];
double *left = audioLeft;
double *right = audioRight;
double phase = 0.0;
double bp0 = mNoteFrequency * mHostPitch;
for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex++) {
// some other code (that will use phase)
phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);
while (phase >= TWOPI) { phase -= TWOPI; }
}
Here's what I've achieved:
double *pA = a;
double *pB = b[voiceIndex];
double *pC = c[voiceIndex];
double *left = audioLeft;
double *right = audioRight;
double phase = 0.0;
double bp0 = mNoteFrequency * mHostPitch;
__m128d v_boundLower = _mm_set1_pd(0.0);
__m128d v_boundUpper = _mm_set1_pd(PI);
__m128d v_bp0 = _mm_set1_pd(bp0);
__m128d v_radiansPerSample = _mm_set1_pd(mRadiansPerSample);
__m128d v_phase = _mm_set1_pd(phase);
__m128d v_pB = _mm_load_pd(pB);
__m128d v_pC = _mm_load_pd(pC);
__m128d v_result = _mm_mul_pd(v_bp0, v_pB);
v_result = _mm_add_pd(v_result, v_pC);
v_result = _mm_mul_pd(v_result, v_radiansPerSample);
v_result = _mm_max_pd(v_result, v_boundLower);
v_result = _mm_min_pd(v_result, v_boundUpper);
for (int sampleIndex = 0; sampleIndex < roundintup8(blockSize); sampleIndex += 8, pB += 8, pC += 8) {
// some other code (that will use v_phase)
v_phase = _mm_add_pd(v_phase, v_result);
v_pB = _mm_load_pd(pB + 2);
v_pC = _mm_load_pd(pC + 2);
v_result = _mm_mul_pd(v_bp0, v_pB);
v_result = _mm_add_pd(v_result, v_pC);
v_result = _mm_mul_pd(v_result, v_radiansPerSample);
v_result = _mm_max_pd(v_result, v_boundLower);
v_result = _mm_min_pd(v_result, v_boundUpper);
v_phase = _mm_add_pd(v_phase, v_result);
v_pB = _mm_load_pd(pB + 4);
v_pC = _mm_load_pd(pC + 4);
v_result = _mm_mul_pd(v_bp0, v_pB);
v_result = _mm_add_pd(v_result, v_pC);
v_result = _mm_mul_pd(v_result, v_radiansPerSample);
v_result = _mm_max_pd(v_result, v_boundLower);
v_result = _mm_min_pd(v_result, v_boundUpper);
v_phase = _mm_add_pd(v_phase, v_result);
v_pB = _mm_load_pd(pB + 6);
v_pC = _mm_load_pd(pC + 6);
v_result = _mm_mul_pd(v_bp0, v_pB);
v_result = _mm_add_pd(v_result, v_pC);
v_result = _mm_mul_pd(v_result, v_radiansPerSample);
v_result = _mm_max_pd(v_result, v_boundLower);
v_result = _mm_min_pd(v_result, v_boundUpper);
v_phase = _mm_add_pd(v_phase, v_result);
v_pB = _mm_load_pd(pB + 8);
v_pC = _mm_load_pd(pC + 8);
v_result = _mm_mul_pd(v_bp0, v_pB);
v_result = _mm_add_pd(v_result, v_pC);
v_result = _mm_mul_pd(v_result, v_radiansPerSample);
v_result = _mm_max_pd(v_result, v_boundLower);
v_result = _mm_min_pd(v_result, v_boundUpper);
// ... fmod?
}
But I'm not really sure how to replace while (phase >= TWOPI) { phase -= TWOPI; }
(which is basically a classic fmod
in C++).
Any fancy intrinsics? Can't find any on this list.
Division + some sort of rocket bit-shifting?
c++ intrinsics sse2
1
How often do you expect to apply this correction though, and how many two pis do you need to subtract when it does happen? Maybe compare and subtract is still the way to go. (Not that I know SSE2.)
– Rup
Jan 2 at 10:18
@Rup: can't know :) It depends howphase
will grown up...
– markzzz
Jan 2 at 10:29
1
Are you even sure auto-vectorization does not do it for you? Did you check generated code when you enable SSE2 on your original source (eg, with-msse2
on g++)?
– spectras
Jan 2 at 11:15
3
Thatwhile
could be changed into anif
becausephase
will never be more than 3pi at that point.
– interjay
Jan 2 at 11:18
3
@markzzz I see. You can still use godbolt to check what gcc does and understand (and perhaps improve) upon it.
– spectras
Jan 2 at 12:05
|
show 3 more comments
Here's the code I'm trying to convert to SSE2:
double *pA = a;
double *pB = b[voiceIndex];
double *pC = c[voiceIndex];
double *left = audioLeft;
double *right = audioRight;
double phase = 0.0;
double bp0 = mNoteFrequency * mHostPitch;
for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex++) {
// some other code (that will use phase)
phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);
while (phase >= TWOPI) { phase -= TWOPI; }
}
Here's what I've achieved:
double *pA = a;
double *pB = b[voiceIndex];
double *pC = c[voiceIndex];
double *left = audioLeft;
double *right = audioRight;
double phase = 0.0;
double bp0 = mNoteFrequency * mHostPitch;
__m128d v_boundLower = _mm_set1_pd(0.0);
__m128d v_boundUpper = _mm_set1_pd(PI);
__m128d v_bp0 = _mm_set1_pd(bp0);
__m128d v_radiansPerSample = _mm_set1_pd(mRadiansPerSample);
__m128d v_phase = _mm_set1_pd(phase);
__m128d v_pB = _mm_load_pd(pB);
__m128d v_pC = _mm_load_pd(pC);
__m128d v_result = _mm_mul_pd(v_bp0, v_pB);
v_result = _mm_add_pd(v_result, v_pC);
v_result = _mm_mul_pd(v_result, v_radiansPerSample);
v_result = _mm_max_pd(v_result, v_boundLower);
v_result = _mm_min_pd(v_result, v_boundUpper);
for (int sampleIndex = 0; sampleIndex < roundintup8(blockSize); sampleIndex += 8, pB += 8, pC += 8) {
// some other code (that will use v_phase)
v_phase = _mm_add_pd(v_phase, v_result);
v_pB = _mm_load_pd(pB + 2);
v_pC = _mm_load_pd(pC + 2);
v_result = _mm_mul_pd(v_bp0, v_pB);
v_result = _mm_add_pd(v_result, v_pC);
v_result = _mm_mul_pd(v_result, v_radiansPerSample);
v_result = _mm_max_pd(v_result, v_boundLower);
v_result = _mm_min_pd(v_result, v_boundUpper);
v_phase = _mm_add_pd(v_phase, v_result);
v_pB = _mm_load_pd(pB + 4);
v_pC = _mm_load_pd(pC + 4);
v_result = _mm_mul_pd(v_bp0, v_pB);
v_result = _mm_add_pd(v_result, v_pC);
v_result = _mm_mul_pd(v_result, v_radiansPerSample);
v_result = _mm_max_pd(v_result, v_boundLower);
v_result = _mm_min_pd(v_result, v_boundUpper);
v_phase = _mm_add_pd(v_phase, v_result);
v_pB = _mm_load_pd(pB + 6);
v_pC = _mm_load_pd(pC + 6);
v_result = _mm_mul_pd(v_bp0, v_pB);
v_result = _mm_add_pd(v_result, v_pC);
v_result = _mm_mul_pd(v_result, v_radiansPerSample);
v_result = _mm_max_pd(v_result, v_boundLower);
v_result = _mm_min_pd(v_result, v_boundUpper);
v_phase = _mm_add_pd(v_phase, v_result);
v_pB = _mm_load_pd(pB + 8);
v_pC = _mm_load_pd(pC + 8);
v_result = _mm_mul_pd(v_bp0, v_pB);
v_result = _mm_add_pd(v_result, v_pC);
v_result = _mm_mul_pd(v_result, v_radiansPerSample);
v_result = _mm_max_pd(v_result, v_boundLower);
v_result = _mm_min_pd(v_result, v_boundUpper);
// ... fmod?
}
But I'm not really sure how to replace while (phase >= TWOPI) { phase -= TWOPI; }
(which is basically a classic fmod
in C++).
Any fancy intrinsics? Can't find any on this list.
Division + some sort of rocket bit-shifting?
c++ intrinsics sse2
Here's the code I'm trying to convert to SSE2:
double *pA = a;
double *pB = b[voiceIndex];
double *pC = c[voiceIndex];
double *left = audioLeft;
double *right = audioRight;
double phase = 0.0;
double bp0 = mNoteFrequency * mHostPitch;
for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex++) {
// some other code (that will use phase)
phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);
while (phase >= TWOPI) { phase -= TWOPI; }
}
Here's what I've achieved:
double *pA = a;
double *pB = b[voiceIndex];
double *pC = c[voiceIndex];
double *left = audioLeft;
double *right = audioRight;
double phase = 0.0;
double bp0 = mNoteFrequency * mHostPitch;
__m128d v_boundLower = _mm_set1_pd(0.0);
__m128d v_boundUpper = _mm_set1_pd(PI);
__m128d v_bp0 = _mm_set1_pd(bp0);
__m128d v_radiansPerSample = _mm_set1_pd(mRadiansPerSample);
__m128d v_phase = _mm_set1_pd(phase);
__m128d v_pB = _mm_load_pd(pB);
__m128d v_pC = _mm_load_pd(pC);
__m128d v_result = _mm_mul_pd(v_bp0, v_pB);
v_result = _mm_add_pd(v_result, v_pC);
v_result = _mm_mul_pd(v_result, v_radiansPerSample);
v_result = _mm_max_pd(v_result, v_boundLower);
v_result = _mm_min_pd(v_result, v_boundUpper);
for (int sampleIndex = 0; sampleIndex < roundintup8(blockSize); sampleIndex += 8, pB += 8, pC += 8) {
// some other code (that will use v_phase)
v_phase = _mm_add_pd(v_phase, v_result);
v_pB = _mm_load_pd(pB + 2);
v_pC = _mm_load_pd(pC + 2);
v_result = _mm_mul_pd(v_bp0, v_pB);
v_result = _mm_add_pd(v_result, v_pC);
v_result = _mm_mul_pd(v_result, v_radiansPerSample);
v_result = _mm_max_pd(v_result, v_boundLower);
v_result = _mm_min_pd(v_result, v_boundUpper);
v_phase = _mm_add_pd(v_phase, v_result);
v_pB = _mm_load_pd(pB + 4);
v_pC = _mm_load_pd(pC + 4);
v_result = _mm_mul_pd(v_bp0, v_pB);
v_result = _mm_add_pd(v_result, v_pC);
v_result = _mm_mul_pd(v_result, v_radiansPerSample);
v_result = _mm_max_pd(v_result, v_boundLower);
v_result = _mm_min_pd(v_result, v_boundUpper);
v_phase = _mm_add_pd(v_phase, v_result);
v_pB = _mm_load_pd(pB + 6);
v_pC = _mm_load_pd(pC + 6);
v_result = _mm_mul_pd(v_bp0, v_pB);
v_result = _mm_add_pd(v_result, v_pC);
v_result = _mm_mul_pd(v_result, v_radiansPerSample);
v_result = _mm_max_pd(v_result, v_boundLower);
v_result = _mm_min_pd(v_result, v_boundUpper);
v_phase = _mm_add_pd(v_phase, v_result);
v_pB = _mm_load_pd(pB + 8);
v_pC = _mm_load_pd(pC + 8);
v_result = _mm_mul_pd(v_bp0, v_pB);
v_result = _mm_add_pd(v_result, v_pC);
v_result = _mm_mul_pd(v_result, v_radiansPerSample);
v_result = _mm_max_pd(v_result, v_boundLower);
v_result = _mm_min_pd(v_result, v_boundUpper);
// ... fmod?
}
But I'm not really sure how to replace while (phase >= TWOPI) { phase -= TWOPI; }
(which is basically a classic fmod
in C++).
Any fancy intrinsics? Can't find any on this list.
Division + some sort of rocket bit-shifting?
c++ intrinsics sse2
c++ intrinsics sse2
edited Jan 2 at 11:11
markzzz
asked Jan 2 at 10:11
markzzzmarkzzz
19.2k94234408
19.2k94234408
1
How often do you expect to apply this correction though, and how many two pis do you need to subtract when it does happen? Maybe compare and subtract is still the way to go. (Not that I know SSE2.)
– Rup
Jan 2 at 10:18
@Rup: can't know :) It depends howphase
will grown up...
– markzzz
Jan 2 at 10:29
1
Are you even sure auto-vectorization does not do it for you? Did you check generated code when you enable SSE2 on your original source (eg, with-msse2
on g++)?
– spectras
Jan 2 at 11:15
3
Thatwhile
could be changed into anif
becausephase
will never be more than 3pi at that point.
– interjay
Jan 2 at 11:18
3
@markzzz I see. You can still use godbolt to check what gcc does and understand (and perhaps improve) upon it.
– spectras
Jan 2 at 12:05
|
show 3 more comments
1
How often do you expect to apply this correction though, and how many two pis do you need to subtract when it does happen? Maybe compare and subtract is still the way to go. (Not that I know SSE2.)
– Rup
Jan 2 at 10:18
@Rup: can't know :) It depends howphase
will grown up...
– markzzz
Jan 2 at 10:29
1
Are you even sure auto-vectorization does not do it for you? Did you check generated code when you enable SSE2 on your original source (eg, with-msse2
on g++)?
– spectras
Jan 2 at 11:15
3
Thatwhile
could be changed into anif
becausephase
will never be more than 3pi at that point.
– interjay
Jan 2 at 11:18
3
@markzzz I see. You can still use godbolt to check what gcc does and understand (and perhaps improve) upon it.
– spectras
Jan 2 at 12:05
1
1
How often do you expect to apply this correction though, and how many two pis do you need to subtract when it does happen? Maybe compare and subtract is still the way to go. (Not that I know SSE2.)
– Rup
Jan 2 at 10:18
How often do you expect to apply this correction though, and how many two pis do you need to subtract when it does happen? Maybe compare and subtract is still the way to go. (Not that I know SSE2.)
– Rup
Jan 2 at 10:18
@Rup: can't know :) It depends how
phase
will grown up...– markzzz
Jan 2 at 10:29
@Rup: can't know :) It depends how
phase
will grown up...– markzzz
Jan 2 at 10:29
1
1
Are you even sure auto-vectorization does not do it for you? Did you check generated code when you enable SSE2 on your original source (eg, with
-msse2
on g++)?– spectras
Jan 2 at 11:15
Are you even sure auto-vectorization does not do it for you? Did you check generated code when you enable SSE2 on your original source (eg, with
-msse2
on g++)?– spectras
Jan 2 at 11:15
3
3
That
while
could be changed into an if
because phase
will never be more than 3pi at that point.– interjay
Jan 2 at 11:18
That
while
could be changed into an if
because phase
will never be more than 3pi at that point.– interjay
Jan 2 at 11:18
3
3
@markzzz I see. You can still use godbolt to check what gcc does and understand (and perhaps improve) upon it.
– spectras
Jan 2 at 12:05
@markzzz I see. You can still use godbolt to check what gcc does and understand (and perhaps improve) upon it.
– spectras
Jan 2 at 12:05
|
show 3 more comments
1 Answer
1
active
oldest
votes
As comments are saying, it looks like in this you can make it just a masked subtract with a compare + andpd
. This works as long as you can never be more than one subtract away from getting back into the desired range.
Like
const __m128d v2pi = _mm_set1_pd(TWOPI);
__m128d needs_range_reduction = _mm_cmpge_pd(vphase, v2pi);
__m128d offset = _mm_and_pd(needs_range_reduction, v2pi); // 0.0 or 2*Pi
vphase = _mm_sub_pd(vphase, offset);
To implement an actual (slow) fmod
without worrying too much about the last few bits of the significand, you'd do integer_quotient = floor(x/y)
(or maybe rint(x/y)
or ceil
), then x - y * integer_quotient
. floor
/ rint
/ ceil
are cheap with SSE4.1 _mm_round_pd
or _mm_floor_pd()
. That will give you the remainder, which can be negative just like with integer division.
I'm sure there are numerical techniques that better avoid rounding error before the catastrophic cancellation from subtracting two nearby numbers. If you care about precision, go check. (Using double
vectors when you don't care much about precision is kind of silly; might as well use float
and get twice as much work done per vector). If the input is a lot larger than the modulus, there's an unavoidable loss of precision, and minimizing rounding error in the temporary is probably very important. But otherwise precision will only be a problem unless you care about relative error in results very close to zero when x
is almost an exact multiple of y
. (Near-zero result, the only the bottom few bits of the significand are left for precision.)
Without SSE4.1, there are tricks like adding then subtracting a large enough number. Converting to integer and back is even worse for pd
, because the packed-conversion instruction decodes to some shuffle uops as well. Not to mention that a 32-bit integer doesn't cover the full range of double
, but you're screwed for range-reduction precision if your input was that huge.
If you have FMA, you can avoid rounding error in the y * integer_quotient
part of the multiply and sub. _mm_fmsub_pd
.
my whole code above I think its totally wrong.phase
is comulative for each step, instead I start with both value of__m128d v_phase
with_mm_set1_pd(phase);
(both equal, which is wrong).
– markzzz
Jan 2 at 14:55
@markzzz: sounds a lot like your previous questions, where you want to run a vector of sequential counters. start withphase
andphase+step
, and increment byset1(step*2)
each time. If you don't have (or can factor out) a serial dependency between two vector elements, you're all set. Of course, that makes the max increment 2*pi, I think, so you might need to set a tighter bound or have an alternate loop that repeats the conditional subtract in case it needs to be done multiple times. (e.g. if you unroll even more so you can add up to4*pi
)
– Peter Cordes
Jan 2 at 15:00
sure! The problem here is thatstep
its not fixed, since it can vary on each iteration :O
– markzzz
Jan 2 at 15:02
I mean: it doesn't seems I have any serial dependency...
– markzzz
Jan 2 at 15:04
@markzzz: ah right, I just looked at your code again. It's not exactly a serial dependency, but stepping by 2 depends on both intervening elements. You could calculate redundantly with overlapping unaligned loads, I think, and usepC[i+0]+pC[i+1]
in one element andpC[i+1] +
pC[i+2]` in another. Same forpB
. (Your formula,(mRadiansPerSample * bp0) * pB[i] + mRadiansPerSample * pC[i]
is linear, so I think adding the pC and pB inputs before multiplying will give the same result.)
– Peter Cordes
Jan 2 at 15:19
|
show 3 more comments
Your Answer
StackExchange.ifUsing("editor", function () {
StackExchange.using("externalEditor", function () {
StackExchange.using("snippets", function () {
StackExchange.snippets.init();
});
});
}, "code-snippets");
StackExchange.ready(function() {
var channelOptions = {
tags: "".split(" "),
id: "1"
};
initTagRenderer("".split(" "), "".split(" "), channelOptions);
StackExchange.using("externalEditor", function() {
// Have to fire editor after snippets, if snippets enabled
if (StackExchange.settings.snippets.snippetsEnabled) {
StackExchange.using("snippets", function() {
createEditor();
});
}
else {
createEditor();
}
});
function createEditor() {
StackExchange.prepareEditor({
heartbeatType: 'answer',
autoActivateHeartbeat: false,
convertImagesToLinks: true,
noModals: true,
showLowRepImageUploadWarning: true,
reputationToPostImages: 10,
bindNavPrevention: true,
postfix: "",
imageUploader: {
brandingHtml: "Powered by u003ca class="icon-imgur-white" href="https://imgur.com/"u003eu003c/au003e",
contentPolicyHtml: "User contributions licensed under u003ca href="https://creativecommons.org/licenses/by-sa/3.0/"u003ecc by-sa 3.0 with attribution requiredu003c/au003e u003ca href="https://stackoverflow.com/legal/content-policy"u003e(content policy)u003c/au003e",
allowUrls: true
},
onDemand: true,
discardSelector: ".discard-answer"
,immediatelyShowMarkdownHelp:true
});
}
});
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f54004452%2fusing-with-sse2%23new-answer', 'question_page');
}
);
Post as a guest
Required, but never shown
1 Answer
1
active
oldest
votes
1 Answer
1
active
oldest
votes
active
oldest
votes
active
oldest
votes
As comments are saying, it looks like in this you can make it just a masked subtract with a compare + andpd
. This works as long as you can never be more than one subtract away from getting back into the desired range.
Like
const __m128d v2pi = _mm_set1_pd(TWOPI);
__m128d needs_range_reduction = _mm_cmpge_pd(vphase, v2pi);
__m128d offset = _mm_and_pd(needs_range_reduction, v2pi); // 0.0 or 2*Pi
vphase = _mm_sub_pd(vphase, offset);
To implement an actual (slow) fmod
without worrying too much about the last few bits of the significand, you'd do integer_quotient = floor(x/y)
(or maybe rint(x/y)
or ceil
), then x - y * integer_quotient
. floor
/ rint
/ ceil
are cheap with SSE4.1 _mm_round_pd
or _mm_floor_pd()
. That will give you the remainder, which can be negative just like with integer division.
I'm sure there are numerical techniques that better avoid rounding error before the catastrophic cancellation from subtracting two nearby numbers. If you care about precision, go check. (Using double
vectors when you don't care much about precision is kind of silly; might as well use float
and get twice as much work done per vector). If the input is a lot larger than the modulus, there's an unavoidable loss of precision, and minimizing rounding error in the temporary is probably very important. But otherwise precision will only be a problem unless you care about relative error in results very close to zero when x
is almost an exact multiple of y
. (Near-zero result, the only the bottom few bits of the significand are left for precision.)
Without SSE4.1, there are tricks like adding then subtracting a large enough number. Converting to integer and back is even worse for pd
, because the packed-conversion instruction decodes to some shuffle uops as well. Not to mention that a 32-bit integer doesn't cover the full range of double
, but you're screwed for range-reduction precision if your input was that huge.
If you have FMA, you can avoid rounding error in the y * integer_quotient
part of the multiply and sub. _mm_fmsub_pd
.
my whole code above I think its totally wrong.phase
is comulative for each step, instead I start with both value of__m128d v_phase
with_mm_set1_pd(phase);
(both equal, which is wrong).
– markzzz
Jan 2 at 14:55
@markzzz: sounds a lot like your previous questions, where you want to run a vector of sequential counters. start withphase
andphase+step
, and increment byset1(step*2)
each time. If you don't have (or can factor out) a serial dependency between two vector elements, you're all set. Of course, that makes the max increment 2*pi, I think, so you might need to set a tighter bound or have an alternate loop that repeats the conditional subtract in case it needs to be done multiple times. (e.g. if you unroll even more so you can add up to4*pi
)
– Peter Cordes
Jan 2 at 15:00
sure! The problem here is thatstep
its not fixed, since it can vary on each iteration :O
– markzzz
Jan 2 at 15:02
I mean: it doesn't seems I have any serial dependency...
– markzzz
Jan 2 at 15:04
@markzzz: ah right, I just looked at your code again. It's not exactly a serial dependency, but stepping by 2 depends on both intervening elements. You could calculate redundantly with overlapping unaligned loads, I think, and usepC[i+0]+pC[i+1]
in one element andpC[i+1] +
pC[i+2]` in another. Same forpB
. (Your formula,(mRadiansPerSample * bp0) * pB[i] + mRadiansPerSample * pC[i]
is linear, so I think adding the pC and pB inputs before multiplying will give the same result.)
– Peter Cordes
Jan 2 at 15:19
|
show 3 more comments
As comments are saying, it looks like in this you can make it just a masked subtract with a compare + andpd
. This works as long as you can never be more than one subtract away from getting back into the desired range.
Like
const __m128d v2pi = _mm_set1_pd(TWOPI);
__m128d needs_range_reduction = _mm_cmpge_pd(vphase, v2pi);
__m128d offset = _mm_and_pd(needs_range_reduction, v2pi); // 0.0 or 2*Pi
vphase = _mm_sub_pd(vphase, offset);
To implement an actual (slow) fmod
without worrying too much about the last few bits of the significand, you'd do integer_quotient = floor(x/y)
(or maybe rint(x/y)
or ceil
), then x - y * integer_quotient
. floor
/ rint
/ ceil
are cheap with SSE4.1 _mm_round_pd
or _mm_floor_pd()
. That will give you the remainder, which can be negative just like with integer division.
I'm sure there are numerical techniques that better avoid rounding error before the catastrophic cancellation from subtracting two nearby numbers. If you care about precision, go check. (Using double
vectors when you don't care much about precision is kind of silly; might as well use float
and get twice as much work done per vector). If the input is a lot larger than the modulus, there's an unavoidable loss of precision, and minimizing rounding error in the temporary is probably very important. But otherwise precision will only be a problem unless you care about relative error in results very close to zero when x
is almost an exact multiple of y
. (Near-zero result, the only the bottom few bits of the significand are left for precision.)
Without SSE4.1, there are tricks like adding then subtracting a large enough number. Converting to integer and back is even worse for pd
, because the packed-conversion instruction decodes to some shuffle uops as well. Not to mention that a 32-bit integer doesn't cover the full range of double
, but you're screwed for range-reduction precision if your input was that huge.
If you have FMA, you can avoid rounding error in the y * integer_quotient
part of the multiply and sub. _mm_fmsub_pd
.
my whole code above I think its totally wrong.phase
is comulative for each step, instead I start with both value of__m128d v_phase
with_mm_set1_pd(phase);
(both equal, which is wrong).
– markzzz
Jan 2 at 14:55
@markzzz: sounds a lot like your previous questions, where you want to run a vector of sequential counters. start withphase
andphase+step
, and increment byset1(step*2)
each time. If you don't have (or can factor out) a serial dependency between two vector elements, you're all set. Of course, that makes the max increment 2*pi, I think, so you might need to set a tighter bound or have an alternate loop that repeats the conditional subtract in case it needs to be done multiple times. (e.g. if you unroll even more so you can add up to4*pi
)
– Peter Cordes
Jan 2 at 15:00
sure! The problem here is thatstep
its not fixed, since it can vary on each iteration :O
– markzzz
Jan 2 at 15:02
I mean: it doesn't seems I have any serial dependency...
– markzzz
Jan 2 at 15:04
@markzzz: ah right, I just looked at your code again. It's not exactly a serial dependency, but stepping by 2 depends on both intervening elements. You could calculate redundantly with overlapping unaligned loads, I think, and usepC[i+0]+pC[i+1]
in one element andpC[i+1] +
pC[i+2]` in another. Same forpB
. (Your formula,(mRadiansPerSample * bp0) * pB[i] + mRadiansPerSample * pC[i]
is linear, so I think adding the pC and pB inputs before multiplying will give the same result.)
– Peter Cordes
Jan 2 at 15:19
|
show 3 more comments
As comments are saying, it looks like in this you can make it just a masked subtract with a compare + andpd
. This works as long as you can never be more than one subtract away from getting back into the desired range.
Like
const __m128d v2pi = _mm_set1_pd(TWOPI);
__m128d needs_range_reduction = _mm_cmpge_pd(vphase, v2pi);
__m128d offset = _mm_and_pd(needs_range_reduction, v2pi); // 0.0 or 2*Pi
vphase = _mm_sub_pd(vphase, offset);
To implement an actual (slow) fmod
without worrying too much about the last few bits of the significand, you'd do integer_quotient = floor(x/y)
(or maybe rint(x/y)
or ceil
), then x - y * integer_quotient
. floor
/ rint
/ ceil
are cheap with SSE4.1 _mm_round_pd
or _mm_floor_pd()
. That will give you the remainder, which can be negative just like with integer division.
I'm sure there are numerical techniques that better avoid rounding error before the catastrophic cancellation from subtracting two nearby numbers. If you care about precision, go check. (Using double
vectors when you don't care much about precision is kind of silly; might as well use float
and get twice as much work done per vector). If the input is a lot larger than the modulus, there's an unavoidable loss of precision, and minimizing rounding error in the temporary is probably very important. But otherwise precision will only be a problem unless you care about relative error in results very close to zero when x
is almost an exact multiple of y
. (Near-zero result, the only the bottom few bits of the significand are left for precision.)
Without SSE4.1, there are tricks like adding then subtracting a large enough number. Converting to integer and back is even worse for pd
, because the packed-conversion instruction decodes to some shuffle uops as well. Not to mention that a 32-bit integer doesn't cover the full range of double
, but you're screwed for range-reduction precision if your input was that huge.
If you have FMA, you can avoid rounding error in the y * integer_quotient
part of the multiply and sub. _mm_fmsub_pd
.
As comments are saying, it looks like in this you can make it just a masked subtract with a compare + andpd
. This works as long as you can never be more than one subtract away from getting back into the desired range.
Like
const __m128d v2pi = _mm_set1_pd(TWOPI);
__m128d needs_range_reduction = _mm_cmpge_pd(vphase, v2pi);
__m128d offset = _mm_and_pd(needs_range_reduction, v2pi); // 0.0 or 2*Pi
vphase = _mm_sub_pd(vphase, offset);
To implement an actual (slow) fmod
without worrying too much about the last few bits of the significand, you'd do integer_quotient = floor(x/y)
(or maybe rint(x/y)
or ceil
), then x - y * integer_quotient
. floor
/ rint
/ ceil
are cheap with SSE4.1 _mm_round_pd
or _mm_floor_pd()
. That will give you the remainder, which can be negative just like with integer division.
I'm sure there are numerical techniques that better avoid rounding error before the catastrophic cancellation from subtracting two nearby numbers. If you care about precision, go check. (Using double
vectors when you don't care much about precision is kind of silly; might as well use float
and get twice as much work done per vector). If the input is a lot larger than the modulus, there's an unavoidable loss of precision, and minimizing rounding error in the temporary is probably very important. But otherwise precision will only be a problem unless you care about relative error in results very close to zero when x
is almost an exact multiple of y
. (Near-zero result, the only the bottom few bits of the significand are left for precision.)
Without SSE4.1, there are tricks like adding then subtracting a large enough number. Converting to integer and back is even worse for pd
, because the packed-conversion instruction decodes to some shuffle uops as well. Not to mention that a 32-bit integer doesn't cover the full range of double
, but you're screwed for range-reduction precision if your input was that huge.
If you have FMA, you can avoid rounding error in the y * integer_quotient
part of the multiply and sub. _mm_fmsub_pd
.
edited Jan 2 at 13:55
llllllllll
13.8k41742
13.8k41742
answered Jan 2 at 12:50
Peter CordesPeter Cordes
132k18201338
132k18201338
my whole code above I think its totally wrong.phase
is comulative for each step, instead I start with both value of__m128d v_phase
with_mm_set1_pd(phase);
(both equal, which is wrong).
– markzzz
Jan 2 at 14:55
@markzzz: sounds a lot like your previous questions, where you want to run a vector of sequential counters. start withphase
andphase+step
, and increment byset1(step*2)
each time. If you don't have (or can factor out) a serial dependency between two vector elements, you're all set. Of course, that makes the max increment 2*pi, I think, so you might need to set a tighter bound or have an alternate loop that repeats the conditional subtract in case it needs to be done multiple times. (e.g. if you unroll even more so you can add up to4*pi
)
– Peter Cordes
Jan 2 at 15:00
sure! The problem here is thatstep
its not fixed, since it can vary on each iteration :O
– markzzz
Jan 2 at 15:02
I mean: it doesn't seems I have any serial dependency...
– markzzz
Jan 2 at 15:04
@markzzz: ah right, I just looked at your code again. It's not exactly a serial dependency, but stepping by 2 depends on both intervening elements. You could calculate redundantly with overlapping unaligned loads, I think, and usepC[i+0]+pC[i+1]
in one element andpC[i+1] +
pC[i+2]` in another. Same forpB
. (Your formula,(mRadiansPerSample * bp0) * pB[i] + mRadiansPerSample * pC[i]
is linear, so I think adding the pC and pB inputs before multiplying will give the same result.)
– Peter Cordes
Jan 2 at 15:19
|
show 3 more comments
my whole code above I think its totally wrong.phase
is comulative for each step, instead I start with both value of__m128d v_phase
with_mm_set1_pd(phase);
(both equal, which is wrong).
– markzzz
Jan 2 at 14:55
@markzzz: sounds a lot like your previous questions, where you want to run a vector of sequential counters. start withphase
andphase+step
, and increment byset1(step*2)
each time. If you don't have (or can factor out) a serial dependency between two vector elements, you're all set. Of course, that makes the max increment 2*pi, I think, so you might need to set a tighter bound or have an alternate loop that repeats the conditional subtract in case it needs to be done multiple times. (e.g. if you unroll even more so you can add up to4*pi
)
– Peter Cordes
Jan 2 at 15:00
sure! The problem here is thatstep
its not fixed, since it can vary on each iteration :O
– markzzz
Jan 2 at 15:02
I mean: it doesn't seems I have any serial dependency...
– markzzz
Jan 2 at 15:04
@markzzz: ah right, I just looked at your code again. It's not exactly a serial dependency, but stepping by 2 depends on both intervening elements. You could calculate redundantly with overlapping unaligned loads, I think, and usepC[i+0]+pC[i+1]
in one element andpC[i+1] +
pC[i+2]` in another. Same forpB
. (Your formula,(mRadiansPerSample * bp0) * pB[i] + mRadiansPerSample * pC[i]
is linear, so I think adding the pC and pB inputs before multiplying will give the same result.)
– Peter Cordes
Jan 2 at 15:19
my whole code above I think its totally wrong.
phase
is comulative for each step, instead I start with both value of __m128d v_phase
with _mm_set1_pd(phase);
(both equal, which is wrong).– markzzz
Jan 2 at 14:55
my whole code above I think its totally wrong.
phase
is comulative for each step, instead I start with both value of __m128d v_phase
with _mm_set1_pd(phase);
(both equal, which is wrong).– markzzz
Jan 2 at 14:55
@markzzz: sounds a lot like your previous questions, where you want to run a vector of sequential counters. start with
phase
and phase+step
, and increment by set1(step*2)
each time. If you don't have (or can factor out) a serial dependency between two vector elements, you're all set. Of course, that makes the max increment 2*pi, I think, so you might need to set a tighter bound or have an alternate loop that repeats the conditional subtract in case it needs to be done multiple times. (e.g. if you unroll even more so you can add up to 4*pi
)– Peter Cordes
Jan 2 at 15:00
@markzzz: sounds a lot like your previous questions, where you want to run a vector of sequential counters. start with
phase
and phase+step
, and increment by set1(step*2)
each time. If you don't have (or can factor out) a serial dependency between two vector elements, you're all set. Of course, that makes the max increment 2*pi, I think, so you might need to set a tighter bound or have an alternate loop that repeats the conditional subtract in case it needs to be done multiple times. (e.g. if you unroll even more so you can add up to 4*pi
)– Peter Cordes
Jan 2 at 15:00
sure! The problem here is that
step
its not fixed, since it can vary on each iteration :O– markzzz
Jan 2 at 15:02
sure! The problem here is that
step
its not fixed, since it can vary on each iteration :O– markzzz
Jan 2 at 15:02
I mean: it doesn't seems I have any serial dependency...
– markzzz
Jan 2 at 15:04
I mean: it doesn't seems I have any serial dependency...
– markzzz
Jan 2 at 15:04
@markzzz: ah right, I just looked at your code again. It's not exactly a serial dependency, but stepping by 2 depends on both intervening elements. You could calculate redundantly with overlapping unaligned loads, I think, and use
pC[i+0]+pC[i+1]
in one element and pC[i+1] +
pC[i+2]` in another. Same for pB
. (Your formula, (mRadiansPerSample * bp0) * pB[i] + mRadiansPerSample * pC[i]
is linear, so I think adding the pC and pB inputs before multiplying will give the same result.)– Peter Cordes
Jan 2 at 15:19
@markzzz: ah right, I just looked at your code again. It's not exactly a serial dependency, but stepping by 2 depends on both intervening elements. You could calculate redundantly with overlapping unaligned loads, I think, and use
pC[i+0]+pC[i+1]
in one element and pC[i+1] +
pC[i+2]` in another. Same for pB
. (Your formula, (mRadiansPerSample * bp0) * pB[i] + mRadiansPerSample * pC[i]
is linear, so I think adding the pC and pB inputs before multiplying will give the same result.)– Peter Cordes
Jan 2 at 15:19
|
show 3 more comments
Thanks for contributing an answer to Stack Overflow!
- Please be sure to answer the question. Provide details and share your research!
But avoid …
- Asking for help, clarification, or responding to other answers.
- Making statements based on opinion; back them up with references or personal experience.
To learn more, see our tips on writing great answers.
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f54004452%2fusing-with-sse2%23new-answer', 'question_page');
}
);
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
1
How often do you expect to apply this correction though, and how many two pis do you need to subtract when it does happen? Maybe compare and subtract is still the way to go. (Not that I know SSE2.)
– Rup
Jan 2 at 10:18
@Rup: can't know :) It depends how
phase
will grown up...– markzzz
Jan 2 at 10:29
1
Are you even sure auto-vectorization does not do it for you? Did you check generated code when you enable SSE2 on your original source (eg, with
-msse2
on g++)?– spectras
Jan 2 at 11:15
3
That
while
could be changed into anif
becausephase
will never be more than 3pi at that point.– interjay
Jan 2 at 11:18
3
@markzzz I see. You can still use godbolt to check what gcc does and understand (and perhaps improve) upon it.
– spectras
Jan 2 at 12:05