core/mixer/mixer_sse.cpp

   1 #include "config.h"
   2
   3 #include <xmmintrin.h>
   4
   5 #include <cmath>
   6 #include <limits>
   7
   8 #include "alnumeric.h"
   9 #include "core/bsinc_defs.h"
  10 #include "core/cubic_defs.h"
  11 #include "defs.h"
  12 #include "hrtfbase.h"
  13
  /* Forward declarations of the tag types that select the template
   * specializations defined in this file; only their names are needed here.
   */
  struct BSincTag;
  struct CubicTag;
  struct FastBSincTag;
  struct SSETag;


  /* GCC proper (not Clang) builds that don't already define __SSE__ have the
   * "sse" target requested from this point in the file onward.
   */
  #if defined(__GNUC__) && !defined(__clang__) && !defined(__SSE__)
  #pragma GCC target("sse")
  #endif
  23
  24 namespace {
  25
  26 constexpr uint BSincPhaseDiffBits{MixerFracBits - BSincPhaseBits};
  27 constexpr uint BSincPhaseDiffOne{1 << BSincPhaseDiffBits};
  28 constexpr uint BSincPhaseDiffMask{BSincPhaseDiffOne - 1u};
  29
  30 constexpr uint CubicPhaseDiffBits{MixerFracBits - CubicPhaseBits};
  31 constexpr uint CubicPhaseDiffOne{1 << CubicPhaseDiffBits};
  32 constexpr uint CubicPhaseDiffMask{CubicPhaseDiffOne - 1u};
  33
  34 #define MLA4(x, y, z) _mm_add_ps(x, _mm_mul_ps(y, z))
  35
  36 inline void ApplyCoeffs(float2 *RESTRICT Values, const size_t IrSize, const ConstHrirSpan Coeffs,
  37     const float left, const float right)
  38 {
  39     const __m128 lrlr{_mm_setr_ps(left, right, left, right)};
  40
  41     ASSUME(IrSize >= MinIrLength);
  42     /* This isn't technically correct to test alignment, but it's true for
  43      * systems that support SSE, which is the only one that needs to know the
  44      * alignment of Values (which alternates between 8- and 16-byte aligned).
  45      */
  46     if(!(reinterpret_cast<uintptr_t>(Values)&15))
  47     {
  48         for(size_t i{0};i < IrSize;i += 2)
  49         {
  50             const __m128 coeffs{_mm_load_ps(Coeffs[i].data())};
  51             __m128 vals{_mm_load_ps(Values[i].data())};
  52             vals = MLA4(vals, lrlr, coeffs);
  53             _mm_store_ps(Values[i].data(), vals);
  54         }
  55     }
  56     else
  57     {
  58         __m128 imp0, imp1;
  59         __m128 coeffs{_mm_load_ps(Coeffs[0].data())};
  60         __m128 vals{_mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64*>(Values[0].data()))};
  61         imp0 = _mm_mul_ps(lrlr, coeffs);
  62         vals = _mm_add_ps(imp0, vals);
  63         _mm_storel_pi(reinterpret_cast<__m64*>(Values[0].data()), vals);
  64         size_t td{((IrSize+1)>>1) - 1};
  65         size_t i{1};
  66         do {
  67             coeffs = _mm_load_ps(Coeffs[i+1].data());
  68             vals = _mm_load_ps(Values[i].data());
  69             imp1 = _mm_mul_ps(lrlr, coeffs);
  70             imp0 = _mm_shuffle_ps(imp0, imp1, _MM_SHUFFLE(1, 0, 3, 2));
  71             vals = _mm_add_ps(imp0, vals);
  72             _mm_store_ps(Values[i].data(), vals);
  73             imp0 = imp1;
  74             i += 2;
  75         } while(--td);
  76         vals = _mm_loadl_pi(vals, reinterpret_cast<__m64*>(Values[i].data()));
  77         imp0 = _mm_movehl_ps(imp0, imp0);
  78         vals = _mm_add_ps(imp0, vals);
  79         _mm_storel_pi(reinterpret_cast<__m64*>(Values[i].data()), vals);
  80     }
  81 }
  82
  83 force_inline void MixLine(const al::span<const float> InSamples, float *RESTRICT dst,
  84     float &CurrentGain, const float TargetGain, const float delta, const size_t min_len,
  85     const size_t aligned_len, size_t Counter)
  86 {
  87     float gain{CurrentGain};
  88     const float step{(TargetGain-gain) * delta};
  89
  90     size_t pos{0};
  91     if(!(std::abs(step) > std::numeric_limits<float>::epsilon()))
  92         gain = TargetGain;
  93     else
  94     {
  95         float step_count{0.0f};
  96         /* Mix with applying gain steps in aligned multiples of 4. */
  97         if(size_t todo{min_len >> 2})
  98         {
  99             const __m128 four4{_mm_set1_ps(4.0f)};
 100             const __m128 step4{_mm_set1_ps(step)};
 101             const __m128 gain4{_mm_set1_ps(gain)};
 102             __m128 step_count4{_mm_setr_ps(0.0f, 1.0f, 2.0f, 3.0f)};
 103             do {
 104                 const __m128 val4{_mm_load_ps(&InSamples[pos])};
 105                 __m128 dry4{_mm_load_ps(&dst[pos])};
 106
 107                 /* dry += val * (gain + step*step_count) */
 108                 dry4 = MLA4(dry4, val4, MLA4(gain4, step4, step_count4));
 109
 110                 _mm_store_ps(&dst[pos], dry4);
 111                 step_count4 = _mm_add_ps(step_count4, four4);
 112                 pos += 4;
 113             } while(--todo);
 114             /* NOTE: step_count4 now represents the next four counts after the
 115              * last four mixed samples, so the lowest element represents the
 116              * next step count to apply.
 117              */
 118             step_count = _mm_cvtss_f32(step_count4);
 119         }
 120         /* Mix with applying left over gain steps that aren't aligned multiples of 4. */
 121         for(size_t leftover{min_len&3};leftover;++pos,--leftover)
 122         {
 123             dst[pos] += InSamples[pos] * (gain + step*step_count);
 124             step_count += 1.0f;
 125         }
 126         if(pos == Counter)
 127             gain = TargetGain;
 128         else
 129             gain += step*step_count;
 130
 131         /* Mix until pos is aligned with 4 or the mix is done. */
 132         for(size_t leftover{aligned_len&3};leftover;++pos,--leftover)
 133             dst[pos] += InSamples[pos] * gain;
 134     }
 135     CurrentGain = gain;
 136
 137     if(!(std::abs(gain) > GainSilenceThreshold))
 138         return;
 139     if(size_t todo{(InSamples.size()-pos) >> 2})
 140     {
 141         const __m128 gain4{_mm_set1_ps(gain)};
 142         do {
 143             const __m128 val4{_mm_load_ps(&InSamples[pos])};
 144             __m128 dry4{_mm_load_ps(&dst[pos])};
 145             dry4 = _mm_add_ps(dry4, _mm_mul_ps(val4, gain4));
 146             _mm_store_ps(&dst[pos], dry4);
 147             pos += 4;
 148         } while(--todo);
 149     }
 150     for(size_t leftover{(InSamples.size()-pos)&3};leftover;++pos,--leftover)
 151         dst[pos] += InSamples[pos] * gain;
 152 }
 153
 154 } // namespace
 155
 156 template<>
 157 void Resample_<CubicTag,SSETag>(const InterpState *state, const float *RESTRICT src, uint frac,
 158     const uint increment, const al::span<float> dst)
 159 {
 160     ASSUME(frac < MixerFracOne);
 161
 162     const CubicCoefficients *RESTRICT filter = al::assume_aligned<16>(state->cubic.filter);
 163
 164     src -= 1;
 165     for(float &out_sample : dst)
 166     {
 167         const uint pi{frac >> CubicPhaseDiffBits};
 168         const float pf{static_cast<float>(frac&CubicPhaseDiffMask) * (1.0f/CubicPhaseDiffOne)};
 169         const __m128 pf4{_mm_set1_ps(pf)};
 170
 171         /* Apply the phase interpolated filter. */
 172
 173         /* f = fil + pf*phd */
 174         const __m128 f4 = MLA4(_mm_load_ps(filter[pi].mCoeffs), pf4,
 175             _mm_load_ps(filter[pi].mDeltas));
 176         /* r = f*src */
 177         __m128 r4{_mm_mul_ps(f4, _mm_loadu_ps(src))};
 178
 179         r4 = _mm_add_ps(r4, _mm_shuffle_ps(r4, r4, _MM_SHUFFLE(0, 1, 2, 3)));
 180         r4 = _mm_add_ps(r4, _mm_movehl_ps(r4, r4));
 181         out_sample = _mm_cvtss_f32(r4);
 182
 183         frac += increment;
 184         src  += frac>>MixerFracBits;
 185         frac &= MixerFracMask;
 186     }
 187 }
 188
 189 template<>
 190 void Resample_<BSincTag,SSETag>(const InterpState *state, const float *RESTRICT src, uint frac,
 191     const uint increment, const al::span<float> dst)
 192 {
 193     const float *const filter{state->bsinc.filter};
 194     const __m128 sf4{_mm_set1_ps(state->bsinc.sf)};
 195     const size_t m{state->bsinc.m};
 196     ASSUME(m > 0);
 197     ASSUME(frac < MixerFracOne);
 198
 199     src -= state->bsinc.l;
 200     for(float &out_sample : dst)
 201     {
 202         // Calculate the phase index and factor.
 203         const uint pi{frac >> BSincPhaseDiffBits};
 204         const float pf{static_cast<float>(frac&BSincPhaseDiffMask) * (1.0f/BSincPhaseDiffOne)};
 205
 206         // Apply the scale and phase interpolated filter.
 207         __m128 r4{_mm_setzero_ps()};
 208         {
 209             const __m128 pf4{_mm_set1_ps(pf)};
 210             const float *RESTRICT fil{filter + m*pi*2};
 211             const float *RESTRICT phd{fil + m};
 212             const float *RESTRICT scd{fil + BSincPhaseCount*2*m};
 213             const float *RESTRICT spd{scd + m};
 214             size_t td{m >> 2};
 215             size_t j{0u};
 216
 217             do {
 218                 /* f = ((fil + sf*scd) + pf*(phd + sf*spd)) */
 219                 const __m128 f4 = MLA4(
 220                     MLA4(_mm_load_ps(&fil[j]), sf4, _mm_load_ps(&scd[j])),
 221                     pf4, MLA4(_mm_load_ps(&phd[j]), sf4, _mm_load_ps(&spd[j])));
 222                 /* r += f*src */
 223                 r4 = MLA4(r4, f4, _mm_loadu_ps(&src[j]));
 224                 j += 4;
 225             } while(--td);
 226         }
 227         r4 = _mm_add_ps(r4, _mm_shuffle_ps(r4, r4, _MM_SHUFFLE(0, 1, 2, 3)));
 228         r4 = _mm_add_ps(r4, _mm_movehl_ps(r4, r4));
 229         out_sample = _mm_cvtss_f32(r4);
 230
 231         frac += increment;
 232         src  += frac>>MixerFracBits;
 233         frac &= MixerFracMask;
 234     }
 235 }
 236
 237 template<>
 238 void Resample_<FastBSincTag,SSETag>(const InterpState *state, const float *RESTRICT src, uint frac,
 239     const uint increment, const al::span<float> dst)
 240 {
 241     const float *const filter{state->bsinc.filter};
 242     const size_t m{state->bsinc.m};
 243     ASSUME(m > 0);
 244     ASSUME(frac < MixerFracOne);
 245
 246     src -= state->bsinc.l;
 247     for(float &out_sample : dst)
 248     {
 249         // Calculate the phase index and factor.
 250         const uint pi{frac >> BSincPhaseDiffBits};
 251         const float pf{static_cast<float>(frac&BSincPhaseDiffMask) * (1.0f/BSincPhaseDiffOne)};
 252
 253         // Apply the phase interpolated filter.
 254         __m128 r4{_mm_setzero_ps()};
 255         {
 256             const __m128 pf4{_mm_set1_ps(pf)};
 257             const float *RESTRICT fil{filter + m*pi*2};
 258             const float *RESTRICT phd{fil + m};
 259             size_t td{m >> 2};
 260             size_t j{0u};
 261
 262             do {
 263                 /* f = fil + pf*phd */
 264                 const __m128 f4 = MLA4(_mm_load_ps(&fil[j]), pf4, _mm_load_ps(&phd[j]));
 265                 /* r += f*src */
 266                 r4 = MLA4(r4, f4, _mm_loadu_ps(&src[j]));
 267                 j += 4;
 268             } while(--td);
 269         }
 270         r4 = _mm_add_ps(r4, _mm_shuffle_ps(r4, r4, _MM_SHUFFLE(0, 1, 2, 3)));
 271         r4 = _mm_add_ps(r4, _mm_movehl_ps(r4, r4));
 272         out_sample = _mm_cvtss_f32(r4);
 273
 274         frac += increment;
 275         src  += frac>>MixerFracBits;
 276         frac &= MixerFracMask;
 277     }
 278 }
 279
 280
 281 template<>
 282 void MixHrtf_<SSETag>(const float *InSamples, float2 *AccumSamples, const uint IrSize,
 283     const MixHrtfFilter *hrtfparams, const size_t BufferSize)
 284 { MixHrtfBase<ApplyCoeffs>(InSamples, AccumSamples, IrSize, hrtfparams, BufferSize); }
 285
 286 template<>
 287 void MixHrtfBlend_<SSETag>(const float *InSamples, float2 *AccumSamples, const uint IrSize,
 288     const HrtfFilter *oldparams, const MixHrtfFilter *newparams, const size_t BufferSize)
 289 {
 290     MixHrtfBlendBase<ApplyCoeffs>(InSamples, AccumSamples, IrSize, oldparams, newparams,
 291         BufferSize);
 292 }
 293
 294 template<>
 295 void MixDirectHrtf_<SSETag>(const FloatBufferSpan LeftOut, const FloatBufferSpan RightOut,
 296     const al::span<const FloatBufferLine> InSamples, float2 *AccumSamples,
 297     float *TempBuf, HrtfChannelState *ChanState, const size_t IrSize, const size_t BufferSize)
 298 {
 299     MixDirectHrtfBase<ApplyCoeffs>(LeftOut, RightOut, InSamples, AccumSamples, TempBuf, ChanState,
 300         IrSize, BufferSize);
 301 }
 302
 303
 304 template<>
 305 void Mix_<SSETag>(const al::span<const float> InSamples, const al::span<FloatBufferLine> OutBuffer,
 306     float *CurrentGains, const float *TargetGains, const size_t Counter, const size_t OutPos)
 307 {
 308     const float delta{(Counter > 0) ? 1.0f / static_cast<float>(Counter) : 0.0f};
 309     const auto min_len = minz(Counter, InSamples.size());
 310     const auto aligned_len = minz((min_len+3) & ~size_t{3}, InSamples.size()) - min_len;
 311
 312     for(FloatBufferLine &output : OutBuffer)
 313         MixLine(InSamples, al::assume_aligned<16>(output.data()+OutPos), *CurrentGains++,
 314             *TargetGains++, delta, min_len, aligned_len, Counter);
 315 }
 316
 317 template<>
 318 void Mix_<SSETag>(const al::span<const float> InSamples, float *OutBuffer, float &CurrentGain,
 319     const float TargetGain, const size_t Counter)
 320 {
 321     const float delta{(Counter > 0) ? 1.0f / static_cast<float>(Counter) : 0.0f};
 322     const auto min_len = minz(Counter, InSamples.size());
 323     const auto aligned_len = minz((min_len+3) & ~size_t{3}, InSamples.size()) - min_len;
 324
 325     MixLine(InSamples, al::assume_aligned<16>(OutBuffer), CurrentGain, TargetGain, delta, min_len,
 326         aligned_len, Counter);
 327 }