[Git][pulseaudio/webrtc-audio-processing][master] Allow disabling inline SSE

Arun Raghavan (@arun) gitlab at gitlab.freedesktop.org
Fri Dec 27 08:26:31 UTC 2024



Arun Raghavan pushed to branch master at PulseAudio / webrtc-audio-processing


Commits:
fed81a77 by Arun Raghavan at 2024-12-26T17:35:08-05:00
Allow disabling inline SSE

Should make building on i686 without SSE feasible.

Fixes: https://gitlab.freedesktop.org/pulseaudio/webrtc-audio-processing/-/issues/5

- - - - -


9 changed files:

- meson.build
- meson_options.txt
- webrtc/modules/audio_processing/aec3/adaptive_fir_filter.cc
- webrtc/modules/audio_processing/aec3/adaptive_fir_filter_erl.cc
- webrtc/modules/audio_processing/aec3/fft_data.h
- webrtc/modules/audio_processing/aec3/matched_filter.cc
- webrtc/modules/audio_processing/aec3/vector_math.h
- webrtc/modules/audio_processing/agc2/rnn_vad/vector_math.h
- webrtc/third_party/pffft/meson.build


Changes:

=====================================
meson.build
=====================================
@@ -110,6 +110,7 @@ have_neon = false
 have_mips = false
 have_mips64 = false
 have_x86 = false
+have_inline_sse = false
 have_avx2 = false
 if host_machine.cpu_family() == 'arm'
   if cc.compiles('''#ifndef __ARM_ARCH_ISA_ARM
@@ -140,10 +141,19 @@ if host_machine.cpu_family() == 'mips64'
 endif
 if ['x86', 'x86_64'].contains(host_machine.cpu_family())
   have_x86 = true
-  # This is unconditionally enabled for now, actual usage is determined by
-  # runtime CPU detection, so we're just assuming the compiler supports avx2
+  # AVX2 support is unconditionally available, since all the code (compiled
+  # with -mavx2) is in separate files from runtime detection (which should not
+  # be compiled with SIMD flags for cases where the CPU does not support it).
+  # Unfortunately, a bunch of SSE code is inline with the runtime detection,
+  # and we can't support that on systems that don't support SSE.
   have_avx2 = true
   arch_cflags += ['-DWEBRTC_ENABLE_AVX2']
+  if get_option('inline-sse')
+    have_inline_sse = true
+  else
+    have_inline_sse = false
+    arch_cflags += ['-DWAP_DISABLE_INLINE_SSE']
+  endif
 endif
 
 neon_opt = get_option('neon')


=====================================
meson_options.txt
=====================================
@@ -3,4 +3,7 @@ option('gnustl', type: 'feature',
        description: 'Use gnustl for a c++ library implementation (only used on Android)')
 option('neon', type: 'combo',
        choices: ['no', 'yes', 'auto', 'runtime'],
-       description: '')
+       description: 'Enable NEON optimisations')
+option('inline-sse', type: 'boolean',
+       value: true,
+       description: 'Enable inline SSE/SSE2 optimisations (i.e. assume CPU supports SSE/SSE2)')


=====================================
webrtc/modules/audio_processing/aec3/adaptive_fir_filter.cc
=====================================
@@ -16,7 +16,7 @@
 #if defined(WEBRTC_HAS_NEON)
 #include <arm_neon.h>
 #endif
-#if defined(WEBRTC_ARCH_X86_FAMILY)
+#if defined(WEBRTC_ARCH_X86_FAMILY) && !defined(WAP_DISABLE_INLINE_SSE)
 #include <emmintrin.h>
 #endif
 #include <math.h>
@@ -88,7 +88,7 @@ void ComputeFrequencyResponse_Neon(
 }
 #endif
 
-#if defined(WEBRTC_ARCH_X86_FAMILY)
+#if defined(WEBRTC_ARCH_X86_FAMILY) && !defined(WAP_DISABLE_INLINE_SSE)
 // Computes and stores the frequency response of the filter.
 void ComputeFrequencyResponse_Sse2(
     size_t num_partitions,
@@ -212,7 +212,7 @@ void AdaptPartitions_Neon(const RenderBuffer& render_buffer,
 }
 #endif
 
-#if defined(WEBRTC_ARCH_X86_FAMILY)
+#if defined(WEBRTC_ARCH_X86_FAMILY) && !defined(WAP_DISABLE_INLINE_SSE)
 // Adapts the filter partitions. (SSE2 variant)
 void AdaptPartitions_Sse2(const RenderBuffer& render_buffer,
                           const FftData& G,
@@ -377,7 +377,7 @@ void ApplyFilter_Neon(const RenderBuffer& render_buffer,
 }
 #endif
 
-#if defined(WEBRTC_ARCH_X86_FAMILY)
+#if defined(WEBRTC_ARCH_X86_FAMILY) && !defined(WAP_DISABLE_INLINE_SSE)
 // Produces the filter output (SSE2 variant).
 void ApplyFilter_Sse2(const RenderBuffer& render_buffer,
                       size_t num_partitions,
@@ -557,9 +557,11 @@ void AdaptiveFirFilter::Filter(const RenderBuffer& render_buffer,
   RTC_DCHECK(S);
   switch (optimization_) {
 #if defined(WEBRTC_ARCH_X86_FAMILY)
+#if !defined(WAP_DISABLE_INLINE_SSE)
     case Aec3Optimization::kSse2:
       aec3::ApplyFilter_Sse2(render_buffer, current_size_partitions_, H_, S);
       break;
+#endif
     case Aec3Optimization::kAvx2:
       aec3::ApplyFilter_Avx2(render_buffer, current_size_partitions_, H_, S);
       break;
@@ -601,9 +603,11 @@ void AdaptiveFirFilter::ComputeFrequencyResponse(
 
   switch (optimization_) {
 #if defined(WEBRTC_ARCH_X86_FAMILY)
+#if !defined(WAP_DISABLE_INLINE_SSE)
     case Aec3Optimization::kSse2:
       aec3::ComputeFrequencyResponse_Sse2(current_size_partitions_, H_, H2);
       break;
+#endif
     case Aec3Optimization::kAvx2:
       aec3::ComputeFrequencyResponse_Avx2(current_size_partitions_, H_, H2);
       break;
@@ -626,10 +630,12 @@ void AdaptiveFirFilter::AdaptAndUpdateSize(const RenderBuffer& render_buffer,
   // Adapt the filter.
   switch (optimization_) {
 #if defined(WEBRTC_ARCH_X86_FAMILY)
+#if !defined(WAP_DISABLE_INLINE_SSE)
     case Aec3Optimization::kSse2:
       aec3::AdaptPartitions_Sse2(render_buffer, G, current_size_partitions_,
                                  &H_);
       break;
+#endif
     case Aec3Optimization::kAvx2:
       aec3::AdaptPartitions_Avx2(render_buffer, G, current_size_partitions_,
                                  &H_);


=====================================
webrtc/modules/audio_processing/aec3/adaptive_fir_filter_erl.cc
=====================================
@@ -16,7 +16,7 @@
 #if defined(WEBRTC_HAS_NEON)
 #include <arm_neon.h>
 #endif
-#if defined(WEBRTC_ARCH_X86_FAMILY)
+#if defined(WEBRTC_ARCH_X86_FAMILY) && !defined(WAP_DISABLE_INLINE_SSE)
 #include <emmintrin.h>
 #endif
 
@@ -54,7 +54,7 @@ void ErlComputer_NEON(
 }
 #endif
 
-#if defined(WEBRTC_ARCH_X86_FAMILY)
+#if defined(WEBRTC_ARCH_X86_FAMILY) && !defined(WAP_DISABLE_INLINE_SSE)
 // Computes and stores the echo return loss estimate of the filter, which is the
 // sum of the partition frequency responses.
 void ErlComputer_SSE2(
@@ -82,9 +82,11 @@ void ComputeErl(const Aec3Optimization& optimization,
   // Update the frequency response and echo return loss for the filter.
   switch (optimization) {
 #if defined(WEBRTC_ARCH_X86_FAMILY)
+#if !defined(WAP_DISABLE_INLINE_SSE)
     case Aec3Optimization::kSse2:
       aec3::ErlComputer_SSE2(H2, erl);
       break;
+#endif
     case Aec3Optimization::kAvx2:
       aec3::ErlComputer_AVX2(H2, erl);
       break;


=====================================
webrtc/modules/audio_processing/aec3/fft_data.h
=====================================
@@ -14,7 +14,7 @@
 // Defines WEBRTC_ARCH_X86_FAMILY, used below.
 #include "rtc_base/system/arch.h"
 
-#if defined(WEBRTC_ARCH_X86_FAMILY)
+#if defined(WEBRTC_ARCH_X86_FAMILY) && !defined(WAP_DISABLE_INLINE_SSE)
 #include <emmintrin.h>
 #endif
 #include <algorithm>
@@ -49,6 +49,7 @@ struct FftData {
     RTC_DCHECK_EQ(kFftLengthBy2Plus1, power_spectrum.size());
     switch (optimization) {
 #if defined(WEBRTC_ARCH_X86_FAMILY)
+#if !defined(WAP_DISABLE_INLINE_SSE)
       case Aec3Optimization::kSse2: {
         constexpr int kNumFourBinBands = kFftLengthBy2 / 4;
         constexpr int kLimit = kNumFourBinBands * 4;
@@ -63,6 +64,7 @@ struct FftData {
         power_spectrum[kFftLengthBy2] = re[kFftLengthBy2] * re[kFftLengthBy2] +
                                         im[kFftLengthBy2] * im[kFftLengthBy2];
       } break;
+#endif
       case Aec3Optimization::kAvx2:
         SpectrumAVX2(power_spectrum);
         break;


=====================================
webrtc/modules/audio_processing/aec3/matched_filter.cc
=====================================
@@ -15,7 +15,7 @@
 #if defined(WEBRTC_HAS_NEON)
 #include <arm_neon.h>
 #endif
-#if defined(WEBRTC_ARCH_X86_FAMILY)
+#if defined(WEBRTC_ARCH_X86_FAMILY) && !defined(WAP_DISABLE_INLINE_SSE)
 #include <emmintrin.h>
 #endif
 #include <algorithm>
@@ -286,7 +286,7 @@ void MatchedFilterCore_NEON(size_t x_start_index,
 
 #endif
 
-#if defined(WEBRTC_ARCH_X86_FAMILY)
+#if defined(WEBRTC_ARCH_X86_FAMILY) && !defined(WAP_DISABLE_INLINE_SSE)
 
 void MatchedFilterCore_AccumulatedError_SSE2(
     size_t x_start_index,
@@ -695,12 +695,14 @@ void MatchedFilter::Update(const DownsampledRenderBuffer& render_buffer,
 
     switch (optimization_) {
 #if defined(WEBRTC_ARCH_X86_FAMILY)
+#if !defined(WAP_DISABLE_INLINE_SSE)
       case Aec3Optimization::kSse2:
         aec3::MatchedFilterCore_SSE2(
             x_start_index, x2_sum_threshold, smoothing, render_buffer.buffer, y,
             filters_[n], &filters_updated, &error_sum, compute_pre_echo,
             instantaneous_accumulated_error_, scratch_memory_);
         break;
+#endif
       case Aec3Optimization::kAvx2:
         aec3::MatchedFilterCore_AVX2(
             x_start_index, x2_sum_threshold, smoothing, render_buffer.buffer, y,


=====================================
webrtc/modules/audio_processing/aec3/vector_math.h
=====================================
@@ -17,7 +17,7 @@
 #if defined(WEBRTC_HAS_NEON)
 #include <arm_neon.h>
 #endif
-#if defined(WEBRTC_ARCH_X86_FAMILY)
+#if defined(WEBRTC_ARCH_X86_FAMILY) && !defined(WAP_DISABLE_INLINE_SSE)
 #include <emmintrin.h>
 #endif
 #include <math.h>
@@ -43,7 +43,7 @@ class VectorMath {
   void SqrtAVX2(rtc::ArrayView<float> x);
   void Sqrt(rtc::ArrayView<float> x) {
     switch (optimization_) {
-#if defined(WEBRTC_ARCH_X86_FAMILY)
+#if defined(WEBRTC_ARCH_X86_FAMILY) && !defined(WAP_DISABLE_INLINE_SSE)
       case Aec3Optimization::kSse2: {
         const int x_size = static_cast<int>(x.size());
         const int vector_limit = x_size >> 2;
@@ -123,7 +123,7 @@ class VectorMath {
     RTC_DCHECK_EQ(z.size(), x.size());
     RTC_DCHECK_EQ(z.size(), y.size());
     switch (optimization_) {
-#if defined(WEBRTC_ARCH_X86_FAMILY)
+#if defined(WEBRTC_ARCH_X86_FAMILY) && !defined(WAP_DISABLE_INLINE_SSE)
       case Aec3Optimization::kSse2: {
         const int x_size = static_cast<int>(x.size());
         const int vector_limit = x_size >> 2;
@@ -174,6 +174,7 @@ class VectorMath {
     RTC_DCHECK_EQ(z.size(), x.size());
     switch (optimization_) {
 #if defined(WEBRTC_ARCH_X86_FAMILY)
+#if !defined(WAP_DISABLE_INLINE_SSE)
       case Aec3Optimization::kSse2: {
         const int x_size = static_cast<int>(x.size());
         const int vector_limit = x_size >> 2;
@@ -190,6 +191,7 @@ class VectorMath {
           z[j] += x[j];
         }
       } break;
+#endif
       case Aec3Optimization::kAvx2:
         AccumulateAVX2(x, z);
         break;


=====================================
webrtc/modules/audio_processing/agc2/rnn_vad/vector_math.h
=====================================
@@ -17,7 +17,7 @@
 #if defined(WEBRTC_HAS_NEON)
 #include <arm_neon.h>
 #endif
-#if defined(WEBRTC_ARCH_X86_FAMILY)
+#if defined(WEBRTC_ARCH_X86_FAMILY) && !defined(WAP_DISABLE_INLINE_SSE)
 #include <emmintrin.h>
 #endif
 
@@ -47,6 +47,7 @@ class VectorMath {
     if (cpu_features_.avx2) {
       return DotProductAvx2(x, y);
     } else if (cpu_features_.sse2) {
+#if !defined(WAP_DISABLE_INLINE_SSE)
       __m128 accumulator = _mm_setzero_ps();
       constexpr int kBlockSizeLog2 = 2;
       constexpr int kBlockSize = 1 << kBlockSizeLog2;
@@ -72,6 +73,7 @@ class VectorMath {
         dot_product += x[i] * y[i];
       }
       return dot_product;
+#endif
     }
 #elif defined(WEBRTC_HAS_NEON) && defined(WEBRTC_ARCH_ARM64)
     if (cpu_features_.neon) {


=====================================
webrtc/third_party/pffft/meson.build
=====================================
@@ -4,7 +4,9 @@ pffft_sources = [
 
 pffft_cflags = [ '-D_GNU_SOURCE' ]
 
-if (have_arm and not have_neon) or (have_mips and host_machine.endian() == 'little') or have_mips64
+# have_inline_sse is only ever set to true on x86, so gate the check on
+# have_x86 to keep PFFFT SIMD enabled elsewhere (e.g. ARM with NEON).
+if (have_x86 and not have_inline_sse) or (have_arm and not have_neon) or (have_mips and host_machine.endian() == 'little') or have_mips64
   pffft_cflags += [ '-DPFFFT_SIMD_DISABLE' ]
 endif
 



View it on GitLab: https://gitlab.freedesktop.org/pulseaudio/webrtc-audio-processing/-/commit/fed81a77c9a9bc366556f732324cdc5f9e7b09e9

-- 
View it on GitLab: https://gitlab.freedesktop.org/pulseaudio/webrtc-audio-processing/-/commit/fed81a77c9a9bc366556f732324cdc5f9e7b09e9
You're receiving this email because of your account on gitlab.freedesktop.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.freedesktop.org/archives/pulseaudio-commits/attachments/20241227/f7f0ad5b/attachment-0001.htm>


More information about the pulseaudio-commits mailing list