[Liboil-commit] liboil/sse

David Schleef ds at kemper.freedesktop.org
Mon May 12 16:48:28 PDT 2008


 liboil/sse/clamp_sse.c          |   27 +++++++++++++------------
 liboil/sse/composite_sse.c      |   10 +++++----
 liboil/sse/composite_sse_2pix.c |   39 ++++++++++++++++++------------------
 liboil/sse/composite_sse_4pix.c |   43 +++++++++++++++++++---------------------
 liboil/sse/copy_sse.c           |    6 +++--
 liboil/sse/math_sse.c           |   28 +++++++++++++-------------
 liboil/sse/math_sse_unroll2.c   |   24 ++++++++++++----------
 liboil/sse/multsum_sse.c        |    4 ++-
 liboil/sse/sad8x8_sse.c         |    4 +--
 liboil/sse/splat_sse.c          |   10 +++++----
 10 files changed, 105 insertions(+), 90 deletions(-)

New commits:
commit 99c2ef9c10b6735592a715a17919690d617c3a4f
Author: David Schleef <ds at ginger.bigkitten.com>
Date:   Mon May 12 16:47:37 2008 -0700

    Use __attribute__((force_align_arg_pointer)) for SSE code.
    Patch from Mike Smith.  Partial fix for #11145

diff --git a/liboil/sse/clamp_sse.c b/liboil/sse/clamp_sse.c
index 5d34c6a..06c8ae7 100644
--- a/liboil/sse/clamp_sse.c
+++ b/liboil/sse/clamp_sse.c
@@ -32,7 +32,10 @@
 #include <emmintrin.h>
 #include <xmmintrin.h>
 
-static void
+/* TODO: If we have gcc 4.2 or above, do this. Otherwise, disable all SSE use */
+#define SSE_FUNCTION __attribute__((force_align_arg_pointer))
+
+SSE_FUNCTION static void
 clamp_u8_sse (uint8_t *dest, uint8_t *src1, int n, uint8_t *src2_1,
     uint8_t *src3_1)
 {
@@ -71,7 +74,7 @@ clamp_u8_sse (uint8_t *dest, uint8_t *src1, int n, uint8_t *src2_1,
 }
 OIL_DEFINE_IMPL_FULL (clamp_u8_sse, clamp_u8, OIL_IMPL_FLAG_SSE2);
 
-static void
+SSE_FUNCTION static void
 clamp_s16_sse (int16_t *dest, int16_t *src1, int n, int16_t *src2_1,
     int16_t *src3_1)
 {
@@ -110,7 +113,7 @@ clamp_s16_sse (int16_t *dest, int16_t *src1, int n, int16_t *src2_1,
 }
 OIL_DEFINE_IMPL_FULL (clamp_s16_sse, clamp_s16, OIL_IMPL_FLAG_SSE2);
 
-static void
+SSE_FUNCTION static void
 clamp_f32_sse (float *dest, const float *src1, int n, const float *src2_1,
     const float *src3_1)
 {
@@ -149,7 +152,7 @@ clamp_f32_sse (float *dest, const float *src1, int n, const float *src2_1,
 }
 OIL_DEFINE_IMPL_FULL (clamp_f32_sse, clamp_f32, OIL_IMPL_FLAG_SSE);
 
-static void
+SSE_FUNCTION static void
 clamp_f64_sse (double *dest, const double *src1, int n, const double *src2_1,
     const double *src3_1)
 {
@@ -189,7 +192,7 @@ clamp_f64_sse (double *dest, const double *src1, int n, const double *src2_1,
 OIL_DEFINE_IMPL_FULL (clamp_f64_sse, clamp_f64,
     OIL_IMPL_FLAG_SSE | OIL_IMPL_FLAG_SSE2);
 
-static void
+SSE_FUNCTION static void
 clamplow_u8_sse (uint8_t *dest, const uint8_t *src1, int n,
     const uint8_t *src2_1)
 {
@@ -221,7 +224,7 @@ clamplow_u8_sse (uint8_t *dest, const uint8_t *src1, int n,
 }
 OIL_DEFINE_IMPL_FULL (clamplow_u8_sse, clamplow_u8, OIL_IMPL_FLAG_SSE2);
 
-static void
+SSE_FUNCTION static void
 clamplow_s16_sse (int16_t *dest, const int16_t *src1, int n,
     const int16_t *src2_1)
 {
@@ -253,7 +256,7 @@ clamplow_s16_sse (int16_t *dest, const int16_t *src1, int n,
 }
 OIL_DEFINE_IMPL_FULL (clamplow_s16_sse, clamplow_s16, OIL_IMPL_FLAG_SSE2);
 
-static void
+SSE_FUNCTION static void
 clamplow_f32_sse (float *dest, const float *src1, int n, const float *src2_1)
 {
   __m128 xmm1;
@@ -284,7 +287,7 @@ clamplow_f32_sse (float *dest, const float *src1, int n, const float *src2_1)
 }
 OIL_DEFINE_IMPL_FULL (clamplow_f32_sse, clamplow_f32, OIL_IMPL_FLAG_SSE);
 
-static void
+SSE_FUNCTION static void
 clamplow_f64_sse (double *dest, const double *src1, int n, const double *src2_1)
 {
   __m128d xmm1;
@@ -316,7 +319,7 @@ clamplow_f64_sse (double *dest, const double *src1, int n, const double *src2_1)
 OIL_DEFINE_IMPL_FULL (clamplow_f64_sse, clamplow_f64,
     OIL_IMPL_FLAG_SSE | OIL_IMPL_FLAG_SSE2);
 
-static void
+SSE_FUNCTION static void
 clamphigh_u8_sse (uint8_t *dest, const uint8_t *src1, int n,
     const uint8_t *src2_1)
 {
@@ -348,7 +351,7 @@ clamphigh_u8_sse (uint8_t *dest, const uint8_t *src1, int n,
 }
 OIL_DEFINE_IMPL_FULL (clamphigh_u8_sse, clamphigh_u8, OIL_IMPL_FLAG_SSE2);
 
-static void
+SSE_FUNCTION static void
 clamphigh_s16_sse (int16_t *dest, const int16_t *src1, int n,
     const int16_t *src2_1)
 {
@@ -380,7 +383,7 @@ clamphigh_s16_sse (int16_t *dest, const int16_t *src1, int n,
 }
 OIL_DEFINE_IMPL_FULL (clamphigh_s16_sse, clamphigh_s16, OIL_IMPL_FLAG_SSE2);
 
-static void
+SSE_FUNCTION static void
 clamphigh_f32_sse (float *dest, const float *src1, int n, const float *src2_1)
 {
   __m128 xmm1;
@@ -411,7 +414,7 @@ clamphigh_f32_sse (float *dest, const float *src1, int n, const float *src2_1)
 }
 OIL_DEFINE_IMPL_FULL (clamphigh_f32_sse, clamphigh_f32, OIL_IMPL_FLAG_SSE);
 
-static void
+SSE_FUNCTION static void
 clamphigh_f64_sse (double *dest, const double *src1, int n, const double *src2_1)
 {
   __m128d xmm1;
diff --git a/liboil/sse/composite_sse.c b/liboil/sse/composite_sse.c
index 307fd17..ce749cf 100644
--- a/liboil/sse/composite_sse.c
+++ b/liboil/sse/composite_sse.c
@@ -32,9 +32,11 @@
 #include <emmintrin.h>
 #include <liboil/liboilcolorspace.h>
 
+#define SSE_FUNCTION __attribute__((force_align_arg_pointer))
+
 #define COMPOSITE_ADD(d,s) oil_clamp_255((d) + (s))
 
-static void
+SSE_FUNCTION static void
 composite_add_argb_sse (uint32_t *dest, const uint32_t *src, int n)
 {
   /* Initial operations to align the destination pointer */
@@ -67,7 +69,7 @@ composite_add_argb_sse (uint32_t *dest, const uint32_t *src, int n)
 OIL_DEFINE_IMPL_FULL (composite_add_argb_sse, composite_add_argb,
     OIL_IMPL_FLAG_SSE2);
 
-static void
+SSE_FUNCTION static void
 composite_add_argb_const_src_sse (uint32_t *dest, const uint32_t *src_1, int n)
 {
   __m128i s;
@@ -103,7 +105,7 @@ composite_add_argb_const_src_sse (uint32_t *dest, const uint32_t *src_1, int n)
 OIL_DEFINE_IMPL_FULL (composite_add_argb_const_src_sse,
     composite_add_argb_const_src, OIL_IMPL_FLAG_SSE2);
 
-static void
+SSE_FUNCTION static void
 composite_add_u8_sse (uint8_t *dest, const uint8_t *src, int n)
 {
   /* Initial operations to align the destination pointer */
@@ -131,7 +133,7 @@ composite_add_u8_sse (uint8_t *dest, const uint8_t *src, int n)
 OIL_DEFINE_IMPL_FULL (composite_add_u8_sse, composite_add_u8,
     OIL_IMPL_FLAG_SSE2);
 
-static void
+SSE_FUNCTION static void
 composite_add_u8_const_src_sse (uint8_t *dest, const uint8_t *src_1, int n)
 {
   __m128i s;
diff --git a/liboil/sse/composite_sse_2pix.c b/liboil/sse/composite_sse_2pix.c
index 13f2cf4..2d19475 100644
--- a/liboil/sse/composite_sse_2pix.c
+++ b/liboil/sse/composite_sse_2pix.c
@@ -32,7 +32,7 @@
 #include <emmintrin.h>
 #include <liboil/liboilcolorspace.h>
 
-#include "sse_wrapper.h"
+#define SSE_FUNCTION __attribute__((force_align_arg_pointer))
 
 /* non-SSE2 compositing support */
 #define COMPOSITE_OVER(d,s,m) ((d) + (s) - oil_muldiv_255((d),(m)))
@@ -63,7 +63,7 @@ static const struct _SSEData {
 /* Shuffles the given value such that the alpha for each pixel appears in each
  * channel of the pixel.
  */
-static inline __m128i
+SSE_FUNCTION static inline __m128i
 argb_A_sse2(__m128i a)
 {
   a = _mm_shufflehi_epi16(a, _MM_SHUFFLE(3,3,3,3));
@@ -74,7 +74,7 @@ argb_A_sse2(__m128i a)
 /* Multiplies the pixel data in a channel-by-channel by b, and divides the
  * result by 255, with rounding.
  */
-static inline __m128i
+SSE_FUNCTION static inline __m128i
 muldiv_255_sse2(__m128i a, __m128i b)
 {
   __m128i ret;
@@ -88,14 +88,14 @@ muldiv_255_sse2(__m128i a, __m128i b)
   return ret;
 }
 
-static inline __m128i
+SSE_FUNCTION static inline __m128i
 negate_argb_sse2(__m128i a)
 {
   return _mm_xor_si128(a, MC(8x00ff));
 }
 
 /* Loads the 2 (unaligned) pixels at *src into unpacked SSE2 registers */
-static inline __m128i
+SSE_FUNCTION static inline __m128i
 load_argb_sse2(const uint32_t *src)
 {
   __m128i pix;
@@ -105,7 +105,7 @@ load_argb_sse2(const uint32_t *src)
   return pix;
 }
 
-static inline __m128i
+SSE_FUNCTION static inline __m128i
 set1_argb_sse2(uint32_t src)
 {
   __m128i pix;
@@ -115,32 +115,33 @@ set1_argb_sse2(uint32_t src)
   return pix;
 }
 
-static inline __m128i
+SSE_FUNCTION static inline __m128i
 load_u8_mask(const uint8_t *m)
 {
   return _mm_unpacklo_epi64(_mm_set1_epi16(m[0]), _mm_set1_epi16(m[1]));
 }
 
-static inline __m128i
+SSE_FUNCTION static inline __m128i
 set1_u8_mask(uint8_t m)
 {
   return _mm_unpacklo_epi8(_mm_set1_epi8(m), _mm_setzero_si128());
 }
 
 /* Stores the 2 unpacked pixels in pix into the (unaligned) *dest */
-static void
+SSE_FUNCTION static void
 store_argb_sse2(uint32_t *dest, __m128i pix)
 {
   pix = _mm_packus_epi16(pix, pix);
   _mm_storel_epi64((__m128i *)dest, pix);
 }
 
-static __m128i over_argb_sse2(__m128i dest, __m128i src, __m128i srca)
+SSE_FUNCTION static __m128i 
+over_argb_sse2(__m128i dest, __m128i src, __m128i srca)
 {
   return _mm_adds_epu8(src, muldiv_255_sse2(dest, negate_argb_sse2(srca)));
 }
 
-static void
+SSE_FUNCTION static void
 composite_in_argb_sse_2pix (uint32_t *dest, const uint32_t *src,
     const uint8_t *mask, int n)
 {
@@ -167,7 +168,7 @@ composite_in_argb_sse_2pix (uint32_t *dest, const uint32_t *src,
 OIL_DEFINE_IMPL_FULL_WRAPPER(composite_in_argb_sse_2pix, composite_in_argb,
     OIL_IMPL_FLAG_SSE2);
 
-static void
+SSE_FUNCTION static void
 composite_in_argb_const_src_sse_2pix (uint32_t *dest, const uint32_t *src,
     const uint8_t *mask, int n)
 {
@@ -196,7 +197,7 @@ OIL_DEFINE_IMPL_FULL_WRAPPER(composite_in_argb_const_src_sse_2pix,
     composite_in_argb_const_src, OIL_IMPL_FLAG_SSE2);
 
 #ifdef SSE_ALIGN
-static void
+SSE_FUNCTION static void
 composite_in_argb_const_mask_sse_2pix (uint32_t *dest, const uint32_t *src,
     const uint8_t *mask, int n)
 {
@@ -225,7 +226,7 @@ OIL_DEFINE_IMPL_FULL_WRAPPER(composite_in_argb_const_mask_sse_2pix,
     composite_in_argb_const_mask, OIL_IMPL_FLAG_SSE2);
 #endif
 
-static void
+SSE_FUNCTION static void
 composite_over_argb_sse_2pix (uint32_t *dest, const uint32_t *src, int n)
 {
   for (; n >= 2; n -= 2) {
@@ -251,7 +252,7 @@ composite_over_argb_sse_2pix (uint32_t *dest, const uint32_t *src, int n)
 OIL_DEFINE_IMPL_FULL_WRAPPER(composite_over_argb_sse_2pix, composite_over_argb,
     OIL_IMPL_FLAG_SSE2);
 
-static void
+SSE_FUNCTION static void
 composite_over_argb_const_src_sse_2pix (uint32_t *dest, const uint32_t *src,
     int n)
 {
@@ -281,7 +282,7 @@ composite_over_argb_const_src_sse_2pix (uint32_t *dest, const uint32_t *src,
 OIL_DEFINE_IMPL_FULL_WRAPPER(composite_over_argb_const_src_sse_2pix,
     composite_over_argb_const_src, OIL_IMPL_FLAG_SSE2);
 
-static void
+SSE_FUNCTION static void
 composite_in_over_argb_sse_2pix (uint32_t *dest, const uint32_t *src,
     const uint8_t *mask, int n)
 {
@@ -318,7 +319,7 @@ composite_in_over_argb_sse_2pix (uint32_t *dest, const uint32_t *src,
 OIL_DEFINE_IMPL_FULL_WRAPPER(composite_in_over_argb_sse_2pix, composite_in_over_argb,
     OIL_IMPL_FLAG_SSE2);
 
-static void
+SSE_FUNCTION static void
 composite_in_over_argb_const_src_sse_2pix (uint32_t *dest, const uint32_t *src,
     const uint8_t *mask, int n)
 {
@@ -357,7 +358,7 @@ composite_in_over_argb_const_src_sse_2pix (uint32_t *dest, const uint32_t *src,
 OIL_DEFINE_IMPL_FULL_WRAPPER(composite_in_over_argb_const_src_sse_2pix,
     composite_in_over_argb_const_src, OIL_IMPL_FLAG_SSE2);
 
-static void
+SSE_FUNCTION static void
 composite_in_over_argb_const_mask_sse_2pix (uint32_t *dest, const uint32_t *src,
     const uint8_t *mask, int n)
 {
@@ -396,7 +397,7 @@ composite_in_over_argb_const_mask_sse_2pix (uint32_t *dest, const uint32_t *src,
 OIL_DEFINE_IMPL_FULL_WRAPPER(composite_in_over_argb_const_mask_sse_2pix,
     composite_in_over_argb_const_mask, OIL_IMPL_FLAG_SSE2);
 
-static void
+SSE_FUNCTION static void
 composite_over_u8_sse_2pix (uint8_t *dest, const uint8_t *src, int n)
 {
   /* Initial operations to align the destination pointer */
diff --git a/liboil/sse/composite_sse_4pix.c b/liboil/sse/composite_sse_4pix.c
index 6e03ff9..12f4d2b 100644
--- a/liboil/sse/composite_sse_4pix.c
+++ b/liboil/sse/composite_sse_4pix.c
@@ -32,9 +32,7 @@
 #include <emmintrin.h>
 #include <liboil/liboilcolorspace.h>
 
-#include "sse_wrapper.h"
-
-
+#define SSE_FUNCTION __attribute__((force_align_arg_pointer))
 
 #ifdef ENABLE_BROKEN_IMPLS
 
@@ -67,7 +65,7 @@ static const struct _SSEData {
 /* Shuffles the given value such that the alpha for each pixel appears in each
  * channel of the pixel.
  */
-static inline __m128i
+SSE_FUNCTION static inline __m128i
 argb_A_sse2(__m128i a)
 {
 #if 0
@@ -92,7 +90,7 @@ argb_A_sse2(__m128i a)
 /* Multiplies the unpacked 16-bits-per-channel pixel data in a
  * channel-by-channel by b, and divides the result by 255, with rounding.
  */
-static inline __m128i
+SSE_FUNCTION static inline __m128i
 inner_muldiv_255_sse2(__m128i a, __m128i b)
 {
   __m128i ret;
@@ -106,7 +104,7 @@ inner_muldiv_255_sse2(__m128i a, __m128i b)
   return ret;
 }
 
-static inline __m128i
+SSE_FUNCTION static inline __m128i
 muldiv_255_sse2(__m128i a, __m128i b)
 {
   __m128i alow, blow, ahigh, bhigh, low, high;
@@ -120,25 +118,25 @@ muldiv_255_sse2(__m128i a, __m128i b)
   return _mm_packus_epi16(low, high);
 }
 
-static inline __m128i
+SSE_FUNCTION static inline __m128i
 negate_argb_sse2(__m128i a)
 {
   return _mm_xor_si128(a, MC(16xff));
 }
 
-static inline __m128i
+SSE_FUNCTION static inline __m128i
 load_argb_sse2(const uint32_t *src)
 {
   return _mm_loadu_si128((__m128i *)src);
 }
 
-static inline __m128i
+SSE_FUNCTION static inline __m128i
 set1_argb_sse2(uint32_t src)
 {
   return _mm_set1_epi32(src);
 }
 
-static inline __m128i
+SSE_FUNCTION static inline __m128i
 load_u8_mask(const uint8_t *m)
 {
   __m128i a;
@@ -148,24 +146,25 @@ load_u8_mask(const uint8_t *m)
   return a;
 }
 
-static inline __m128i
+SSE_FUNCTION static inline __m128i
 set1_u8_mask(uint8_t m)
 {
   return _mm_set1_epi8(m);
 }
 
-static void
+SSE_FUNCTION static void
 store_argb_sse2(uint32_t *dest, __m128i pix)
 {
   _mm_store_si128((__m128i *)dest, pix);
 }
 
-static __m128i over_argb_sse2(__m128i dest, __m128i src, __m128i srca)
+SSE_FUNCTION static __m128i 
+over_argb_sse2(__m128i dest, __m128i src, __m128i srca)
 {
   return _mm_adds_epu8(src, muldiv_255_sse2(dest, negate_argb_sse2(srca)));
 }
 
-static void
+SSE_FUNCTION static void
 composite_in_argb_sse (uint32_t *dest, const uint32_t *src, const uint8_t *mask,
     int n)
 {
@@ -202,7 +201,7 @@ composite_in_argb_sse (uint32_t *dest, const uint32_t *src, const uint8_t *mask,
 OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_argb_sse, composite_in_argb,
     OIL_IMPL_FLAG_SSE2);
 
-static void
+SSE_FUNCTION static void
 composite_in_argb_const_src_sse (uint32_t *dest, const uint32_t *src,
     const uint8_t *mask, int n)
 {
@@ -239,7 +238,7 @@ composite_in_argb_const_src_sse (uint32_t *dest, const uint32_t *src,
 OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_argb_const_src_sse,
     composite_in_argb_const_src, OIL_IMPL_FLAG_SSE2);
 
-static void
+SSE_FUNCTION static void
 composite_in_argb_const_mask_sse (uint32_t *dest, const uint32_t *src,
     const uint8_t *mask, int n)
 {
@@ -276,7 +275,7 @@ composite_in_argb_const_mask_sse (uint32_t *dest, const uint32_t *src,
 OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_argb_const_mask_sse,
     composite_in_argb_const_mask, OIL_IMPL_FLAG_SSE2);
 
-static void
+SSE_FUNCTION static void
 composite_over_argb_sse (uint32_t *dest, const uint32_t *src, int n)
 {
   for (; ((long)dest & 15) && (n > 0); n--) {
@@ -311,7 +310,7 @@ composite_over_argb_sse (uint32_t *dest, const uint32_t *src, int n)
 OIL_DEFINE_IMPL_FULL_WRAPPER (composite_over_argb_sse, composite_over_argb,
     OIL_IMPL_FLAG_SSE2);
 
-static void
+SSE_FUNCTION static void
 composite_over_argb_const_src_sse (uint32_t *dest, const uint32_t *src, int n)
 {
   __m128i s, sa;
@@ -348,7 +347,7 @@ composite_over_argb_const_src_sse (uint32_t *dest, const uint32_t *src, int n)
 OIL_DEFINE_IMPL_FULL_WRAPPER (composite_over_argb_const_src_sse,
     composite_over_argb_const_src, OIL_IMPL_FLAG_SSE2);
 
-static void
+SSE_FUNCTION static void
 composite_in_over_argb_sse (uint32_t *dest, const uint32_t *src,
     const uint8_t *mask, int n)
 {
@@ -401,7 +400,7 @@ composite_in_over_argb_sse (uint32_t *dest, const uint32_t *src,
 OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_over_argb_sse, composite_in_over_argb,
     OIL_IMPL_FLAG_SSE2);
 
-static void
+SSE_FUNCTION static void
 composite_in_over_argb_const_src_sse (uint32_t *dest, const uint32_t *src,
     const uint8_t *mask, int n)
 {
@@ -456,7 +455,7 @@ composite_in_over_argb_const_src_sse (uint32_t *dest, const uint32_t *src,
 OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_over_argb_const_src_sse,
     composite_in_over_argb_const_src, OIL_IMPL_FLAG_SSE2);
 
-static void
+SSE_FUNCTION static void
 composite_in_over_argb_const_mask_sse (uint32_t *dest, const uint32_t *src,
     const uint8_t *mask, int n)
 {
@@ -511,7 +510,7 @@ composite_in_over_argb_const_mask_sse (uint32_t *dest, const uint32_t *src,
 OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_over_argb_const_mask_sse,
     composite_in_over_argb_const_mask, OIL_IMPL_FLAG_SSE2);
 
-static void
+SSE_FUNCTION static void
 composite_over_u8_sse (uint8_t *dest, const uint8_t *src, int n)
 {
   /* Initial operations to align the destination pointer */
diff --git a/liboil/sse/copy_sse.c b/liboil/sse/copy_sse.c
index b695bc4..5509eae 100644
--- a/liboil/sse/copy_sse.c
+++ b/liboil/sse/copy_sse.c
@@ -31,7 +31,9 @@
 #include <liboil/liboilfunction.h>
 #include <emmintrin.h>
 
-static void
+#define SSE_FUNCTION __attribute__((force_align_arg_pointer))
+
+SSE_FUNCTION static void
 copy_u8_sse (uint8_t *dest, const uint8_t *src, int n)
 {
   for (; ((long)dest & 15) && (n > 0); n--) {
@@ -48,7 +50,7 @@ copy_u8_sse (uint8_t *dest, const uint8_t *src, int n)
 }
 OIL_DEFINE_IMPL_FULL (copy_u8_sse, copy_u8, OIL_IMPL_FLAG_SSE2);
 
-static void
+SSE_FUNCTION static void
 copy_u8_sse_unroll2 (uint8_t *dest, const uint8_t *src, int n)
 {
   for (; ((long)dest & 15) && (n > 0); n--) {
diff --git a/liboil/sse/math_sse.c b/liboil/sse/math_sse.c
index e5d238d..0b70b42 100644
--- a/liboil/sse/math_sse.c
+++ b/liboil/sse/math_sse.c
@@ -32,7 +32,9 @@
 #include <emmintrin.h>
 #include <xmmintrin.h>
 
-static void
+#define SSE_FUNCTION __attribute__((force_align_arg_pointer))
+
+SSE_FUNCTION static void
 add_f32_sse (float *dest, float *src1, float *src2, int n)
 {
   /* Initial operations to align the destination pointer */
@@ -55,7 +57,7 @@ add_f32_sse (float *dest, float *src1, float *src2, int n)
 }
 OIL_DEFINE_IMPL_FULL (add_f32_sse, add_f32, OIL_IMPL_FLAG_SSE);
 
-static void
+SSE_FUNCTION static void
 add_f64_sse2 (double *dest, double *src1, double *src2, int n)
 {
   __m128d xmm0, xmm1;
@@ -80,7 +82,7 @@ add_f64_sse2 (double *dest, double *src1, double *src2, int n)
 }
 OIL_DEFINE_IMPL_FULL (add_f64_sse2, add_f64, OIL_IMPL_FLAG_SSE2);
 
-static void
+SSE_FUNCTION static void
 add_f64_sse2_unroll (double *dest, double *src1, double *src2, int n)
 {
   __m128d xmm0, xmm1;
@@ -120,7 +122,7 @@ add_f64_sse2_unroll (double *dest, double *src1, double *src2, int n)
 }
 OIL_DEFINE_IMPL_FULL (add_f64_sse2_unroll, add_f64, OIL_IMPL_FLAG_SSE2);
 
-static void
+SSE_FUNCTION static void
 subtract_f32_sse (float *dest, float *src1, float *src2, int n)
 {
   /* Initial operations to align the destination pointer */
@@ -143,7 +145,7 @@ subtract_f32_sse (float *dest, float *src1, float *src2, int n)
 }
 OIL_DEFINE_IMPL_FULL (subtract_f32_sse, subtract_f32, OIL_IMPL_FLAG_SSE);
 
-static void
+SSE_FUNCTION static void
 multiply_f32_sse (float *dest, float *src1, float *src2, int n)
 {
   /* Initial operations to align the destination pointer */
@@ -166,7 +168,7 @@ multiply_f32_sse (float *dest, float *src1, float *src2, int n)
 }
 OIL_DEFINE_IMPL_FULL (multiply_f32_sse, multiply_f32, OIL_IMPL_FLAG_SSE);
 
-static void
+SSE_FUNCTION static void
 divide_f32_sse (float *dest, float *src1, float *src2, int n)
 {
   /* Initial operations to align the destination pointer */
@@ -189,7 +191,7 @@ divide_f32_sse (float *dest, float *src1, float *src2, int n)
 }
 OIL_DEFINE_IMPL_FULL (divide_f32_sse, divide_f32, OIL_IMPL_FLAG_SSE);
 
-static void
+SSE_FUNCTION static void
 minimum_f32_sse (float *dest, float *src1, float *src2, int n)
 {
   /* Initial operations to align the destination pointer */
@@ -216,7 +218,7 @@ minimum_f32_sse (float *dest, float *src1, float *src2, int n)
 }
 OIL_DEFINE_IMPL_FULL (minimum_f32_sse, minimum_f32, OIL_IMPL_FLAG_SSE);
 
-static void
+SSE_FUNCTION static void
 maximum_f32_sse (float *dest, float *src1, float *src2, int n)
 {
   /* Initial operations to align the destination pointer */
@@ -243,7 +245,7 @@ maximum_f32_sse (float *dest, float *src1, float *src2, int n)
 }
 OIL_DEFINE_IMPL_FULL (maximum_f32_sse, maximum_f32, OIL_IMPL_FLAG_SSE);
 
-static void
+SSE_FUNCTION static void
 inverse_f32_sse (float *dest, float *src1, int n)
 {
   /* Initial operations to align the destination pointer */
@@ -268,7 +270,7 @@ inverse_f32_sse (float *dest, float *src1, int n)
 }
 OIL_DEFINE_IMPL_FULL (inverse_f32_sse, inverse_f32, OIL_IMPL_FLAG_SSE);
 
-static void
+SSE_FUNCTION static void
 negative_f32_sse (float *dest, float *src1, int n)
 {
   /* Initial operations to align the destination pointer */
@@ -290,7 +292,7 @@ negative_f32_sse (float *dest, float *src1, int n)
 }
 OIL_DEFINE_IMPL_FULL (negative_f32_sse, negative_f32, OIL_IMPL_FLAG_SSE);
 
-static void
+SSE_FUNCTION static void
 scalaradd_f32_ns_sse (float *dest, float *src1, float *val, int n)
 {
   __m128 xmm1;
@@ -314,7 +316,7 @@ scalaradd_f32_ns_sse (float *dest, float *src1, float *val, int n)
 }
 OIL_DEFINE_IMPL_FULL (scalaradd_f32_ns_sse, scalaradd_f32_ns, OIL_IMPL_FLAG_SSE);
 
-static void
+SSE_FUNCTION static void
 scalarmultiply_f32_ns_sse (float *dest, float *src1, float *val, int n)
 {
   __m128 xmm1;
@@ -338,7 +340,7 @@ scalarmultiply_f32_ns_sse (float *dest, float *src1, float *val, int n)
 }
 OIL_DEFINE_IMPL_FULL (scalarmultiply_f32_ns_sse, scalarmultiply_f32_ns, OIL_IMPL_FLAG_SSE);
 
-static void
+SSE_FUNCTION static void
 scalarmultiply_f64_ns_sse2 (double *dest, double *src1, double *val, int n)
 {
   __m128d xmm1;
diff --git a/liboil/sse/math_sse_unroll2.c b/liboil/sse/math_sse_unroll2.c
index 51dca09..cd4f55f 100644
--- a/liboil/sse/math_sse_unroll2.c
+++ b/liboil/sse/math_sse_unroll2.c
@@ -32,7 +32,9 @@
 #include <emmintrin.h>
 #include <xmmintrin.h>
 
-static void
+#define SSE_FUNCTION __attribute__((force_align_arg_pointer))
+
+SSE_FUNCTION static void
 add_f32_sse_unroll2 (float *dest, float *src1, float *src2, int n)
 {
   /* Initial operations to align the destination pointer */
@@ -59,7 +61,7 @@ add_f32_sse_unroll2 (float *dest, float *src1, float *src2, int n)
 }
 OIL_DEFINE_IMPL_FULL (add_f32_sse_unroll2, add_f32, OIL_IMPL_FLAG_SSE);
 
-static void
+SSE_FUNCTION static void
 subtract_f32_sse_unroll2 (float *dest, float *src1, float *src2, int n)
 {
   /* Initial operations to align the destination pointer */
@@ -86,7 +88,7 @@ subtract_f32_sse_unroll2 (float *dest, float *src1, float *src2, int n)
 }
 OIL_DEFINE_IMPL_FULL (subtract_f32_sse_unroll2, subtract_f32, OIL_IMPL_FLAG_SSE);
 
-static void
+SSE_FUNCTION static void
 multiply_f32_sse_unroll2 (float *dest, float *src1, float *src2, int n)
 {
   /* Initial operations to align the destination pointer */
@@ -113,7 +115,7 @@ multiply_f32_sse_unroll2 (float *dest, float *src1, float *src2, int n)
 }
 OIL_DEFINE_IMPL_FULL (multiply_f32_sse_unroll2, multiply_f32, OIL_IMPL_FLAG_SSE);
 
-static void
+SSE_FUNCTION static void
 divide_f32_sse_unroll2 (float *dest, float *src1, float *src2, int n)
 {
   /* Initial operations to align the destination pointer */
@@ -140,7 +142,7 @@ divide_f32_sse_unroll2 (float *dest, float *src1, float *src2, int n)
 }
 OIL_DEFINE_IMPL_FULL (divide_f32_sse_unroll2, divide_f32, OIL_IMPL_FLAG_SSE);
 
-static void
+SSE_FUNCTION static void
 minimum_f32_sse_unroll2 (float *dest, float *src1, float *src2, int n)
 {
   /* Initial operations to align the destination pointer */
@@ -171,7 +173,7 @@ minimum_f32_sse_unroll2 (float *dest, float *src1, float *src2, int n)
 }
 OIL_DEFINE_IMPL_FULL (minimum_f32_sse_unroll2, minimum_f32, OIL_IMPL_FLAG_SSE);
 
-static void
+SSE_FUNCTION static void
 maximum_f32_sse_unroll2 (float *dest, float *src1, float *src2, int n)
 {
   /* Initial operations to align the destination pointer */
@@ -202,7 +204,7 @@ maximum_f32_sse_unroll2 (float *dest, float *src1, float *src2, int n)
 }
 OIL_DEFINE_IMPL_FULL (maximum_f32_sse_unroll2, maximum_f32, OIL_IMPL_FLAG_SSE);
 
-static void
+SSE_FUNCTION static void
 inverse_f32_sse_unroll2 (float *dest, float *src1, int n)
 {
   /* Initial operations to align the destination pointer */
@@ -231,7 +233,7 @@ inverse_f32_sse_unroll2 (float *dest, float *src1, int n)
 }
 OIL_DEFINE_IMPL_FULL (inverse_f32_sse_unroll2, inverse_f32, OIL_IMPL_FLAG_SSE);
 
-static void
+SSE_FUNCTION static void
 negative_f32_sse_unroll2 (float *dest, float *src1, int n)
 {
   /* Initial operations to align the destination pointer */
@@ -257,7 +259,7 @@ negative_f32_sse_unroll2 (float *dest, float *src1, int n)
 }
 OIL_DEFINE_IMPL_FULL (negative_f32_sse_unroll2, negative_f32, OIL_IMPL_FLAG_SSE);
 
-static void
+SSE_FUNCTION static void
 scalaradd_f32_ns_sse_unroll2 (float *dest, float *src1, float *val, int n)
 {
   __m128 xmm1;
@@ -284,7 +286,7 @@ scalaradd_f32_ns_sse_unroll2 (float *dest, float *src1, float *val, int n)
 }
 OIL_DEFINE_IMPL_FULL (scalaradd_f32_ns_sse_unroll2, scalaradd_f32_ns, OIL_IMPL_FLAG_SSE);
 
-static void
+SSE_FUNCTION static void
 scalarmultiply_f32_ns_sse_unroll2 (float *dest, float *src1, float *val, int n)
 {
   __m128 xmm1;
@@ -311,7 +313,7 @@ scalarmultiply_f32_ns_sse_unroll2 (float *dest, float *src1, float *val, int n)
 }
 OIL_DEFINE_IMPL_FULL (scalarmultiply_f32_ns_sse_unroll2, scalarmultiply_f32_ns, OIL_IMPL_FLAG_SSE);
 
-static void
+SSE_FUNCTION static void
 scalarmultiply_f64_ns_sse2_unroll2 (double *dest, double *src1, double *val, int n)
 {
   __m128d xmm1;
diff --git a/liboil/sse/multsum_sse.c b/liboil/sse/multsum_sse.c
index 37238a5..382dd60 100644
--- a/liboil/sse/multsum_sse.c
+++ b/liboil/sse/multsum_sse.c
@@ -5,6 +5,8 @@
 #include <liboil/liboilfunction.h>
 #include <emmintrin.h>
 
+#define SSE_FUNCTION __attribute__((force_align_arg_pointer))
+
 #define MULTSUM_SSE2_NSTRIDED(i) { \
   t1 = _mm_load_pd(&OIL_GET(src1, i, double)); \
   t2 = _mm_load_pd(&OIL_GET(src2, i, double)); \
@@ -29,7 +31,7 @@
 
 
 #ifdef ENABLE_BROKEN_IMPLS
-static void
+SSE_FUNCTION static void
 multsum_f64_sse2_unroll4(double *dest,
      const double *src1, int sstr1,
      const double *src2, int sstr2,
diff --git a/liboil/sse/sad8x8_sse.c b/liboil/sse/sad8x8_sse.c
index 8795200..3c7615c 100644
--- a/liboil/sse/sad8x8_sse.c
+++ b/liboil/sse/sad8x8_sse.c
@@ -31,7 +31,7 @@
 #include <liboil/liboilfunction.h>
 #include <emmintrin.h>
 
-#include "sse_wrapper.h"
+#define SSE_FUNCTION __attribute__((force_align_arg_pointer))
 
 #ifdef ENABLE_BROKEN_IMPLS
 union m128_int {
@@ -40,7 +40,7 @@ union m128_int {
   uint16_t s[8];
 };
 
-static void
+SSE_FUNCTION static void
 sad8x8_u8_sse (uint32_t *dest, uint8_t *src1, int sstr1, uint8_t *src2,
     int sstr2)
 {
diff --git a/liboil/sse/splat_sse.c b/liboil/sse/splat_sse.c
index 14593a6..a6c0be7 100644
--- a/liboil/sse/splat_sse.c
+++ b/liboil/sse/splat_sse.c
@@ -31,7 +31,9 @@
 #include <liboil/liboilfunction.h>
 #include <emmintrin.h>
 
-static void
+#define SSE_FUNCTION __attribute__((force_align_arg_pointer))
+
+SSE_FUNCTION static void
 splat_u32_ns_sse (uint32_t *dest, const uint32_t *param, int n)
 {
   __m128i v;
@@ -51,7 +53,7 @@ splat_u32_ns_sse (uint32_t *dest, const uint32_t *param, int n)
 }
 OIL_DEFINE_IMPL_FULL (splat_u32_ns_sse, splat_u32_ns, OIL_IMPL_FLAG_SSE2);
 
-static void
+SSE_FUNCTION static void
 splat_u32_ns_sse_unroll2 (uint32_t *dest, const uint32_t *param, int n)
 {
   __m128i v;
@@ -76,7 +78,7 @@ splat_u32_ns_sse_unroll2 (uint32_t *dest, const uint32_t *param, int n)
 }
 OIL_DEFINE_IMPL_FULL (splat_u32_ns_sse_unroll2, splat_u32_ns, OIL_IMPL_FLAG_SSE2);
 
-static void
+SSE_FUNCTION static void
 splat_u8_ns_sse (uint8_t *dest, const uint8_t *param, int n)
 {
   __m128i v;
@@ -96,7 +98,7 @@ splat_u8_ns_sse (uint8_t *dest, const uint8_t *param, int n)
 }
 OIL_DEFINE_IMPL_FULL (splat_u8_ns_sse, splat_u8_ns, OIL_IMPL_FLAG_SSE2);
 
-static void
+SSE_FUNCTION static void
 splat_u8_ns_sse_unroll2 (uint8_t *dest, const uint8_t *param, int n)
 {
   __m128i v;


More information about the Liboil-commit mailing list