[Beignet] [PATCH 05/18] Add the atomic functions into libocl.

Tue Aug 12 00:31:45 PDT 2014

From: Junyan He <junyan.he at linux.intel.com>

Signed-off-by: Junyan He <junyan.he at linux.intel.com>
---
 backend/src/libocl/Makefile.in           |  15 +-
 backend/src/libocl/include/ocl_atom.h    |  84 ++++
 backend/src/libocl/lib/ocl_atom.cl       | 122 ++++++
 backend/src/libocl/script/gen_convert.sh | 653 +++++++++++++++++++++++++++++++
 4 files changed, 871 insertions(+), 3 deletions(-)
 create mode 100644 backend/src/libocl/include/ocl_atom.h
 create mode 100644 backend/src/libocl/lib/ocl_atom.cl
 create mode 100755 backend/src/libocl/script/gen_convert.sh

diff --git a/backend/src/libocl/Makefile.in b/backend/src/libocl/Makefile.in
index 0264391..06adc08 100644
--- a/backend/src/libocl/Makefile.in
+++ b/backend/src/libocl/Makefile.in
@@ -3,11 +3,11 @@
 HEADER_INSTALL_PREFIX=@OCL_HEADER_DIR@
 BITCODE_INSTALL_PREFIX=@OCL_BITCODE_DIR@
 
-GENERATED_FILES=ocl_as.cl
-GENERATED_HEADERS=ocl_defines.h ocl_as.h
+GENERATED_FILES=ocl_as.cl ocl_convert.cl
+GENERATED_HEADERS=ocl_defines.h ocl_as.h ocl_convert.h
 GENERATED_CL_SRCS=$(addprefix lib/, $(GENERATED_FILES))
 GENERATED_CL_HEADERS=$(addprefix include/, $(GENERATED_HEADERS))
-CL_FILE_NAMES=ocl_workitem.cl $(GENERATED_FILES)
+CL_FILE_NAMES=ocl_workitem.cl ocl_atom.cl $(GENERATED_FILES)
 LL_FILE_NAMES=
 CL_SRCS=$(addprefix lib/, $(CL_FILE_NAMES))
 LL_SRCS=$(addprefix lib/, $(LL_FILE_NAMES))
@@ -30,6 +30,15 @@ include/ocl_as.h:script/gen_common.inc
 	@echo "Generate the header:  $@"
 	@script/gen_as.sh -p > $@
 
+
+lib/ocl_convert.cl:script/gen_common.inc script/gen_convert.sh # rebuild when the generator changes too; a failed run removes the partial output
+	@echo "Generate the source:  $@"
+	@script/gen_convert.sh > $@ || { rm -f $@; exit 1; }
+
+include/ocl_convert.h:script/gen_common.inc script/gen_convert.sh # rebuild when the generator changes too; a failed run removes the partial output
+	@echo "Generate the header:  $@"
+	@script/gen_convert.sh -p > $@ || { rm -f $@; exit 1; }
+
 include/ocl_defines.h:include/ocl_defines.inh
 	@echo "Generate the header:  $@"
 	@rm -f $@
diff --git a/backend/src/libocl/include/ocl_atom.h b/backend/src/libocl/include/ocl_atom.h
new file mode 100644
index 0000000..b2cfcbf
--- /dev/null
+++ b/backend/src/libocl/include/ocl_atom.h
@@ -0,0 +1,84 @@
+#ifndef __OCL_ATOM_H__
+#define __OCL_ATOM_H__
+#include "ocl_types.h"
+
+/////////////////////////////////////////////////////////////////////////////
+// Atomic functions
+/////////////////////////////////////////////////////////////////////////////
+
+OVERLOADABLE uint atomic_add(volatile __global uint *p, uint val);
+OVERLOADABLE uint atomic_add(volatile __local uint *p, uint val);
+OVERLOADABLE int atomic_add(volatile __global int *p, int val);
+OVERLOADABLE int atomic_add(volatile __local int *p, int val);
+
+OVERLOADABLE uint atomic_sub(volatile __global uint *p, uint val);
+OVERLOADABLE uint atomic_sub(volatile __local uint *p, uint val);
+OVERLOADABLE int atomic_sub(volatile __global int *p, int val);
+OVERLOADABLE int atomic_sub(volatile __local int *p, int val);
+
+OVERLOADABLE uint atomic_and(volatile __global uint *p, uint val);
+OVERLOADABLE uint atomic_and(volatile __local uint *p, uint val);
+OVERLOADABLE int atomic_and(volatile __global int *p, int val);
+OVERLOADABLE int atomic_and(volatile __local int *p, int val);
+
+OVERLOADABLE uint atomic_or(volatile __global uint *p, uint val);
+OVERLOADABLE uint atomic_or(volatile __local uint *p, uint val);
+OVERLOADABLE int atomic_or(volatile __global int *p, int val);
+OVERLOADABLE int atomic_or(volatile __local int *p, int val);
+
+OVERLOADABLE uint atomic_xor(volatile __global uint *p, uint val);
+OVERLOADABLE uint atomic_xor(volatile __local uint *p, uint val);
+OVERLOADABLE int atomic_xor(volatile __global int *p, int val);
+OVERLOADABLE int atomic_xor(volatile __local int *p, int val);
+
+OVERLOADABLE uint atomic_xchg(volatile __global uint *p, uint val);
+OVERLOADABLE uint atomic_xchg(volatile __local uint *p, uint val);
+OVERLOADABLE int atomic_xchg(volatile __global int *p, int val);
+OVERLOADABLE int atomic_xchg(volatile __local int *p, int val);
+
+OVERLOADABLE int atomic_min(volatile __global int *p, int val);
+OVERLOADABLE int atomic_min(volatile __local int *p, int val);
+
+OVERLOADABLE int atomic_max(volatile __global int *p, int val);
+OVERLOADABLE int atomic_max(volatile __local int *p, int val);
+
+OVERLOADABLE uint atomic_min(volatile __global uint *p, uint val);
+OVERLOADABLE uint atomic_min(volatile __local uint *p, uint val);
+
+OVERLOADABLE uint atomic_max(volatile __global uint *p, uint val);
+OVERLOADABLE uint atomic_max(volatile __local uint *p, uint val);
+
+OVERLOADABLE float atomic_xchg (volatile __global float *p, float val);
+OVERLOADABLE float atomic_xchg (volatile __local float *p, float val);
+
+OVERLOADABLE uint atomic_inc (volatile __global uint *p);
+OVERLOADABLE uint atomic_inc (volatile __local uint *p);
+OVERLOADABLE int atomic_inc (volatile __global int *p);
+OVERLOADABLE int atomic_inc (volatile __local int *p);
+
+OVERLOADABLE uint atomic_dec (volatile __global uint *p);
+OVERLOADABLE uint atomic_dec (volatile __local uint *p);
+OVERLOADABLE int atomic_dec (volatile __global int *p);
+OVERLOADABLE int atomic_dec (volatile __local int *p);
+
+OVERLOADABLE uint atomic_cmpxchg (volatile __global uint *p, uint cmp, uint val);
+OVERLOADABLE uint atomic_cmpxchg (volatile __local uint *p, uint cmp, uint val);
+OVERLOADABLE int atomic_cmpxchg (volatile __global int *p, int cmp, int val);
+OVERLOADABLE int atomic_cmpxchg (volatile __local int *p, int cmp, int val);
+
+
+// XXX for conformance test: atom_xxx is the OpenCL spec 1.0 spelling of these
+// functions, so alias every one of them (atom_add included) to atomic_xxx.
+#define atom_add atomic_add
+#define atom_sub atomic_sub
+#define atom_and atomic_and
+#define atom_or atomic_or
+#define atom_xor atomic_xor
+#define atom_xchg atomic_xchg
+#define atom_min atomic_min
+#define atom_max atomic_max
+#define atom_inc atomic_inc
+#define atom_dec atomic_dec
+#define atom_cmpxchg atomic_cmpxchg
+
+#endif  /* __OCL_ATOM_H__ */
diff --git a/backend/src/libocl/lib/ocl_atom.cl b/backend/src/libocl/lib/ocl_atom.cl
new file mode 100644
index 0000000..ad09d9e
--- /dev/null
+++ b/backend/src/libocl/lib/ocl_atom.cl
@@ -0,0 +1,122 @@
+#include "ocl_atom.h"
+#include "ocl_as.h"
+
+OVERLOADABLE uint __gen_ocl_atomic_add(__global uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_add(__local uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_sub(__global uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_sub(__local uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_and(__global uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_and(__local uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_or(__global uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_or(__local uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_xor(__global uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_xor(__local uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_xchg(__global uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_xchg(__local uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_inc(__global uint *p);
+OVERLOADABLE uint __gen_ocl_atomic_inc(__local uint *p);
+OVERLOADABLE uint __gen_ocl_atomic_dec(__global uint *p);
+OVERLOADABLE uint __gen_ocl_atomic_dec(__local uint *p);
+OVERLOADABLE uint __gen_ocl_atomic_cmpxchg(__global uint *p, uint cmp, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_cmpxchg(__local uint *p, uint cmp, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_imin(__global uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_imin(__local uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_imax(__global uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_imax(__local uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_umin(__global uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_umin(__local uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_umax(__global uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_umax(__local uint *p, uint val);
+
+#define DECL_ATOMIC_OP_SPACE(NAME, TYPE, SPACE, PREFIX)                        \
+  OVERLOADABLE TYPE atomic_##NAME (volatile SPACE TYPE *p, TYPE val) { \
+    return (TYPE)__gen_ocl_##PREFIX##NAME((SPACE uint *)p, val);            \
+  }
+
+#define DECL_ATOMIC_OP_TYPE(NAME, TYPE, PREFIX) \
+  DECL_ATOMIC_OP_SPACE(NAME, TYPE, __global, PREFIX) \
+  DECL_ATOMIC_OP_SPACE(NAME, TYPE, __local, PREFIX)
+
+#define DECL_ATOMIC_OP(NAME) \
+  DECL_ATOMIC_OP_TYPE(NAME, uint, atomic_)        \
+  DECL_ATOMIC_OP_TYPE(NAME, int, atomic_)
+
+DECL_ATOMIC_OP(add)
+DECL_ATOMIC_OP(sub)
+DECL_ATOMIC_OP(and)
+DECL_ATOMIC_OP(or)
+DECL_ATOMIC_OP(xor)
+DECL_ATOMIC_OP(xchg)
+DECL_ATOMIC_OP_TYPE(min, int, atomic_i)
+DECL_ATOMIC_OP_TYPE(max, int, atomic_i)
+DECL_ATOMIC_OP_TYPE(min, uint, atomic_u)
+DECL_ATOMIC_OP_TYPE(max, uint, atomic_u)
+
+#undef DECL_ATOMIC_OP_SPACE
+
+#define DECL_ATOMIC_OP_SPACE(NAME, TYPE, SPACE, PREFIX)                        \
+  OVERLOADABLE TYPE atomic_##NAME (volatile SPACE TYPE *p, TYPE val) { \
+    return as_float(__gen_ocl_##PREFIX##NAME((SPACE uint *)p, as_uint(val))); \
+  }
+DECL_ATOMIC_OP_SPACE(xchg, float, __global, atomic_)
+DECL_ATOMIC_OP_SPACE(xchg, float, __local, atomic_)
+
+#undef DECL_ATOMIC_OP
+#undef DECL_ATOMIC_OP_TYPE
+#undef DECL_ATOMIC_OP_SPACE
+
+#define DECL_ATOMIC_OP_SPACE(NAME, TYPE, SPACE) \
+  OVERLOADABLE TYPE atomic_##NAME (volatile SPACE TYPE *p) { \
+    return (TYPE)__gen_ocl_atomic_##NAME((SPACE uint *)p); \
+  }
+
+#define DECL_ATOMIC_OP_TYPE(NAME, TYPE) \
+  DECL_ATOMIC_OP_SPACE(NAME, TYPE, __global) \
+  DECL_ATOMIC_OP_SPACE(NAME, TYPE, __local)
+
+#define DECL_ATOMIC_OP(NAME) \
+  DECL_ATOMIC_OP_TYPE(NAME, uint) \
+  DECL_ATOMIC_OP_TYPE(NAME, int)
+
+DECL_ATOMIC_OP(inc)
+DECL_ATOMIC_OP(dec)
+
+#undef DECL_ATOMIC_OP
+#undef DECL_ATOMIC_OP_TYPE
+#undef DECL_ATOMIC_OP_SPACE
+
+#define DECL_ATOMIC_OP_SPACE(NAME, TYPE, SPACE)  \
+  OVERLOADABLE TYPE atomic_##NAME (volatile SPACE TYPE *p, TYPE cmp, TYPE val) { \
+    return (TYPE)__gen_ocl_atomic_##NAME((SPACE uint *)p, (uint)cmp, (uint)val); \
+  }
+
+#define DECL_ATOMIC_OP_TYPE(NAME, TYPE) \
+  DECL_ATOMIC_OP_SPACE(NAME, TYPE, __global) \
+  DECL_ATOMIC_OP_SPACE(NAME, TYPE, __local)
+
+#define DECL_ATOMIC_OP(NAME) \
+  DECL_ATOMIC_OP_TYPE(NAME, uint) \
+  DECL_ATOMIC_OP_TYPE(NAME, int)
+
+DECL_ATOMIC_OP(cmpxchg)
+
+#undef DECL_ATOMIC_OP
+#undef DECL_ATOMIC_OP_TYPE
+#undef DECL_ATOMIC_OP_SPACE
+
+// XXX for conformance test
+// The following atom_xxx api is on OpenCL spec 1.0.
+// But the conformance test suite will test them anyway.
+#define atom_add atomic_add
+#define atom_sub atomic_sub
+#define atom_and atomic_and
+#define atom_or atomic_or
+#define atom_xor atomic_xor
+#define atom_xchg atomic_xchg
+#define atom_min atomic_min
+#define atom_max atomic_max
+#define atom_inc atomic_inc
+#define atom_dec atomic_dec
+#define atom_cmpxchg atomic_cmpxchg
+
+
diff --git a/backend/src/libocl/script/gen_convert.sh b/backend/src/libocl/script/gen_convert.sh
new file mode 100755
index 0000000..a6d1a06
--- /dev/null
+++ b/backend/src/libocl/script/gen_convert.sh
@@ -0,0 +1,653 @@
+#! /bin/sh -e
+
+if [ $1"a" = "-pa" ]; then
+    echo "#ifndef __OCL_CONVERT_H__"
+    echo "#define __OCL_CONVERT_H__"
+    echo "#include \"ocl_types.h\""
+    echo
+else
+    echo "#include \"ocl_convert.h\""
+    echo
+fi
+
+# Must be run from the parent of script/ (the libocl directory): the next line sources ./script/gen_common.inc by relative path.
+. ./script/gen_common.inc
+
+# For all vector lengths and types, generate conversion functions
+for vector_length in $VECTOR_LENGTHS; do
+    if test $vector_length -eq 1; then
+	for ftype in $TYPES; do
+	    fbasetype=`IFS=:; set -- dummy $ftype; echo $2`
+	    for ttype in $TYPES; do
+		tbasetype=`IFS=:; set -- dummy $ttype; echo $2`
+		if [ $1"a" = "-pa" ]; then
+		    echo "OVERLOADABLE $tbasetype convert_$tbasetype($fbasetype v);"
+		else
+		    echo "OVERLOADABLE $tbasetype convert_$tbasetype($fbasetype v) {"
+		    echo "  return ($tbasetype)v;"
+		    echo "}"
+		    echo
+		fi
+	    done
+	done
+    else
+	for ftype in $TYPES; do
+	    fbasetype=`IFS=:; set -- dummy $ftype; echo $2`
+	    for ttype in $TYPES; do
+		tbasetype=`IFS=:; set -- dummy $ttype; echo $2`
+		if test $fbasetype = $tbasetype; then
+		    if test $vector_length -gt 1; then
+			fvectortype=$fbasetype$vector_length
+			tvectortype=$tbasetype$vector_length
+			if [ $1"a" = "-pa" ]; then
+			    echo "OVERLOADABLE $tvectortype convert_$tvectortype($fvectortype v);"
+			else
+			    echo "OVERLOADABLE $tvectortype convert_$tvectortype($fvectortype v) { return v; }"
+			fi
+		    else
+			if [ $1"a" = "-pa" ]; then
+			    echo "OVERLOADABLE $tbasetype convert_$tbasetype($fbasetype v);"
+			else
+			    echo "OVERLOADABLE $tbasetype convert_$tbasetype($fbasetype v) { return v; }"
+			fi
+		    fi
+		    continue
+		fi
+		fvectortype=$fbasetype$vector_length
+		tvectortype=$tbasetype$vector_length
+		construct="($tbasetype)(v.s0)"
+		if test $vector_length -gt 1; then
+		    construct="$construct, ($tbasetype)(v.s1)"
+		fi
+		if test $vector_length -gt 2; then
+		    construct="$construct, ($tbasetype)(v.s2)"
+		fi
+		if test $vector_length -gt 3; then
+		    construct="$construct, ($tbasetype)(v.s3)"
+		fi
+		if test $vector_length -gt 4; then
+		    construct="$construct, ($tbasetype)(v.s4)"
+		    construct="$construct, ($tbasetype)(v.s5)"
+		    construct="$construct, ($tbasetype)(v.s6)"
+		    construct="$construct, ($tbasetype)(v.s7)"
+		fi
+		if test $vector_length -gt 8; then
+		    construct="$construct, ($tbasetype)(v.s8)"
+		    construct="$construct, ($tbasetype)(v.s9)"
+		    construct="$construct, ($tbasetype)(v.sA)"
+		    construct="$construct, ($tbasetype)(v.sB)"
+		    construct="$construct, ($tbasetype)(v.sC)"
+		    construct="$construct, ($tbasetype)(v.sD)"
+		    construct="$construct, ($tbasetype)(v.sE)"
+		    construct="$construct, ($tbasetype)(v.sF)"
+		fi
+
+		if [ $1"a" = "-pa" ]; then
+		    echo "OVERLOADABLE $tvectortype convert_$tvectortype($fvectortype v);"
+		else
+		    echo "OVERLOADABLE $tvectortype convert_$tvectortype($fvectortype v) {"
+		    echo "  return ($tvectortype)($construct);"
+		    echo "}"
+		    echo
+		fi
+	    done
+	done
+    fi
+done
+
+echo '
+#define DEF(DSTTYPE, SRCTYPE) \
+OVERLOADABLE DSTTYPE convert_ ## DSTTYPE ## _sat(SRCTYPE x);
+DEF(char, uchar);
+DEF(char, short);
+DEF(char, ushort);
+DEF(char, int);
+DEF(char, uint);
+DEF(char, float);
+DEF(uchar, char);
+DEF(uchar, short);
+DEF(uchar, ushort);
+DEF(uchar, int);
+DEF(uchar, uint);
+DEF(uchar, float);
+DEF(short, ushort);
+DEF(short, int);
+DEF(short, uint);
+DEF(short, float);
+DEF(ushort, short);
+DEF(ushort, int);
+DEF(ushort, uint);
+DEF(ushort, float);
+DEF(int, uint);
+DEF(int, float);
+DEF(uint, int);
+DEF(uint, float);
+#undef DEF
+'
+
+if [ $1"a" = "-pa" ]; then
+    echo "#define DEF(DSTTYPE, SRCTYPE, MIN, MAX)  OVERLOADABLE DSTTYPE convert_ ## DSTTYPE ## _sat(SRCTYPE x);"
+else
+    echo '
+#define DEF(DSTTYPE, SRCTYPE, MIN, MAX) \
+OVERLOADABLE DSTTYPE convert_ ## DSTTYPE ## _sat(SRCTYPE x) { \
+  return x >= MAX ? (DSTTYPE)MAX : x <= MIN ? (DSTTYPE)MIN : x; \
+}
+'
+fi
+
+echo '
+DEF(char, long, -128, 127);
+DEF(uchar, long, 0, 255);
+DEF(short, long, -32768, 32767);
+DEF(ushort, long, 0, 65535);
+DEF(int, long, -0x7fffffff-1, 0x7fffffff);
+DEF(uint, long, 0, 0xffffffffu);
+DEF(long, float, -9.223372036854776e+18f, 9.223372036854776e+18f);
+DEF(ulong, float, 0, 1.8446744073709552e+19f);
+#undef DEF
+'
+
+if [ $1"a" = "-pa" ]; then
+    echo "#define DEF(DSTTYPE, SRCTYPE, MAX) OVERLOADABLE DSTTYPE convert_ ## DSTTYPE ## _sat(SRCTYPE x);"
+else
+    echo '
+#define DEF(DSTTYPE, SRCTYPE, MAX) \
+OVERLOADABLE DSTTYPE convert_ ## DSTTYPE ## _sat(SRCTYPE x) { \
+  return x >= MAX ? (DSTTYPE)MAX : x; \
+}
+'
+fi
+
+echo '
+DEF(char, ulong, 127);
+DEF(uchar, ulong, 255);
+DEF(short, ulong, 32767);
+DEF(ushort, ulong, 65535);
+DEF(int, ulong, 0x7fffffff);
+DEF(uint, ulong, 0xffffffffu);
+#undef DEF
+'
+
+if [ $1"a" = "-pa" ]; then
+    echo  "OVERLOADABLE long convert_long_sat(ulong x);"
+else
+    echo '
+OVERLOADABLE long convert_long_sat(ulong x) {
+  ulong MAX = 0x7ffffffffffffffful;
+  return x >= MAX ? MAX : x;
+}
+'
+fi
+
+if [ $1"a" = "-pa" ]; then
+    echo "#define DEF(DSTTYPE, SRCTYPE) OVERLOADABLE DSTTYPE convert_ ## DSTTYPE ## _sat(SRCTYPE x);"
+else
+    echo '
+#define DEF(DSTTYPE, SRCTYPE) \
+  OVERLOADABLE DSTTYPE convert_ ## DSTTYPE ## _sat(SRCTYPE x) { \
+  return x <= 0 ? 0 : x; \
+}
+'
+fi
+
+echo '
+  DEF(ushort, char);
+  DEF(uint, char);
+  DEF(uint, short);
+  DEF(ulong, char);
+  DEF(ulong, short);
+  DEF(ulong, int);
+  DEF(ulong, long);
+  #undef DEF
+'
+
+if [ $1"a" = "-pa" ]; then
+    echo "#define DEF(DSTTYPE, SRCTYPE) OVERLOADABLE DSTTYPE convert_ ## DSTTYPE ## _sat(SRCTYPE x);"
+else
+    echo '
+#define DEF(DSTTYPE, SRCTYPE) \
+  OVERLOADABLE DSTTYPE convert_ ## DSTTYPE ## _sat(SRCTYPE x) { \
+  return x; \
+}
+'
+fi
+  
+echo '
+DEF(char, char);
+DEF(uchar, uchar);
+DEF(short, char);
+DEF(short, uchar);
+DEF(short, short);
+DEF(ushort, uchar);
+DEF(ushort, ushort);
+DEF(int, char);
+DEF(int, uchar);
+DEF(int, short);
+DEF(int, ushort);
+DEF(int, int);
+DEF(uint, uchar);
+DEF(uint, ushort);
+DEF(uint, uint);
+DEF(long, char);
+DEF(long, uchar);
+DEF(long, short);
+DEF(long, ushort);
+DEF(long, int);
+DEF(long, uint);
+DEF(long, long);
+DEF(ulong, uchar);
+DEF(ulong, ushort);
+DEF(ulong, uint);
+DEF(ulong, ulong);
+#undef DEF
+'
+
+# vector convert_DSTTYPE_sat function
+for vector_length in $VECTOR_LENGTHS; do
+    if test $vector_length -eq 1; then continue; fi
+
+    for ftype in $TYPES; do
+	fbasetype=`IFS=:; set -- dummy $ftype; echo $2`
+	if test $fbasetype = "double"; then continue; fi
+
+	for ttype in $TYPES; do
+	    tbasetype=`IFS=:; set -- dummy $ttype; echo $2`
+	    if test $tbasetype = "double" -o $tbasetype = "float"; then continue; fi
+
+	    fvectortype=$fbasetype$vector_length
+	    tvectortype=$tbasetype$vector_length
+	    conv="convert_${tbasetype}_sat"
+
+	    construct="$conv(v.s0)"
+	    if test $vector_length -gt 1; then
+		construct="$construct, $conv(v.s1)"
+	    fi
+	    if test $vector_length -gt 2; then
+		construct="$construct, $conv(v.s2)"
+	    fi
+	    if test $vector_length -gt 3; then
+		construct="$construct, $conv(v.s3)"
+	    fi
+	    if test $vector_length -gt 4; then
+		construct="$construct, $conv(v.s4)"
+		construct="$construct, $conv(v.s5)"
+		construct="$construct, $conv(v.s6)"
+		construct="$construct, $conv(v.s7)"
+	    fi
+	    if test $vector_length -gt 8; then
+		construct="$construct, $conv(v.s8)"
+		construct="$construct, $conv(v.s9)"
+		construct="$construct, $conv(v.sA)"
+		construct="$construct, $conv(v.sB)"
+		construct="$construct, $conv(v.sC)"
+		construct="$construct, $conv(v.sD)"
+		construct="$construct, $conv(v.sE)"
+		construct="$construct, $conv(v.sF)"
+	    fi
+
+	    if [ $1"a" = "-pa" ]; then
+		echo "OVERLOADABLE $tvectortype convert_${tvectortype}_sat($fvectortype v);"
+	    else
+		echo "OVERLOADABLE $tvectortype convert_${tvectortype}_sat($fvectortype v) {"
+		echo "  return ($tvectortype)($construct);"
+		echo "}"
+		echo
+	    fi
+	done
+    done
+done
+
+if [ $1"a" != "-pa" ]; then
+echo '
+float __gen_ocl_rndz(float x);
+float __gen_ocl_rnde(float x);
+float __gen_ocl_rndu(float x);
+float __gen_ocl_rndd(float x);
+OVERLOADABLE float __convert_float_rtz(long x)
+{
+  union {
+    uint u;
+    float f;
+  } u;
+  u.f = x;
+  long l = u.f;
+  if((l > x && x > 0) || x >= 0x7fffffc000000000 ||
+    (l < x && x < 0)) {
+    u.u -= 1;
+  }
+  return u.f;
+}
+OVERLOADABLE float __convert_float_rtp(long x)
+{
+  union {
+    uint u;
+    float f;
+  } u;
+  u.f = x;
+  long l = u.f;  //can not use u.f < x
+  if(l < x && x < 0x7fffffc000000000) {
+    if(x > 0)
+      u.u = u.u + 1;
+    else
+      u.u = u.u - 1;
+  }
+  return u.f;
+}
+OVERLOADABLE float __convert_float_rtn(long x)
+{
+  union {
+    uint u;
+    float f;
+  } u;
+  u.f = x;
+  long l = u.f;  //avoid overflow
+  if(l > x || x >= 0x7fffffc000000000) {
+    if(x > 0)
+      u.u = u.u - 1;
+    else
+      u.u = u.u + 1;
+  }
+  return u.f;
+}
+OVERLOADABLE float __convert_float_rtz(ulong x)
+{
+  union {
+    uint u;
+    float f;
+  } u;
+  u.f = x;
+  ulong l = u.f;
+  if(l > x  || x >= 0xffffff8000000000)
+    u.u -= 1;
+  return u.f;
+}
+OVERLOADABLE float __convert_float_rtp(ulong x)
+{
+  union {
+    uint u;
+    float f;
+  } u;
+  u.f = x;
+  ulong l = u.f;  //can not use u.f < x
+  if(l < x && x < 0xffffff8000000000)
+    u.u = u.u + 1;
+  return u.f;
+}
+OVERLOADABLE float __convert_float_rtn(ulong x)
+{
+  return __convert_float_rtz(x);
+}
+OVERLOADABLE float __convert_float_rtz(int x)
+{
+  union {
+    uint u;
+    float f;
+  } u;
+  u.f = x;
+  long i = u.f;
+  if((i > x && x > 0) ||
+    (i < x && x < 0)) {
+    u.u -= 1;
+  }
+  return u.f;
+}
+OVERLOADABLE float __convert_float_rtp(int x)
+{  // int -> float, rounding toward +infinity
+  union {
+    uint u;
+    float f;
+  } u;
+  u.f = x;  // default conversion; may land below or above x
+  long i = u.f;  //avoid overflow: u.f can round up to 2^31, which int cannot hold
+  if(i < x) {  // result fell below x: step one representable float upward
+    if(x > 0)
+      u.u += 1;
+    else
+      u.u -= 1;
+  }
+  return u.f;
+}
+OVERLOADABLE float __convert_float_rtn(int x)
+{
+  union {
+    uint u;
+    float f;
+  } u;
+  u.f = x;
+  long i = u.f;  //avoid overflow
+  if(i > x) {
+    if(x > 0)
+      u.u = u.u - 1;
+    else
+      u.u = u.u + 1;
+  }
+  return u.f;
+}
+OVERLOADABLE float __convert_float_rtz(uint x)
+{
+  union {
+    uint u;
+    float f;
+  } u;
+  u.f = x;
+  ulong i = u.f;
+  if(i > x)
+    u.u -= 1;
+  return u.f;
+}
+OVERLOADABLE float __convert_float_rtp(uint x)
+{  // uint -> float, rounding toward +infinity
+  union {
+    uint u;
+    float f;
+  } u;
+  u.f = x;  // default conversion; may land below or above x
+  ulong i = u.f;  //avoid overflow: u.f can round up to 2^32, which uint cannot hold
+  if(i < x)  // result fell below x: step one representable float upward
+    u.u += 1;
+  return u.f;
+}
+OVERLOADABLE float __convert_float_rtn(uint x)
+{
+    return __convert_float_rtz(x);
+}
+'
+fi
+
+# convert_DSTTYPE_ROUNDING function
+for vector_length in $VECTOR_LENGTHS; do
+    for ftype in $TYPES; do
+	fbasetype=`IFS=:; set -- dummy $ftype; echo $2`
+	if test $fbasetype = "double"; then continue; fi
+
+	for ttype in $TYPES; do
+	    tbasetype=`IFS=:; set -- dummy $ttype; echo $2`
+	    if test $tbasetype = "double"; then continue; fi
+
+	    if test $vector_length -eq 1; then
+		if [ $1"a" = "-pa" ]; then
+		    echo "OVERLOADABLE $tbasetype convert_${tbasetype}_rte($fbasetype x);"
+		    echo "OVERLOADABLE $tbasetype convert_${tbasetype}_rtz($fbasetype x);"
+		    echo "OVERLOADABLE $tbasetype convert_${tbasetype}_rtp($fbasetype x);"
+		    echo "OVERLOADABLE $tbasetype convert_${tbasetype}_rtn($fbasetype x);"
+		else
+		    echo "OVERLOADABLE $tbasetype convert_${tbasetype}_rte($fbasetype x)"
+		    if test $fbasetype = "float" -a $tbasetype != "float"; then
+			echo "{ return __gen_ocl_rnde(x); }"
+		    else
+			echo "{ return x; }"
+		    fi
+
+		    echo "OVERLOADABLE $tbasetype convert_${tbasetype}_rtz($fbasetype x)"
+		    if test $fbasetype = "float" -a $tbasetype != "float"; then
+			echo "{ return __gen_ocl_rndz(x); }"
+		    elif [ "$fbasetype" = "int" -o "$fbasetype" = "uint" -o "$fbasetype" = "long" -o "$fbasetype" = "ulong" ] && [ "$tbasetype" = "float" ]; then
+			echo "{ return __convert_${tbasetype}_rtz(x); }"
+		    else
+			echo "{ return x; }"
+		    fi
+
+		    echo "OVERLOADABLE $tbasetype convert_${tbasetype}_rtp($fbasetype x)"
+		    if test $fbasetype = "float" -a $tbasetype != "float"; then
+			echo "{ return __gen_ocl_rndu(x); }"
+		    elif [ "$fbasetype" = "int" -o "$fbasetype" = "uint" -o "$fbasetype" = "long" -o "$fbasetype" = "ulong" ] && [ "$tbasetype" = "float" ]; then
+			echo "{ return __convert_${tbasetype}_rtp(x); }"
+		    else
+			echo "{ return x; }"
+		    fi
+
+		    echo "OVERLOADABLE $tbasetype convert_${tbasetype}_rtn($fbasetype x)"
+		    if test $fbasetype = "float" -a $tbasetype != "float"; then
+			echo "{ return __gen_ocl_rndd(x); }"
+		    elif [ "$fbasetype" = "int" -o "$fbasetype" = "uint" -o "$fbasetype" = "long" -o "$fbasetype" = "ulong" ] && [ "$tbasetype" = "float" ]; then
+			echo "{ return __convert_${tbasetype}_rtn(x); }"
+		    else
+			echo "{ return x; }"
+		    fi
+		fi
+
+		continue
+	    fi
+
+	    for rounding in $ROUNDING_MODES; do
+		fvectortype=$fbasetype$vector_length
+		tvectortype=$tbasetype$vector_length
+		conv="convert_${tbasetype}_${rounding}"
+
+		construct="$conv(v.s0)"
+		if test $vector_length -gt 1; then
+		    construct="$construct, $conv(v.s1)"
+		fi
+		if test $vector_length -gt 2; then
+		    construct="$construct, $conv(v.s2)"
+		fi
+		if test $vector_length -gt 3; then
+		    construct="$construct, $conv(v.s3)"
+		fi
+		if test $vector_length -gt 4; then
+		    construct="$construct, $conv(v.s4)"
+		    construct="$construct, $conv(v.s5)"
+		    construct="$construct, $conv(v.s6)"
+		    construct="$construct, $conv(v.s7)"
+		fi
+		if test $vector_length -gt 8; then
+		    construct="$construct, $conv(v.s8)"
+		    construct="$construct, $conv(v.s9)"
+		    construct="$construct, $conv(v.sA)"
+		    construct="$construct, $conv(v.sB)"
+		    construct="$construct, $conv(v.sC)"
+		    construct="$construct, $conv(v.sD)"
+		    construct="$construct, $conv(v.sE)"
+		    construct="$construct, $conv(v.sF)"
+		fi
+
+		if [ $1"a" = "-pa" ]; then
+		    echo "OVERLOADABLE $tvectortype convert_${tvectortype}_${rounding}($fvectortype v);"
+		else
+		    echo "OVERLOADABLE $tvectortype convert_${tvectortype}_${rounding}($fvectortype v) {"
+		    echo "  return ($tvectortype)($construct);"
+		    echo "}"
+		    echo
+		fi
+	    done
+	done
+    done
+done
+
+# convert_DSTTYPE_sat_ROUNDING function
+for vector_length in $VECTOR_LENGTHS; do
+    for ftype in $TYPES; do
+	fbasetype=`IFS=:; set -- dummy $ftype; echo $2`
+	if test $fbasetype = "double"; then continue; fi
+
+	for ttype in $TYPES; do
+	    tbasetype=`IFS=:; set -- dummy $ttype; echo $2`
+	    if test $tbasetype = "double" -o $tbasetype = "float"; then continue; fi
+
+	    if test $vector_length -eq 1; then
+		if [ $1"a" = "-pa" ]; then
+		    echo "OVERLOADABLE $tbasetype convert_${tbasetype}_sat_rte($fbasetype x);"
+		    echo "OVERLOADABLE $tbasetype convert_${tbasetype}_sat_rtz($fbasetype x);"
+		    echo "OVERLOADABLE $tbasetype convert_${tbasetype}_sat_rtp($fbasetype x);"
+		    echo "OVERLOADABLE $tbasetype convert_${tbasetype}_sat_rtn($fbasetype x);"
+		else
+		    echo "OVERLOADABLE $tbasetype convert_${tbasetype}_sat_rte($fbasetype x)"
+		    if test $fbasetype = "float"; then
+			echo "{ return convert_${tbasetype}_sat(__gen_ocl_rnde(x)); }"
+		    else
+			echo "{ return convert_${tbasetype}_sat(x); }"
+		    fi
+
+		    echo "OVERLOADABLE $tbasetype convert_${tbasetype}_sat_rtz($fbasetype x)"
+		    if test $fbasetype = "float"; then
+			echo "{ return convert_${tbasetype}_sat(__gen_ocl_rndz(x)); }"
+		    else
+			echo "{ return convert_${tbasetype}_sat(x); }"
+		    fi
+
+		    echo "OVERLOADABLE $tbasetype convert_${tbasetype}_sat_rtp($fbasetype x)"
+		    if test $fbasetype = "float"; then
+			echo "{ return convert_${tbasetype}_sat(__gen_ocl_rndu(x)); }"
+		    else
+			echo "{ return convert_${tbasetype}_sat(x); }"
+		    fi
+
+		    echo "OVERLOADABLE $tbasetype convert_${tbasetype}_sat_rtn($fbasetype x)"
+		    if test $fbasetype = "float"; then
+			echo "{ return convert_${tbasetype}_sat(__gen_ocl_rndd(x)); }"
+		    else
+			echo "{ return convert_${tbasetype}_sat(x); }"
+		    fi
+		fi
+		continue
+	    fi
+
+	    for rounding in $ROUNDING_MODES; do
+		fvectortype=$fbasetype$vector_length
+		tvectortype=$tbasetype$vector_length
+		conv="convert_${tbasetype}_sat_${rounding}"
+
+		construct="$conv(v.s0)"
+		if test $vector_length -gt 1; then
+		    construct="$construct, $conv(v.s1)"
+		fi
+		if test $vector_length -gt 2; then
+		    construct="$construct, $conv(v.s2)"
+		fi
+		if test $vector_length -gt 3; then
+		    construct="$construct, $conv(v.s3)"
+		fi
+		if test $vector_length -gt 4; then
+		    construct="$construct, $conv(v.s4)"
+		    construct="$construct, $conv(v.s5)"
+		    construct="$construct, $conv(v.s6)"
+		    construct="$construct, $conv(v.s7)"
+		fi
+		if test $vector_length -gt 8; then
+		    construct="$construct, $conv(v.s8)"
+		    construct="$construct, $conv(v.s9)"
+		    construct="$construct, $conv(v.sA)"
+		    construct="$construct, $conv(v.sB)"
+		    construct="$construct, $conv(v.sC)"
+		    construct="$construct, $conv(v.sD)"
+		    construct="$construct, $conv(v.sE)"
+		    construct="$construct, $conv(v.sF)"
+		fi
+
+		if [ $1"a" = "-pa" ]; then
+		    echo "OVERLOADABLE $tvectortype convert_${tvectortype}_sat_${rounding}($fvectortype v);"
+		else
+		    echo "OVERLOADABLE $tvectortype convert_${tvectortype}_sat_${rounding}($fvectortype v) {"
+		    echo "  return ($tvectortype)($construct);"
+		    echo "}"
+		    echo
+		fi
+	    done
+	done
+    done
+done
+
+if [ $1"a" = "-pa" ]; then
+    echo "#endif /* __OCL_CONVERT_H__ */"
+fi
-- 
1.8.3.2