[Beignet] [PATCH] libocl: Directly scalarize built-in with vector input.
Ruiling Song
ruiling.song at intel.com
Sun Feb 15 00:09:07 PST 2015
This reverts the following commit:
"Re-apply "improve the build performance of vector type built-in function.""
commitId: 06cce8178649759e12a3a353f0550189d371871b.
I finally decided to do this because, although the kind of program shown below has
fewer instructions and a shorter compile time, it also introduces extra memory accesses,
which cause bad run-time performance if the loop is not unrolled. If the loop
is unrolled, the result is similar to the scalarized version.
OVERLOADABLE float16 func (float16 param0)
{
union{
float va[16];
float16 vv16;
}uret;
union{
float pa[16];
float16 pv16;
}usrc0;
usrc0.pv16 = param0;
for(int i =0; i < 16; i++)
uret.va[i] = func(usrc0.pa[i]);
return uret.vv16;
}
I did some experiments on the affected built-ins. I fixed the GPU frequency at 1050
and increased the input data to 862000. The results are shown below (the scalarized
version clearly has better performance):
builtin_asinh_float16:
loop version: 200ms
scalarized version: 150ms
builtin_sinh_float16:
loop version: 250ms
scalarized version: 160ms
This patch also reduces the generation of large integers. Although we support
large integer legalization, I find it is sometimes hard to legalize them in a very
efficient way, for example large integer LE/GT.
---
backend/src/libocl/script/gen_vector.py | 45 +++++--------------------------
1 file changed, 6 insertions(+), 39 deletions(-)
diff --git a/backend/src/libocl/script/gen_vector.py b/backend/src/libocl/script/gen_vector.py
index 291dd87..ffc573a 100755
--- a/backend/src/libocl/script/gen_vector.py
+++ b/backend/src/libocl/script/gen_vector.py
@@ -289,42 +289,9 @@ class builtinProto():
formatStr += ';'
self.append(formatStr)
return formatStr
- if self.functionName != 'select' and ptypeSeqs[0] == ptypeSeqs[self.paramCount-1] and ptype[1] > 4:
- formatStr += '\n{ \n union{'
- formatStr = self.append(formatStr, ' {0} va[{1}];'.format(vtype[0], vtype[1]))
- formatStr = self.append(formatStr, ' {0}{1} vv{2};'.format(vtype[0], vtype[1], vtype[1]))
- formatStr += '\n }uret;'
- formatStr += '\n union{'
- formatStr = self.append(formatStr, ' {0} pa[{1}];'.format(ptype[0], ptype[1]))
- formatStr = self.append(formatStr, ' {0}{1} pv{2};'.format(ptype[0], ptype[1], ptype[1]))
- formatStr += '\n }'
- for n in range(0, self.paramCount):
- formatStr += 'usrc{0}'.format(n)
- if n+1 != self.paramCount:
- formatStr +=', '
- formatStr += ';'
-
- for n in range(0, self.paramCount):
- formatStr = self.append(formatStr, ' usrc{0}.pv{1} = param{2};'.format(n, ptype[1], n))
- formatStr = self.append(formatStr, ' for(int i =0; i < {0}; i++)'.format(ptype[1]))
- formatStr += '\n uret.va[i] = '
- if self.prefix == 'relational' and self.functionName != 'bitselect' and self.functionName != 'select':
- formatStr += '-'
- formatStr += '{0}('.format(self.functionName)
-
- for n in range(0, self.paramCount):
- formatStr += 'usrc{0}.pa[i]'.format(n)
- if n+1 != self.paramCount:
- formatStr +=', '
- formatStr += ');'
- formatStr = self.append(formatStr, ' return uret.vv{0};'.format(vtype[1]))
- formatStr += '\n}'
- formatStr = self.append(formatStr)
- return formatStr
- else:
- formatStr = self.append(formatStr, '{{return ({0}{1})('.format(vtype[0], vtype[1]))
- self.indent = len(formatStr)
- for j in range(0, vtype[1]):
+ formatStr = self.append(formatStr, '{{return ({0}{1})('.format(vtype[0], vtype[1]))
+ self.indent = len(formatStr)
+ for j in range(0, vtype[1]):
if (j != 0):
formatStr += ','
if (j + 1) % 2 == 0:
@@ -359,10 +326,10 @@ class builtinProto():
formatStr += ')'
- formatStr += '); }\n'
- self.append(formatStr)
+ formatStr += '); }\n'
+ self.append(formatStr)
- return formatStr
+ return formatStr
def output(self):
for line in self.outputStr:
--
1.7.10.4
More information about the Beignet
mailing list