[Libva] [PATCH 1/4] add GPU shader for RGB->NV12 conversion
Zhao Halley
halley.zhao at intel.com
Thu Jul 12 23:32:56 PDT 2012
---
.../gen5_6/Common/RGBX_Load_16x8.asm | 57 ++
.../gen5_6/Common/RGBX_Load_16x8.inc | 48 ++
.../gen5_6/Common/RGBX_Save_YUV_Fix.asm | 115 ++++
.../gen5_6/Common/RGBX_Save_YUV_Float.asm | 152 +++++
.../gen5_6/Common/RGBX_to_YUV_Coef.asm | 43 ++
src/shaders/post_processing/gen5_6/Makefile.am | 8 +
.../post_processing/gen5_6/rgbx_load_save_nv12.asm | 26 +
.../gen5_6/rgbx_load_save_nv12.g4b.gen5 | 562 +++++++++++++++++
.../post_processing/gen5_6/rgbx_load_save_nv12.g6b | 635 ++++++++++++++++++++
9 files changed, 1646 insertions(+), 0 deletions(-)
create mode 100755 src/shaders/post_processing/gen5_6/Common/RGBX_Load_16x8.asm
create mode 100755 src/shaders/post_processing/gen5_6/Common/RGBX_Load_16x8.inc
create mode 100755 src/shaders/post_processing/gen5_6/Common/RGBX_Save_YUV_Fix.asm
create mode 100755 src/shaders/post_processing/gen5_6/Common/RGBX_Save_YUV_Float.asm
create mode 100755 src/shaders/post_processing/gen5_6/Common/RGBX_to_YUV_Coef.asm
create mode 100755 src/shaders/post_processing/gen5_6/rgbx_load_save_nv12.asm
create mode 100644 src/shaders/post_processing/gen5_6/rgbx_load_save_nv12.g4b.gen5
create mode 100644 src/shaders/post_processing/gen5_6/rgbx_load_save_nv12.g6b
diff --git a/src/shaders/post_processing/gen5_6/Common/RGBX_Load_16x8.asm b/src/shaders/post_processing/gen5_6/Common/RGBX_Load_16x8.asm
new file mode 100755
index 0000000..958308a
--- /dev/null
+++ b/src/shaders/post_processing/gen5_6/Common/RGBX_Load_16x8.asm
@@ -0,0 +1,57 @@
+/*
+ * All Video Processing kernels
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+
+// Module name: RGBX_Load_16x8.asm (copied from AYUV_Load_16x8.asm)
+//----------------------------------------------------------------
+
+
+#include "RGBX_Load_16x8.inc"
+
+// In order to load 64x8 RGBA data (16x8 pixels), we need to divide the data
+// into two regions and load them separately.
+//
+// 32 byte 32 byte
+//|----------------|----------------|
+//| | |
+//| A | B |8
+//| | |
+//| | |
+//|----------------|----------------|
+
+// Load the first 32x8 data block (region A in the diagram above: 32 bytes wide, 8 rows)
+// Packed data is addressed in bytes: 64 bytes hold 16 pixels, so one 32-byte row is 8 pixels
+ add (2) rMSGSRC.0<1>:d wORIX<2;2,1>:w wSRC_H_ORI_OFFSET<2;2,1>:w // Source Block origin
+ shl (1) rMSGSRC.0<1>:d rMSGSRC.0<0;1,0>:w 2:w { NoDDClr } // H. block origin needs to be four times larger (4 bytes per pixel)
+ mov (1) rMSGSRC.2<1>:ud nDPR_BLOCK_SIZE_RGBA:ud { NoDDChk } // Block width and height (32 bytes x 8 rows)
+ mov (8) mMSGHDRY<1>:ud rMSGSRC<8;8,1>:ud // Copy message description to message header
+ send (8) udSRC_RGBA(0)<1> mMSGHDRY udDUMMY_NULL nDATAPORT_READ nDPMR_MSGDSC+nDPR_MSG_SIZE_RGBA+nBI_CURRENT_SRC_YUV:ud // Read block A into udSRC_RGBA(0)
+
+// Load the second 32x8 data block (region B in the diagram above)
+// Offset the origin X - move to the next 32 byte columns
+ add (1) rMSGSRC.0<1>:d rMSGSRC.0<0;1,0>:d 32:w // Increase X origin by 32 bytes (8 pixels)
+
+// Size stays the same - 32x8
+ mov (8) mMSGHDRY<1>:ud rMSGSRC<8;8,1>:ud // Copy message description to message header
+ send (8) udSRC_RGBA(8)<1> mMSGHDRY udDUMMY_NULL nDATAPORT_READ nDPMR_MSGDSC+nDPR_MSG_SIZE_RGBA+nBI_CURRENT_SRC_YUV:ud // Read block B into udSRC_RGBA(8)
+
+// Give RGBA region addresses to address register (one 16-bit address per loaded half)
+ // a0.0 is 0x38*32, a0.1 is 0x40*32. 0x40-0x38=8: the second half starts 8 registers after the first (presumably 32 bytes per GRF)
+ mov (1) SRC_RGBA_OFFSET<1>:ud 0x00400038*32:ud //Address registers contain starting addresses of two halves
+
+#if !defined(FIX_POINT_CONVERSION) && !defined(FLOAT_POINT_CONVERSION)
+ //Directly move the data to destination (only assembled when neither conversion macro is defined)
+ $for(0; <nY_NUM_OF_ROWS; 1) {
+ // 8 means 8 elements, not 2=8/2 element per row.
+ mov (16) uwDEST_Y(%1)<1> r[SRC_RGBA_OFFSET,%1*32+3]<8,4>:ub // A/R -- byte 3 of each pixel (label inherited from AYUV; would be X for BGRX data -- TODO confirm)
+ mov (16) uwDEST_U(%1)<1> r[SRC_RGBA_OFFSET,%1*32+2]<8,4>:ub // Y/G -- byte 2 of each pixel (would be R for BGRX data -- TODO confirm)
+ mov (16) uwDEST_V(%1)<1> r[SRC_RGBA_OFFSET,%1*32+1]<8,4>:ub // U/B -- byte 1 of each pixel (would be G for BGRX data -- TODO confirm)
+ }
+#endif
+
diff --git a/src/shaders/post_processing/gen5_6/Common/RGBX_Load_16x8.inc b/src/shaders/post_processing/gen5_6/Common/RGBX_Load_16x8.inc
new file mode 100755
index 0000000..7199d64
--- /dev/null
+++ b/src/shaders/post_processing/gen5_6/Common/RGBX_Load_16x8.inc
@@ -0,0 +1,48 @@
+/*
+ * All Video Processing kernels
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ */
+#ifndef RGBA_LOAD_16X8_INC
+#define RGBA_LOAD_16X8_INC
+
+// Module name: RGBX_Load_16x8.inc
+//
+// RGBA data are first loaded to bottom I/O REGION_2, then color conversion from RGB to YUV is done;
+// finally, YUV data are stored in top I/O REGION_1 in planar format
+
+#undef nY_NUM_OF_ROWS
+
+#define nY_NUM_OF_ROWS 8 // Number of Y rows per block
+
+#define nDPR_BLOCK_SIZE_RGBA nBLOCK_WIDTH_32+nBLOCK_HEIGHT_8 // RGBA block size 32x8 (it is half size: one of the two loads per 16x8 pixel block)
+#define nDPR_MSG_SIZE_RGBA nRESLEN_8 // # of MRF's to hold RGBA block data (8)
+
+//Temporary storage for unpacked data (comment inherited from the AYUV module; not referenced by the RGBX load/save code in this patch)
+#define rUNPACK_TEMP REG(r,nTEMP0)
+.declare udUNPACK_TEMP Base=rUNPACK_TEMP ElementSize=4 SrcRegion=<8;8,1> Type=ud //1 GRF
+.declare ubUNPACK_TEMP Base=rUNPACK_TEMP ElementSize=1 SrcRegion=<32;32,1> Type=ub //1 GRF
+
+.declare ubBOT_Y_IO Base=REG(r,nBOT_Y) ElementSize=1 SrcRegion=REGION(32,1) Type=ub
+
+
+#define udSRC_RGBA udBOT_Y_IO
+#define ubSRC_RGBA ubBOT_Y_IO
+#define nSRC_RGBA_REG nBOT_Y
+
+#define uwDEST_Y uwTOP_Y
+#define uwDEST_U uwTOP_U
+#define uwDEST_V uwTOP_V
+
+#define SRC_RGBA_OFFSET a0.0 // dword view: written once to set both half addresses
+#define SRC_RGBA_OFFSET_1 a0.0 // address of the first loaded half
+#define SRC_RGBA_OFFSET_2 a0.1 // address of the second loaded half
+
+#define nSRC_REGION nREGION_1 // REGION_1 will be the source region for first kernel
+
+// End of RGBX_Load_16x8.inc
+#endif
\ No newline at end of file
diff --git a/src/shaders/post_processing/gen5_6/Common/RGBX_Save_YUV_Fix.asm b/src/shaders/post_processing/gen5_6/Common/RGBX_Save_YUV_Fix.asm
new file mode 100755
index 0000000..f60a2a0
--- /dev/null
+++ b/src/shaders/post_processing/gen5_6/Common/RGBX_Save_YUV_Fix.asm
@@ -0,0 +1,115 @@
+/*
+ * All Video Processing kernels
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ * Authors:
+ * Halley Zhao <halley.zhao at intel.com>
+ */
+
+// Module name: RGBX_Save_YUV_Fix.asm
+//----------------------------------------------------------------
+
+#include "RGBX_Load_16x8.inc"
+
+#if (0)
+ #define nTEMP0 34 // transformation coefficient
+ #define nTEMP1 35 // one row of Y (first half register is used)
+ #define nTEMP2 36 // first half of one row
+ #define nTEMP3 37 // second half of one row
+ #define nTEMP4 38 // mul and add
+ #define nTEMP5 39 // mul and add
+ #define nTEMP6 40 // mul and add
+ #define nTEMP7 41 // mul and add
+ #define nTEMP8 42 // sum of mul
+ #define nTEMP10 44
+ #define nTEMP12 46
+ #define nTEMP14 48
+ #define nTEMP16 50
+ #define nTEMP17 51
+ #define nTEMP18 52
+
+ #define nTEMP24 58
+#endif
+
+$for(0; <nY_NUM_OF_ROWS; 1) {
+ // Source pixel byte order (BGRX): | B | G | R | X |
+ // ###### do one row for Y
+ // #### mul every source byte by its coefficient (unsigned bytes 0..3 of nTEMP0), then add the B, G and R products of each pixel
+ mul (16) REG2(r, nTEMP4, 0)<1>:uw r[SRC_RGBA_OFFSET_1, %1*32 + 0]<0; 16,1>:ub REG2(r, nTEMP0, 0)<0;4,1>:ub
+ mul (16) REG2(r, nTEMP5, 0)<1>:uw r[SRC_RGBA_OFFSET_1, %1*32 + 16]<0; 16,1>:ub REG2(r, nTEMP0, 0)<0;4,1>:ub
+ mul (16) REG2(r, nTEMP6, 0)<1>:uw r[SRC_RGBA_OFFSET_2, %1*32 + 0]<0; 16,1>:ub REG2(r, nTEMP0, 0)<0;4,1>:ub
+ mul (16) REG2(r, nTEMP7, 0)<1>:uw r[SRC_RGBA_OFFSET_2, %1*32 + 16]<0; 16,1>:ub REG2(r, nTEMP0, 0)<0;4,1>:ub
+
+ add (4) REG2(r, nTEMP4, 0)<4>:uw REG2(r, nTEMP4, 0)<0;4,4>:uw REG2(r, nTEMP4, 1)<0;4,4>:uw
+ add (4) REG2(r, nTEMP4, 0)<4>:uw REG2(r, nTEMP4, 0)<0;4,4>:uw REG2(r, nTEMP4, 2)<0;4,4>:uw
+ add (4) REG2(r, nTEMP5, 0)<4>:uw REG2(r, nTEMP5, 0)<0;4,4>:uw REG2(r, nTEMP5, 1)<0;4,4>:uw
+ add (4) REG2(r, nTEMP5, 0)<4>:uw REG2(r, nTEMP5, 0)<0;4,4>:uw REG2(r, nTEMP5, 2)<0;4,4>:uw
+ add (4) REG2(r, nTEMP6, 0)<4>:uw REG2(r, nTEMP6, 0)<0;4,4>:uw REG2(r, nTEMP6, 1)<0;4,4>:uw
+ add (4) REG2(r, nTEMP6, 0)<4>:uw REG2(r, nTEMP6, 0)<0;4,4>:uw REG2(r, nTEMP6, 2)<0;4,4>:uw
+ add (4) REG2(r, nTEMP7, 0)<4>:uw REG2(r, nTEMP7, 0)<0;4,4>:uw REG2(r, nTEMP7, 1)<0;4,4>:uw
+ add (4) REG2(r, nTEMP7, 0)<4>:uw REG2(r, nTEMP7, 0)<0;4,4>:uw REG2(r, nTEMP7, 2)<0;4,4>:uw
+
+ // #### gather the 16 per-pixel sums into nTEMP8, add rounding and offset, keep the high byte and write the Y row
+ mov (4) REG2(r, nTEMP8, 0)<1>:uw REG2(r, nTEMP4, 0)<0; 4, 4>:uw
+ mov (4) REG2(r, nTEMP8, 4)<1>:uw REG2(r, nTEMP5, 0)<0; 4, 4>:uw
+ mov (4) REG2(r, nTEMP8, 8)<1>:uw REG2(r, nTEMP6, 0)<0; 4, 4>:uw
+ mov (4) REG2(r, nTEMP8, 12)<1>:uw REG2(r, nTEMP7, 0)<0; 4, 4>:uw
+ add (16) REG2(r, nTEMP8, 0)<1>:uw REG2(r, nTEMP8, 0)<0; 16, 1>:uw 0x1080:uw // 0x1080 = 128 (rounding) + (16 << 8) (Y offset)
+ mov (16) REG2(r, nTEMP8, 0)<1>:ub REG2(r, nTEMP8, 1)<0; 16, 2>:ub // byte 1 of every word, i.e. the ">> 8" of the formula
+ mov (16) uwDEST_Y(%1)<1> REG2(r,nTEMP8, 0)<0;16,1>:ub
+
+ // ###### do one row for U
+ // #### mul and add, same scheme as Y but with the signed coefficients at bytes 4..7 of nTEMP0
+ mul (16) REG2(r, nTEMP4, 0)<1>:w r[SRC_RGBA_OFFSET_1, %1*32 + 0]<0; 16,1>:ub REG2(r, nTEMP0, 4)<0;4,1>:b
+ mul (16) REG2(r, nTEMP5, 0)<1>:w r[SRC_RGBA_OFFSET_1, %1*32 + 16]<0; 16,1>:ub REG2(r, nTEMP0, 4)<0;4,1>:b
+ mul (16) REG2(r, nTEMP6, 0)<1>:w r[SRC_RGBA_OFFSET_2, %1*32 + 0]<0; 16,1>:ub REG2(r, nTEMP0, 4)<0;4,1>:b
+ mul (16) REG2(r, nTEMP7, 0)<1>:w r[SRC_RGBA_OFFSET_2, %1*32 + 16]<0; 16,1>:ub REG2(r, nTEMP0, 4)<0;4,1>:b
+
+ add (4) REG2(r, nTEMP4, 0)<4>:w REG2(r, nTEMP4, 0)<0;4,4>:w REG2(r, nTEMP4, 1)<0;4,4>:w
+ add (4) REG2(r, nTEMP4, 0)<4>:w REG2(r, nTEMP4, 0)<0;4,4>:w REG2(r, nTEMP4, 2)<0;4,4>:w
+ add (4) REG2(r, nTEMP5, 0)<4>:w REG2(r, nTEMP5, 0)<0;4,4>:w REG2(r, nTEMP5, 1)<0;4,4>:w
+ add (4) REG2(r, nTEMP5, 0)<4>:w REG2(r, nTEMP5, 0)<0;4,4>:w REG2(r, nTEMP5, 2)<0;4,4>:w
+ add (4) REG2(r, nTEMP6, 0)<4>:w REG2(r, nTEMP6, 0)<0;4,4>:w REG2(r, nTEMP6, 1)<0;4,4>:w
+ add (4) REG2(r, nTEMP6, 0)<4>:w REG2(r, nTEMP6, 0)<0;4,4>:w REG2(r, nTEMP6, 2)<0;4,4>:w
+ add (4) REG2(r, nTEMP7, 0)<4>:w REG2(r, nTEMP7, 0)<0;4,4>:w REG2(r, nTEMP7, 1)<0;4,4>:w
+ add (4) REG2(r, nTEMP7, 0)<4>:w REG2(r, nTEMP7, 0)<0;4,4>:w REG2(r, nTEMP7, 2)<0;4,4>:w
+
+ // #### gather the 16 sums, add rounding and bias, keep the high byte and write the U row
+ mov (4) REG2(r, nTEMP8, 0)<1>:w REG2(r, nTEMP4, 0)<0; 4, 4>:w
+ mov (4) REG2(r, nTEMP8, 4)<1>:w REG2(r, nTEMP5, 0)<0; 4, 4>:w
+ mov (4) REG2(r, nTEMP8, 8)<1>:w REG2(r, nTEMP6, 0)<0; 4, 4>:w
+ mov (4) REG2(r, nTEMP8, 12)<1>:w REG2(r, nTEMP7, 0)<0; 4, 4>:w
+ add (16) REG2(r, nTEMP8, 0)<1>:uw REG2(r, nTEMP8, 0)<0; 16, 1>:w 0x8080:uw // 0x8080 = 128 (rounding) + (128 << 8); author's open question "ok?" is about the mixed :w/:uw operands
+ mov (16) REG2(r, nTEMP8, 0)<1>:ub REG2(r, nTEMP8, 1)<0; 16, 2>:ub // byte 1 of every word, i.e. the ">> 8" of the formula
+ mov (16) uwDEST_U(%1)<1> REG2(r,nTEMP8, 0)<0;16,1>:ub
+
+ // ###### do one row for V
+ // #### mul and add, same scheme as U but with the signed coefficients at bytes 8..11 of nTEMP0
+ mul (16) REG2(r, nTEMP4, 0)<1>:w r[SRC_RGBA_OFFSET_1, %1*32 + 0]<0; 16,1>:ub REG2(r, nTEMP0, 8)<0;4,1>:b
+ mul (16) REG2(r, nTEMP5, 0)<1>:w r[SRC_RGBA_OFFSET_1, %1*32 + 16]<0; 16,1>:ub REG2(r, nTEMP0, 8)<0;4,1>:b
+ mul (16) REG2(r, nTEMP6, 0)<1>:w r[SRC_RGBA_OFFSET_2, %1*32 + 0]<0; 16,1>:ub REG2(r, nTEMP0, 8)<0;4,1>:b
+ mul (16) REG2(r, nTEMP7, 0)<1>:w r[SRC_RGBA_OFFSET_2, %1*32 + 16]<0; 16,1>:ub REG2(r, nTEMP0, 8)<0;4,1>:b
+
+ add (4) REG2(r, nTEMP4, 0)<4>:w REG2(r, nTEMP4, 0)<0;4,4>:w REG2(r, nTEMP4, 1)<0;4,4>:w
+ add (4) REG2(r, nTEMP4, 0)<4>:w REG2(r, nTEMP4, 0)<0;4,4>:w REG2(r, nTEMP4, 2)<0;4,4>:w
+ add (4) REG2(r, nTEMP5, 0)<4>:w REG2(r, nTEMP5, 0)<0;4,4>:w REG2(r, nTEMP5, 1)<0;4,4>:w
+ add (4) REG2(r, nTEMP5, 0)<4>:w REG2(r, nTEMP5, 0)<0;4,4>:w REG2(r, nTEMP5, 2)<0;4,4>:w
+ add (4) REG2(r, nTEMP6, 0)<4>:w REG2(r, nTEMP6, 0)<0;4,4>:w REG2(r, nTEMP6, 1)<0;4,4>:w
+ add (4) REG2(r, nTEMP6, 0)<4>:w REG2(r, nTEMP6, 0)<0;4,4>:w REG2(r, nTEMP6, 2)<0;4,4>:w
+ add (4) REG2(r, nTEMP7, 0)<4>:w REG2(r, nTEMP7, 0)<0;4,4>:w REG2(r, nTEMP7, 1)<0;4,4>:w
+ add (4) REG2(r, nTEMP7, 0)<4>:w REG2(r, nTEMP7, 0)<0;4,4>:w REG2(r, nTEMP7, 2)<0;4,4>:w
+
+ // #### gather the 16 sums, add rounding and bias, keep the high byte and write the V row
+ mov (4) REG2(r, nTEMP8, 0)<1>:w REG2(r, nTEMP4, 0)<0; 4, 4>:w
+ mov (4) REG2(r, nTEMP8, 4)<1>:w REG2(r, nTEMP5, 0)<0; 4, 4>:w
+ mov (4) REG2(r, nTEMP8, 8)<1>:w REG2(r, nTEMP6, 0)<0; 4, 4>:w
+ mov (4) REG2(r, nTEMP8, 12)<1>:w REG2(r, nTEMP7, 0)<0; 4, 4>:w
+ add (16) REG2(r, nTEMP8, 0)<1>:uw REG2(r, nTEMP8, 0)<0; 16, 1>:w 0x8080:uw // 0x8080 = 128 (rounding) + (128 << 8); author's open question "ok?" is about the mixed :w/:uw operands
+ mov (16) REG2(r, nTEMP8, 0)<1>:ub REG2(r, nTEMP8, 1)<0; 16, 2>:ub // byte 1 of every word, i.e. the ">> 8" of the formula
+ mov (16) uwDEST_V(%1)<1> REG2(r,nTEMP8, 0)<0;16,1>:ub
+}
+
diff --git a/src/shaders/post_processing/gen5_6/Common/RGBX_Save_YUV_Float.asm b/src/shaders/post_processing/gen5_6/Common/RGBX_Save_YUV_Float.asm
new file mode 100755
index 0000000..a771187
--- /dev/null
+++ b/src/shaders/post_processing/gen5_6/Common/RGBX_Save_YUV_Float.asm
@@ -0,0 +1,152 @@
+/*
+ * All Video Processing kernels
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ * Authors:
+ * Halley Zhao <halley.zhao at intel.com>
+ */
+
+// Module name: RGBX_Save_YUV_Float.asm
+//----------------------------------------------------------------
+
+#include "RGBX_Load_16x8.inc"
+
+#if (0)
+ // 8 grf reg for one row of pixel (2 pixel per grf)
+ #define nTEMP0 34
+ #define nTEMP1 35
+ #define nTEMP2 36
+ #define nTEMP3 37
+ #define nTEMP4 38
+ #define nTEMP5 39
+ #define nTEMP6 40
+ #define nTEMP7 41
+
+ #define nTEMP8 42 // transformation coefficient
+ #define nTEMP10 44 // transformation coefficient
+
+ #define nTEMP12 46 // save Y/U/V in ub format
+ #define nTEMP14 48 // save YUV in ud format
+ #define nTEMP16 50 // dp4 result
+ #define nTEMP17 51
+ #define nTEMP18 52
+
+ #define nTEMP24 58
+#endif
+
+$for(0; <nY_NUM_OF_ROWS; 1) {
+ // Source pixel byte order (BGRX): | B | G | R | X |
+ // ###### save one row of pixel to temp grf with float format (required by dp4); nTEMP0..nTEMP7 hold 2 pixels each
+ // mov (8) doesn't work, puzzle
+ mov (4) REG(r, nTEMP0)<1>:f r[SRC_RGBA_OFFSET_1,%1*32 + 0]<4,1>:ub
+ mov (4) REG(r, nTEMP1)<1>:f r[SRC_RGBA_OFFSET_1,%1*32 + 8]<4,1>:ub
+ mov (4) REG(r, nTEMP2)<1>:f r[SRC_RGBA_OFFSET_1,%1*32 + 16]<4,1>:ub
+ mov (4) REG(r, nTEMP3)<1>:f r[SRC_RGBA_OFFSET_1,%1*32 + 24]<4,1>:ub
+ mov (4) REG(r, nTEMP4)<1>:f r[SRC_RGBA_OFFSET_2,%1*32 + 0]<4,1>:ub
+ mov (4) REG(r, nTEMP5)<1>:f r[SRC_RGBA_OFFSET_2,%1*32 + 8]<4,1>:ub
+ mov (4) REG(r, nTEMP6)<1>:f r[SRC_RGBA_OFFSET_2,%1*32 + 16]<4,1>:ub
+ mov (4) REG(r, nTEMP7)<1>:f r[SRC_RGBA_OFFSET_2,%1*32 + 24]<4,1>:ub
+ mov (4) REG2(r, nTEMP0, 4)<1>:f r[SRC_RGBA_OFFSET_1,%1*32 + 4]<4,1>:ub
+ mov (4) REG2(r, nTEMP1, 4)<1>:f r[SRC_RGBA_OFFSET_1,%1*32 + 12]<4,1>:ub
+ mov (4) REG2(r, nTEMP2, 4)<1>:f r[SRC_RGBA_OFFSET_1,%1*32 + 20]<4,1>:ub
+ mov (4) REG2(r, nTEMP3, 4)<1>:f r[SRC_RGBA_OFFSET_1,%1*32 + 28]<4,1>:ub
+ mov (4) REG2(r, nTEMP4, 4)<1>:f r[SRC_RGBA_OFFSET_2,%1*32 + 4]<4,1>:ub
+ mov (4) REG2(r, nTEMP5, 4)<1>:f r[SRC_RGBA_OFFSET_2,%1*32 + 12]<4,1>:ub
+ mov (4) REG2(r, nTEMP6, 4)<1>:f r[SRC_RGBA_OFFSET_2,%1*32 + 20]<4,1>:ub
+ mov (4) REG2(r, nTEMP7, 4)<1>:f r[SRC_RGBA_OFFSET_2,%1*32 + 28]<4,1>:ub // last pixel of the row: byte offset 28 (was 24, which duplicated the previous pixel)
+
+ // ###### do one row for Y
+ // ##### dp4(nTEMP16) and save result to ub format(nTEMP12), two pixels per dp4
+ dp4 (8) REG2(r, nTEMP16, 0)<1>:f REG2(r, nTEMP0, 0)<0;8,1>:f REG2(r, nTEMP8, 0)<0;4,1>:f
+ mov (2) REG2(r, nTEMP14, 0)<1>:ud REG2(r, nTEMP16, 0)<0;2,4>:f
+ mov (2) REG2(r, nTEMP12, 0)<1>:ub REG2(r, nTEMP14, 0)<0;2,4>:ub
+ dp4 (8) REG2(r, nTEMP16, 0)<1>:f REG2(r, nTEMP1, 0)<0;8,1>:f REG2(r, nTEMP8, 0)<0;4,1>:f
+ mov (2) REG2(r, nTEMP14, 0)<1>:ud REG2(r, nTEMP16, 0)<0;2,4>:f
+ mov (2) REG2(r, nTEMP12, 2)<1>:ub REG2(r, nTEMP14, 0)<0;2,4>:ub
+ dp4 (8) REG2(r, nTEMP16, 0)<1>:f REG2(r, nTEMP2, 0)<0;8,1>:f REG2(r, nTEMP8, 0)<0;4,1>:f
+ mov (2) REG2(r, nTEMP14, 0)<1>:ud REG2(r, nTEMP16, 0)<0;2,4>:f
+ mov (2) REG2(r, nTEMP12, 4)<1>:ub REG2(r, nTEMP14, 0)<0;2,4>:ub
+ dp4 (8) REG2(r, nTEMP16, 0)<1>:f REG2(r, nTEMP3, 0)<0;8,1>:f REG2(r, nTEMP8, 0)<0;4,1>:f
+ mov (2) REG2(r, nTEMP14, 0)<1>:ud REG2(r, nTEMP16, 0)<0;2,4>:f
+ mov (2) REG2(r, nTEMP12, 6)<1>:ub REG2(r, nTEMP14, 0)<0;2,4>:ub
+ dp4 (8) REG2(r, nTEMP16, 0)<1>:f REG2(r, nTEMP4, 0)<0;8,1>:f REG2(r, nTEMP8, 0)<0;4,1>:f
+ mov (2) REG2(r, nTEMP14, 0)<1>:ud REG2(r, nTEMP16, 0)<0;2,4>:f
+ mov (2) REG2(r, nTEMP12, 8)<1>:ub REG2(r, nTEMP14, 0)<0;2,4>:ub
+ dp4 (8) REG2(r, nTEMP16, 0)<1>:f REG2(r, nTEMP5, 0)<0;8,1>:f REG2(r, nTEMP8, 0)<0;4,1>:f
+ mov (2) REG2(r, nTEMP14, 0)<1>:ud REG2(r, nTEMP16, 0)<0;2,4>:f
+ mov (2) REG2(r, nTEMP12, 10)<1>:ub REG2(r, nTEMP14, 0)<0;2,4>:ub
+ dp4 (8) REG2(r, nTEMP16, 0)<1>:f REG2(r, nTEMP6, 0)<0;8,1>:f REG2(r, nTEMP8, 0)<0;4,1>:f
+ mov (2) REG2(r, nTEMP14, 0)<1>:ud REG2(r, nTEMP16, 0)<0;2,4>:f
+ mov (2) REG2(r, nTEMP12, 12)<1>:ub REG2(r, nTEMP14, 0)<0;2,4>:ub
+ dp4 (8) REG2(r, nTEMP16, 0)<1>:f REG2(r, nTEMP7, 0)<0;8,1>:f REG2(r, nTEMP8, 0)<0;4,1>:f
+ mov (2) REG2(r, nTEMP14, 0)<1>:ud REG2(r, nTEMP16, 0)<0;2,4>:f
+ mov (2) REG2(r, nTEMP12, 14)<1>:ub REG2(r, nTEMP14, 0)<0;2,4>:ub
+
+ // #### write the 16 packed Y bytes to the destination row
+ mov (16) uwDEST_Y(%1)<1> REG2(r,nTEMP12, 0)<0;16,1>:ub
+
+ // ###### do one row for U
+ // ##### dp4(nTEMP16) and save result to w format(nTEMP12); the value is signed until 128 is added below
+ dp4 (8) REG2(r, nTEMP16, 0)<1>:f REG2(r, nTEMP0, 0)<0;8,1>:f REG2(r, nTEMP8, 4)<0;4,1>:f
+ mov (2) REG2(r, nTEMP14, 0)<1>:d REG2(r, nTEMP16, 0)<0;2,4>:f
+ mov (2) REG2(r, nTEMP12, 0)<1>:w REG2(r, nTEMP14, 0)<0;2,2>:w
+ dp4 (8) REG2(r, nTEMP16, 0)<1>:f REG2(r, nTEMP1, 0)<0;8,1>:f REG2(r, nTEMP8, 4)<0;4,1>:f
+ mov (2) REG2(r, nTEMP14, 0)<1>:d REG2(r, nTEMP16, 0)<0;2,4>:f
+ mov (2) REG2(r, nTEMP12, 2)<1>:w REG2(r, nTEMP14, 0)<0;2,2>:w
+ dp4 (8) REG2(r, nTEMP16, 0)<1>:f REG2(r, nTEMP2, 0)<0;8,1>:f REG2(r, nTEMP8, 4)<0;4,1>:f
+ mov (2) REG2(r, nTEMP14, 0)<1>:d REG2(r, nTEMP16, 0)<0;2,4>:f
+ mov (2) REG2(r, nTEMP12, 4)<1>:w REG2(r, nTEMP14, 0)<0;2,2>:w
+ dp4 (8) REG2(r, nTEMP16, 0)<1>:f REG2(r, nTEMP3, 0)<0;8,1>:f REG2(r, nTEMP8, 4)<0;4,1>:f
+ mov (2) REG2(r, nTEMP14, 0)<1>:d REG2(r, nTEMP16, 0)<0;2,4>:f
+ mov (2) REG2(r, nTEMP12, 6)<1>:w REG2(r, nTEMP14, 0)<0;2,2>:w
+ dp4 (8) REG2(r, nTEMP16, 0)<1>:f REG2(r, nTEMP4, 0)<0;8,1>:f REG2(r, nTEMP8, 4)<0;4,1>:f
+ mov (2) REG2(r, nTEMP14, 0)<1>:d REG2(r, nTEMP16, 0)<0;2,4>:f
+ mov (2) REG2(r, nTEMP12, 8)<1>:w REG2(r, nTEMP14, 0)<0;2,2>:w
+ dp4 (8) REG2(r, nTEMP16, 0)<1>:f REG2(r, nTEMP5, 0)<0;8,1>:f REG2(r, nTEMP8, 4)<0;4,1>:f
+ mov (2) REG2(r, nTEMP14, 0)<1>:d REG2(r, nTEMP16, 0)<0;2,4>:f
+ mov (2) REG2(r, nTEMP12, 10)<1>:w REG2(r, nTEMP14, 0)<0;2,2>:w
+ dp4 (8) REG2(r, nTEMP16, 0)<1>:f REG2(r, nTEMP6, 0)<0;8,1>:f REG2(r, nTEMP8, 4)<0;4,1>:f
+ mov (2) REG2(r, nTEMP14, 0)<1>:d REG2(r, nTEMP16, 0)<0;2,4>:f
+ mov (2) REG2(r, nTEMP12, 12)<1>:w REG2(r, nTEMP14, 0)<0;2,2>:w
+ dp4 (8) REG2(r, nTEMP16, 0)<1>:f REG2(r, nTEMP7, 0)<0;8,1>:f REG2(r, nTEMP8, 4)<0;4,1>:f
+ mov (2) REG2(r, nTEMP14, 0)<1>:d REG2(r, nTEMP16, 0)<0;2,4>:f
+ mov (2) REG2(r, nTEMP12, 14)<1>:w REG2(r, nTEMP14, 0)<0;2,2>:w
+ add (16) REG2(r, nTEMP12, 0)<1>:w REG2(r, nTEMP12, 0)<0;16,1>:w 128:w
+ // #### write the low byte of each biased word to the U row
+ mov (16) uwDEST_U(%1)<1> REG2(r,nTEMP12, 0)<0;16,2>:ub
+
+ // ###### do one row for V
+ // ##### dp4(nTEMP16) and save result to w format(nTEMP12); the value is signed until 128 is added below
+ dp4 (8) REG2(r, nTEMP16, 0)<1>:f REG2(r, nTEMP0, 0)<0;8,1>:f REG2(r, nTEMP10, 0)<0;4,1>:f
+ mov (2) REG2(r, nTEMP14, 0)<1>:d REG2(r, nTEMP16, 0)<0;2,4>:f
+ mov (2) REG2(r, nTEMP12, 0)<1>:w REG2(r, nTEMP14, 0)<0;2,2>:w
+ dp4 (8) REG2(r, nTEMP16, 0)<1>:f REG2(r, nTEMP1, 0)<0;8,1>:f REG2(r, nTEMP10, 0)<0;4,1>:f
+ mov (2) REG2(r, nTEMP14, 0)<1>:d REG2(r, nTEMP16, 0)<0;2,4>:f
+ mov (2) REG2(r, nTEMP12, 2)<1>:w REG2(r, nTEMP14, 0)<0;2,2>:w
+ dp4 (8) REG2(r, nTEMP16, 0)<1>:f REG2(r, nTEMP2, 0)<0;8,1>:f REG2(r, nTEMP10, 0)<0;4,1>:f
+ mov (2) REG2(r, nTEMP14, 0)<1>:d REG2(r, nTEMP16, 0)<0;2,4>:f
+ mov (2) REG2(r, nTEMP12, 4)<1>:w REG2(r, nTEMP14, 0)<0;2,2>:w
+ dp4 (8) REG2(r, nTEMP16, 0)<1>:f REG2(r, nTEMP3, 0)<0;8,1>:f REG2(r, nTEMP10, 0)<0;4,1>:f
+ mov (2) REG2(r, nTEMP14, 0)<1>:d REG2(r, nTEMP16, 0)<0;2,4>:f
+ mov (2) REG2(r, nTEMP12, 6)<1>:w REG2(r, nTEMP14, 0)<0;2,2>:w
+ dp4 (8) REG2(r, nTEMP16, 0)<1>:f REG2(r, nTEMP4, 0)<0;8,1>:f REG2(r, nTEMP10, 0)<0;4,1>:f
+ mov (2) REG2(r, nTEMP14, 0)<1>:d REG2(r, nTEMP16, 0)<0;2,4>:f
+ mov (2) REG2(r, nTEMP12, 8)<1>:w REG2(r, nTEMP14, 0)<0;2,2>:w
+ dp4 (8) REG2(r, nTEMP16, 0)<1>:f REG2(r, nTEMP5, 0)<0;8,1>:f REG2(r, nTEMP10, 0)<0;4,1>:f
+ mov (2) REG2(r, nTEMP14, 0)<1>:d REG2(r, nTEMP16, 0)<0;2,4>:f
+ mov (2) REG2(r, nTEMP12, 10)<1>:w REG2(r, nTEMP14, 0)<0;2,2>:w
+ dp4 (8) REG2(r, nTEMP16, 0)<1>:f REG2(r, nTEMP6, 0)<0;8,1>:f REG2(r, nTEMP10, 0)<0;4,1>:f
+ mov (2) REG2(r, nTEMP14, 0)<1>:d REG2(r, nTEMP16, 0)<0;2,4>:f
+ mov (2) REG2(r, nTEMP12, 12)<1>:w REG2(r, nTEMP14, 0)<0;2,2>:w
+ dp4 (8) REG2(r, nTEMP16, 0)<1>:f REG2(r, nTEMP7, 0)<0;8,1>:f REG2(r, nTEMP10, 0)<0;4,1>:f
+ mov (2) REG2(r, nTEMP14, 0)<1>:d REG2(r, nTEMP16, 0)<0;2,4>:f
+ mov (2) REG2(r, nTEMP12, 14)<1>:w REG2(r, nTEMP14, 0)<0;2,2>:w
+ add (16) REG2(r, nTEMP12, 0)<1>:w REG2(r, nTEMP12, 0)<0;16,1>:w 128:w
+
+ // #### write the low byte of each biased word to the V row
+ mov (16) uwDEST_V(%1)<1> REG2(r,nTEMP12, 0)<0;16,2>:ub
+}
diff --git a/src/shaders/post_processing/gen5_6/Common/RGBX_to_YUV_Coef.asm b/src/shaders/post_processing/gen5_6/Common/RGBX_to_YUV_Coef.asm
new file mode 100755
index 0000000..1f58643
--- /dev/null
+++ b/src/shaders/post_processing/gen5_6/Common/RGBX_to_YUV_Coef.asm
@@ -0,0 +1,43 @@
+/*
+ * All Video Processing kernels
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ * Authors:
+ * Halley Zhao <halley.zhao at intel.com>
+ */
+
+// Module name: RGBX_to_YUV_Coef.asm
+//----------------------------------------------------------------
+
+#ifdef FIX_POINT_CONVERSION
+ // Y = ( ( 66 * R + 129 * G + 25 * B + 128 ) >> 8) + 16
+ mov (1) REG2(r, nTEMP0, 0):ud 0x00428119:ud // bytes low->high: B=25, G=129, R=66, X=0; used as unsigned byte
+ // U = ( ( -38 * R - 74 * G + 112 * B + 128 ) >> 8) + 128
+ mov (1) REG2(r, nTEMP0, 1):ud 0x00DAB670:ud // bytes low->high: B=112, G=-74, R=-38, X=0; used as signed byte
+ // V = ( ( 112 * R - 94 * G - 18 * B + 128 ) >> 8) + 128
+ mov (1) REG2(r, nTEMP0, 2):ud 0x0070A2EE:ud // bytes low->high: B=-18, G=-94, R=112, X=0; used as signed byte
+#else
+ // Y = 0.299R + 0.587G + 0.114B
+ mov (1) REG2(r, nTEMP8, 0):f 0.114f // B coef
+ mov (1) REG2(r, nTEMP8, 1):f 0.587f // G coef
+ mov (1) REG2(r, nTEMP8, 2):f 0.299f // R coef
+ mov (1) REG2(r, nTEMP8, 3):f 0.000f // A coef
+
+ // Cb= -0.169R - 0.331G + 0.499B + 128 (alternative, not used below)
+ // U = -0.147R - 0.289G + 0.436B + 128 (coefficients used below)
+ mov (1) REG2(r, nTEMP8, 4):f 0.436f // B coef
+ mov (1) REG2(r, nTEMP8, 5):f -0.289f // G coef
+ mov (1) REG2(r, nTEMP8, 6):f -0.147f // R coef
+ mov (1) REG2(r, nTEMP8, 7):f 0.000f // A coef
+ // Cr= 0.499R - 0.418G - 0.0813B+ 128 (alternative, not used below)
+ // V = 0.615R - 0.515G - 0.100B + 128 (coefficients used below; NOTE(review): 0.615*255 + 128 > 255, so V may not fit in 8 bits -- confirm)
+ mov (1) REG2(r, nTEMP10, 0):f -0.100f // B coef
+ mov (1) REG2(r, nTEMP10, 1):f -0.515f // G coef
+ mov (1) REG2(r, nTEMP10, 2):f 0.615f // R coef
+ mov (1) REG2(r, nTEMP10, 3):f 0.000f // A coef
+#endif
+
diff --git a/src/shaders/post_processing/gen5_6/Makefile.am b/src/shaders/post_processing/gen5_6/Makefile.am
index 1cc1ecb..8658938 100755
--- a/src/shaders/post_processing/gen5_6/Makefile.am
+++ b/src/shaders/post_processing/gen5_6/Makefile.am
@@ -20,6 +20,7 @@ INTEL_PP_G4B_GEN5 = \
pl3_load_save_pa.g4b.gen5 \
pa_load_save_nv12.g4b.gen5 \
pa_load_save_pl3.g4b.gen5 \
+ rgbx_load_save_nv12.g4b.gen5 \
$(NULL)
INTEL_PP_G6B = \
@@ -35,6 +36,7 @@ INTEL_PP_G6B = \
pl3_load_save_pa.g6b \
pa_load_save_nv12.g6b \
pa_load_save_pl3.g6b \
+ rgbx_load_save_nv12.g6b \
$(NULL)
INTEL_PP_ASM = \
@@ -50,6 +52,7 @@ INTEL_PP_ASM = \
pl3_load_save_pa.asm \
pa_load_save_nv12.asm \
pa_load_save_pl3.asm \
+ rgbx_load_save_nv12.asm \
$(NULL)
INTEL_PP_ASM += \
@@ -86,6 +89,10 @@ INTEL_PP_ASM += \
Common/RGB16x8_Save_RGB16.asm \
Common/RGB16x8_Save_Y416.asm \
Common/RGB_Pack.asm \
+ Common/RGBX_Load_16x8.asm \
+ Common/RGBX_to_YUV_Coef.asm \
+ Common/RGBX_Save_YUV_Fix.asm \
+ Common/RGBX_Save_YUV_Float.asm \
Common/SetupVPKernel.asm \
Common/readSampler16x1.asm \
Core_Kernels/AVS_SetupFirstBlock.asm \
@@ -145,6 +152,7 @@ INTEL_PP_INC = \
Common/RGB16x8_Save_RGB.inc \
Common/RGB16x8_Save_RGB16.inc \
Common/RGB16x8_Save_Y416.inc \
+ Common/RGBX_Load_16x8.inc \
Common/common.inc \
Common/undefall.inc \
Core_Kernels/AVS_IEF.inc \
diff --git a/src/shaders/post_processing/gen5_6/rgbx_load_save_nv12.asm b/src/shaders/post_processing/gen5_6/rgbx_load_save_nv12.asm
new file mode 100755
index 0000000..4922cc7
--- /dev/null
+++ b/src/shaders/post_processing/gen5_6/rgbx_load_save_nv12.asm
@@ -0,0 +1,26 @@
+// Module name: RGBX_LOAD_SAVE_NV12 -- loads RGBX blocks, converts them to YUV and saves them as NV12
+.kernel RGBX_LOAD_SAVE_NV12
+.code
+#define FIX_POINT_CONVERSION
+// #define FLOAT_POINT_CONVERSION // define this instead of FIX_POINT_CONVERSION to select the float (dp4) conversion path
+
+#include "SetupVPKernel.asm"
+#include "RGBX_to_YUV_Coef.asm"
+#include "Multiple_Loop_Head.asm"
+#include "RGBX_Load_16x8.asm"
+#ifdef FIX_POINT_CONVERSION
+ #include "RGBX_Save_YUV_Fix.asm"
+#else
+ #include "RGBX_Save_YUV_Float.asm"
+#endif
+#include "PL16x8_PL8x4.asm"
+#include "PL8x4_Save_NV12.asm"
+#include "Multiple_Loop.asm"
+
+END_THREAD // End of Thread
+
+.end_code
+
+.end_kernel
+
+// End of rgbx_load_save_nv12.asm
--
1.7.4.1
More information about the Libva
mailing list