[Libva] [PATCH 3/4] add GPU shader for NV12->RGBX conversion
Zhao Halley
halley.zhao at intel.com
Thu Jul 12 23:33:29 PDT 2012
---
.../gen5_6/Common/NV12_Load_8x4.asm | 32 +-
.../post_processing/gen5_6/Common/PL2_Load.inc | 3 +
.../gen5_6/Common/YUVX_Save_RGBX_Fix.asm | 178 +++++
.../gen5_6/Common/YUVX_Save_RGBX_Float.asm | 287 ++++++++
.../gen5_6/Common/YUV_to_RGBX_Coef.asm | 65 ++
src/shaders/post_processing/gen5_6/Makefile.am | 6 +
.../post_processing/gen5_6/nv12_load_save_rgbx.asm | 25 +
.../gen5_6/nv12_load_save_rgbx.g4b.gen5 | 648 ++++++++++++++++++
.../post_processing/gen5_6/nv12_load_save_rgbx.g6b | 721 ++++++++++++++++++++
9 files changed, 1963 insertions(+), 2 deletions(-)
mode change 100644 => 100755 src/shaders/post_processing/gen5_6/Common/NV12_Load_8x4.asm
mode change 100644 => 100755 src/shaders/post_processing/gen5_6/Common/PL2_Load.inc
create mode 100755 src/shaders/post_processing/gen5_6/Common/YUVX_Save_RGBX_Fix.asm
create mode 100755 src/shaders/post_processing/gen5_6/Common/YUVX_Save_RGBX_Float.asm
create mode 100755 src/shaders/post_processing/gen5_6/Common/YUV_to_RGBX_Coef.asm
create mode 100755 src/shaders/post_processing/gen5_6/nv12_load_save_rgbx.asm
create mode 100644 src/shaders/post_processing/gen5_6/nv12_load_save_rgbx.g4b.gen5
create mode 100644 src/shaders/post_processing/gen5_6/nv12_load_save_rgbx.g6b
diff --git a/src/shaders/post_processing/gen5_6/Common/NV12_Load_8x4.asm b/src/shaders/post_processing/gen5_6/Common/NV12_Load_8x4.asm
old mode 100644
new mode 100755
index dbc47d4..5d16a1b
--- a/src/shaders/post_processing/gen5_6/Common/NV12_Load_8x4.asm
+++ b/src/shaders/post_processing/gen5_6/Common/NV12_Load_8x4.asm
@@ -29,14 +29,42 @@
send (8) udSRC_U(0)<1> mMSGHDRU udDUMMY_NULL nDATAPORT_READ nDPMR_MSGDSC+nDPR_MSG_SIZE_UV+nBI_CURRENT_SRC_UV:ud
// Convert to word-aligned format ----------------------------------------------
-#if !defined(LOAD_UV_ONLY)
+#if defined(FIX_POINT_CONVERSION) || defined(FLOAT_POINT_CONVERSION)
+ // load NV12 and pack it as 4-byte pixels (byte 0 = Y, byte 1 = U, byte 2 = V, byte 3 untouched) in dst (64x8)
+
+ $for (nY_NUM_OF_ROWS-1; >-1; -1) {
+ // #### Y: 16 luma bytes of this row go to byte 0 of each 4-byte pixel, 8 pixels per mov
+ mov (8) ubDEST_Y(0,%1*16*4)<4> ubSRC_Y(0,%1*16)<0;8,1>
+ mov (8) ubDEST_Y(0,(%1*16+8)*4)<4> ubSRC_Y(0,%1*16+8)<0;8,1>
+
+ // #### U/V: chroma row %1/2 is used, so two luma rows share one chroma row
+ // the direct form below fails to assemble with "Invalid horiz size 8", so each U/V pair is duplicated in a temp register first
+ // mov (4) ubDEST_Y(0,%1*16*4+1)<8> ubSRC_U(0,%1/2*16)<0;4,2>
+ // mov (4) ubDEST_Y(0,%1*16*4+1+32)<8> ubSRC_U(0,%1/2*16+8)<0;4,2>
+
+ // repeat each 16-bit U/V pair twice: first into the even words of nTEMP18, then into the odd words
+ mov (8) REG2(r,nTEMP18,0)<2>:uw uwSRC_U(0,%1/2*8)<0;8,1>
+ mov (8) REG2(r,nTEMP18,1)<2>:uw uwSRC_U(0,%1/2*8)<0;8,1>
+
+ // mov U/V to ubDEST: even bytes of nTEMP18 (U) go to byte 1 of each pixel, odd bytes (V) go to byte 2
+ mov (8) ubDEST_Y(0,%1*16*4+1)<4> REG2(r,nTEMP18,0)<0;8,2>:ub
+ mov (8) ubDEST_Y(0,%1*16*4+1+32)<4> REG2(r,nTEMP18,16)<0;8,2>:ub
+
+ mov (8) ubDEST_Y(0,%1*16*4+2)<4> REG2(r,nTEMP18,1)<0;8,2>:ub
+ mov (8) ubDEST_Y(0,%1*16*4+2+32)<4> REG2(r,nTEMP18,17)<0;8,2>:ub
+ }
+#else
+ #if !defined(LOAD_UV_ONLY)
$for (nY_NUM_OF_ROWS-1; >-1; -1) {
mov (16) uwDEST_Y(0,%1*16)<1> ubSRC_Y(0,%1*16)
}
-#endif
+ #endif
$for (nUV_NUM_OF_ROWS/2-1; >-1; -1) {
+ // why "mov (16)"? should it be 8?
mov (16) uwDEST_U(0,%1*16)<1> ubSRC_U(0,%1*32)<32;16,2>
mov (16) uwDEST_V(0,%1*16)<1> ubSRC_U(0,%1*32+1)<32;16,2>
}
+#endif
+
// End of NV12_Load_8x4
diff --git a/src/shaders/post_processing/gen5_6/Common/PL2_Load.inc b/src/shaders/post_processing/gen5_6/Common/PL2_Load.inc
old mode 100644
new mode 100755
index 9feeba6..0003dae
--- a/src/shaders/post_processing/gen5_6/Common/PL2_Load.inc
+++ b/src/shaders/post_processing/gen5_6/Common/PL2_Load.inc
@@ -73,6 +73,9 @@
#define uwDEST_U uwTOP_U
#define uwDEST_V uwTOP_V
+#define ubDEST_Y ubTOP_Y // destination alias written by the color-conversion branch of NV12_Load_8x4.asm
+#define uwSRC_U uwBOT_U // word-typed source alias read by the same branch when duplicating U/V pairs
+
#define nSRC_REGION nREGION_1 // REGION_1 will be the source region for first kernel
// End of PL2_Load.inc
diff --git a/src/shaders/post_processing/gen5_6/Common/YUVX_Save_RGBX_Fix.asm b/src/shaders/post_processing/gen5_6/Common/YUVX_Save_RGBX_Fix.asm
new file mode 100755
index 0000000..6f4a881
--- /dev/null
+++ b/src/shaders/post_processing/gen5_6/Common/YUVX_Save_RGBX_Fix.asm
@@ -0,0 +1,178 @@
+/*
+ * All Video Processing kernels
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ * Authors:
+ * Halley Zhao <halley.zhao at intel.com>
+ */
+
+// Module name: YUVX_Save_RGBX_Fix.asm
+//----------------------------------------------------------------
+
+#include "RGBX_Load_16x8.inc"
+
+#if (0)
+ #define nTEMP0 34 // transformation coefficient
+ #define nTEMP1 35 // one row of R (first half register is used)
+ #define nTEMP2 36 // one row of G (first half register is used)
+ #define nTEMP3 37 // one row of B (first half register is used)
+ #define nTEMP4 38 // mul and add
+ #define nTEMP5 39 // mul and add
+ #define nTEMP6 40 // mul and add
+ #define nTEMP7 41 // mul and add
+ #define nTEMP8 42 // sum of mul
+ #define nTEMP10 44
+ #define nTEMP10 44 // split ub pixel to word width 1st quarter
+ #define nTEMP12 46 // split ub pixel to word width 2nd quarter
+ #define nTEMP14 48 // split ub pixel to word width 3rd quarter
+ #define nTEMP16 50 // split ub pixel to word width 4th quarter
+ #define nTEMP17 51
+ #define nTEMP18 52
+
+ #define nTEMP24 58 // temp using for repeat U/V in NV12_Load_8x4.asm
+#endif
+
+#define ONE_ROW_DEBUG 0
+
+#if (ONE_ROW_DEBUG)
+ #define ROW_NUM 0
+ #define DBG_ROWNUM_BASE 1
+#else
+ #define ROW_NUM %1
+ $for(0; <nY_NUM_OF_ROWS; 1) {
+#endif
+ // C = Y' - 16 D = U - 128 E = V - 128
+ add (16) REG2(r,nTEMP10,0)<1>:w ubDEST_RGBX(0,ROW_NUM*64 )<0;16,1> REG2(r,nTEMP0,24)<0;4,1>:b
+ add (16) REG2(r,nTEMP12,0)<1>:w ubDEST_RGBX(0,ROW_NUM*64+16)<0;16,1> REG2(r,nTEMP0,24)<0;4,1>:b
+ add (16) REG2(r,nTEMP14,0)<1>:w ubDEST_RGBX(0,ROW_NUM*64+32)<0;16,1> REG2(r,nTEMP0,24)<0;4,1>:b
+ add (16) REG2(r,nTEMP16,0)<1>:w ubDEST_RGBX(0,ROW_NUM*64+48)<0;16,1> REG2(r,nTEMP0,24)<0;4,1>:b
+
+#if (ONE_ROW_DEBUG)
+ mov (16) ubDEST_RGBX(0,(DBG_ROWNUM_BASE)*64 )<1> REG2(r,nTEMP10, 0)<0;16,2>:ub
+ mov (16) ubDEST_RGBX(0,(DBG_ROWNUM_BASE)*64+16)<1> REG2(r,nTEMP12, 0)<0;16,2>:ub
+ mov (16) ubDEST_RGBX(0,(DBG_ROWNUM_BASE)*64+32)<1> REG2(r,nTEMP14, 0)<0;16,2>:ub
+ mov (16) ubDEST_RGBX(0,(DBG_ROWNUM_BASE)*64+48)<1> REG2(r,nTEMP16, 0)<0;16,2>:ub
+#endif
+
+ // |Y|U|V|X|==>|R|G|B|X|
+ // ###### do one row for R
+ // #### mul and add
+ mul.sat (16) REG2(r, nTEMP4, 0)<1>:w REG2(r,nTEMP10,0)<0;16,1>:w REG2(r, nTEMP0, 0)<0;4,1>:w
+ mul.sat (16) REG2(r, nTEMP5, 0)<1>:w REG2(r,nTEMP12,0)<0;16,1>:w REG2(r, nTEMP0, 0)<0;4,1>:w
+ mul.sat (16) REG2(r, nTEMP6, 0)<1>:w REG2(r,nTEMP14,0)<0;16,1>:w REG2(r, nTEMP0, 0)<0;4,1>:w
+ mul.sat (16) REG2(r, nTEMP7, 0)<1>:w REG2(r,nTEMP16,0)<0;16,1>:w REG2(r, nTEMP0, 0)<0;4,1>:w
+
+ #if (ONE_ROW_DEBUG)
+ mov (8) ubDEST_RGBX(0,(DBG_ROWNUM_BASE+1)*64+2 )<4> REG2(r,nTEMP0, 0)<0;8,1>:ub
+ mov (8) ubDEST_RGBX(0,(DBG_ROWNUM_BASE+1)*64+2+32)<4> REG2(r,nTEMP0, 0)<0;8,1>:ub
+
+ mov (8) ubDEST_RGBX(0,(DBG_ROWNUM_BASE+2)*64+2 )<4> REG2(r,nTEMP4, 0)<0;8,1>:ub
+ mov (8) ubDEST_RGBX(0,(DBG_ROWNUM_BASE+2)*64+2+32)<4> REG2(r,nTEMP4, 8)<0;8,1>:ub
+ #endif
+
+ add.sat (4) REG2(r, nTEMP4, 0)<4>:uw REG2(r, nTEMP4, 0)<0;4,4>:w REG2(r, nTEMP4, 2)<0;4,4>:w
+ add.sat (4) REG2(r, nTEMP5, 0)<4>:uw REG2(r, nTEMP5, 0)<0;4,4>:w REG2(r, nTEMP5, 2)<0;4,4>:w
+ add.sat (4) REG2(r, nTEMP6, 0)<4>:uw REG2(r, nTEMP6, 0)<0;4,4>:w REG2(r, nTEMP6, 2)<0;4,4>:w
+ add.sat (4) REG2(r, nTEMP7, 0)<4>:uw REG2(r, nTEMP7, 0)<0;4,4>:w REG2(r, nTEMP7, 2)<0;4,4>:w
+
+ // #### write one row of R to rnTEMP1
+ mov (4) REG2(r, nTEMP8, 0)<1>:uw REG2(r, nTEMP4, 0)<0; 4, 4>:uw
+ mov (4) REG2(r, nTEMP8, 4)<1>:uw REG2(r, nTEMP5, 0)<0; 4, 4>:uw
+ mov (4) REG2(r, nTEMP8, 8)<1>:uw REG2(r, nTEMP6, 0)<0; 4, 4>:uw
+ mov (4) REG2(r, nTEMP8, 12)<1>:uw REG2(r, nTEMP7, 0)<0; 4, 4>:uw
+
+ #if (ONE_ROW_DEBUG)
+ mov (8) ubDEST_RGBX(0,(DBG_ROWNUM_BASE+3)*64+2 )<4> REG2(r,nTEMP8, 0)<0;8,1>:ub
+ mov (8) ubDEST_RGBX(0,(DBG_ROWNUM_BASE+3)*64+2+32)<4> REG2(r,nTEMP8, 8)<0;8,1>:ub
+ #endif
+
+ add.sat (16) REG2(r, nTEMP8, 0)<1>:uw REG2(r, nTEMP8, 0)<0; 16, 1>:uw 0x80:uw // todo, combine mov and add
+ shl.sat (16) REG2(r, nTEMP8, 0)<1>:uw REG2(r, nTEMP8, 0)<0; 16, 1>:uw 1:w
+ mov (16) REG2(r, nTEMP1, 0)<1>:ub REG2(r, nTEMP8, 1)<0; 16, 2>:ub
+
+ #if (ONE_ROW_DEBUG)
+ mov (8) ubDEST_RGBX(0,(DBG_ROWNUM_BASE+4)*64+2 )<4> REG2(r,nTEMP8, 0)<0;8,1>:ub
+ mov (8) ubDEST_RGBX(0,(DBG_ROWNUM_BASE+4)*64+2+32)<4> REG2(r,nTEMP8, 8)<0;8,1>:ub
+ #endif
+ // ###### do one row for G
+ // #### mul and add
+ mul.sat (16) REG2(r, nTEMP4, 0)<1>:w REG2(r,nTEMP10,0)<0;16,1>:w REG2(r, nTEMP0, 4)<0;4,1>:w
+ mul.sat (16) REG2(r, nTEMP5, 0)<1>:w REG2(r,nTEMP12,0)<0;16,1>:w REG2(r, nTEMP0, 4)<0;4,1>:w
+ mul.sat (16) REG2(r, nTEMP6, 0)<1>:w REG2(r,nTEMP14,0)<0;16,1>:w REG2(r, nTEMP0, 4)<0;4,1>:w
+ mul.sat (16) REG2(r, nTEMP7, 0)<1>:w REG2(r,nTEMP16,0)<0;16,1>:w REG2(r, nTEMP0, 4)<0;4,1>:w
+
+ add.sat (4) REG2(r, nTEMP4, 0)<4>:uw REG2(r, nTEMP4, 0)<0;4,4>:w REG2(r, nTEMP4, 1)<0;4,4>:w
+ add.sat (4) REG2(r, nTEMP4, 0)<4>:uw REG2(r, nTEMP4, 0)<0;4,4>:uw REG2(r, nTEMP4, 2)<0;4,4>:w
+ add.sat (4) REG2(r, nTEMP5, 0)<4>:uw REG2(r, nTEMP5, 0)<0;4,4>:w REG2(r, nTEMP5, 1)<0;4,4>:w
+ add.sat (4) REG2(r, nTEMP5, 0)<4>:uw REG2(r, nTEMP5, 0)<0;4,4>:uw REG2(r, nTEMP5, 2)<0;4,4>:w
+ add.sat (4) REG2(r, nTEMP6, 0)<4>:uw REG2(r, nTEMP6, 0)<0;4,4>:w REG2(r, nTEMP6, 1)<0;4,4>:w
+ add.sat (4) REG2(r, nTEMP6, 0)<4>:uw REG2(r, nTEMP6, 0)<0;4,4>:uw REG2(r, nTEMP6, 2)<0;4,4>:w
+ add.sat (4) REG2(r, nTEMP7, 0)<4>:uw REG2(r, nTEMP7, 0)<0;4,4>:w REG2(r, nTEMP7, 1)<0;4,4>:w
+ add.sat (4) REG2(r, nTEMP7, 0)<4>:uw REG2(r, nTEMP7, 0)<0;4,4>:uw REG2(r, nTEMP7, 2)<0;4,4>:w
+
+ // #### write one row of G to rnTEMP2: gather the per-pixel sums (word 0 of every 4-word pixel) into nTEMP8
+ mov (4) REG2(r, nTEMP8, 0)<1>:uw REG2(r, nTEMP4, 0)<0; 4, 4>:uw
+ mov (4) REG2(r, nTEMP8, 4)<1>:uw REG2(r, nTEMP5, 0)<0; 4, 4>:uw
+ mov (4) REG2(r, nTEMP8, 8)<1>:uw REG2(r, nTEMP6, 0)<0; 4, 4>:uw
+ mov (4) REG2(r, nTEMP8, 12)<1>:uw REG2(r, nTEMP7, 0)<0; 4, 4>:uw
+
+ add.sat (16) REG2(r, nTEMP8, 0)<1>:uw REG2(r, nTEMP8, 0)<0; 16, 1>:uw 0x80:uw // rounding term; saturate like the R and B rows so the sum cannot wrap
+ shl.sat (16) REG2(r, nTEMP8, 0)<1>:uw REG2(r, nTEMP8, 0)<0; 16, 1>:uw 1:w
+ mov (16) REG2(r, nTEMP2, 0)<1>:ub REG2(r, nTEMP8, 1)<0; 16, 2>:ub
+
+ // ###### do one row for B
+ // #### mul and add
+ mul.sat (16) REG2(r, nTEMP4, 0)<1>:w REG2(r,nTEMP10,0)<0;16,1>:w REG2(r, nTEMP0, 8)<0;4,1>:w
+ mul.sat (16) REG2(r, nTEMP5, 0)<1>:w REG2(r,nTEMP12,0)<0;16,1>:w REG2(r, nTEMP0, 8)<0;4,1>:w
+ mul.sat (16) REG2(r, nTEMP6, 0)<1>:w REG2(r,nTEMP14,0)<0;16,1>:w REG2(r, nTEMP0, 8)<0;4,1>:w
+ mul.sat (16) REG2(r, nTEMP7, 0)<1>:w REG2(r,nTEMP16,0)<0;16,1>:w REG2(r, nTEMP0, 8)<0;4,1>:w
+
+ #if (ONE_ROW_DEBUG)
+ mov (8) ubDEST_RGBX(0,(DBG_ROWNUM_BASE+1)*64 )<4> REG2(r,nTEMP0, 16)<0;8,1>:ub
+ mov (8) ubDEST_RGBX(0,(DBG_ROWNUM_BASE+1)*64+32)<4> REG2(r,nTEMP0, 16)<0;8,1>:ub
+
+ mov (8) ubDEST_RGBX(0,(DBG_ROWNUM_BASE+2)*64 )<4> REG2(r,nTEMP4, 0)<0;8,1>:ub
+ mov (8) ubDEST_RGBX(0,(DBG_ROWNUM_BASE+2)*64+32)<4> REG2(r,nTEMP4, 8)<0;8,1>:ub
+ #endif
+
+ add.sat (4) REG2(r, nTEMP4, 0)<4>:uw REG2(r, nTEMP4, 0)<0;4,4>:w REG2(r, nTEMP4, 1)<0;4,4>:w
+ add.sat (4) REG2(r, nTEMP5, 0)<4>:uw REG2(r, nTEMP5, 0)<0;4,4>:w REG2(r, nTEMP5, 1)<0;4,4>:w
+ add.sat (4) REG2(r, nTEMP6, 0)<4>:uw REG2(r, nTEMP6, 0)<0;4,4>:w REG2(r, nTEMP6, 1)<0;4,4>:w
+ add.sat (4) REG2(r, nTEMP7, 0)<4>:uw REG2(r, nTEMP7, 0)<0;4,4>:w REG2(r, nTEMP7, 1)<0;4,4>:w
+
+ // #### write one row of B to rnTEMP3
+ mov (4) REG2(r, nTEMP8, 0)<1>:uw REG2(r, nTEMP4, 0)<0; 4, 4>:uw
+ mov (4) REG2(r, nTEMP8, 4)<1>:uw REG2(r, nTEMP5, 0)<0; 4, 4>:uw
+ mov (4) REG2(r, nTEMP8, 8)<1>:uw REG2(r, nTEMP6, 0)<0; 4, 4>:uw
+ mov (4) REG2(r, nTEMP8, 12)<1>:uw REG2(r, nTEMP7, 0)<0; 4, 4>:uw
+
+ #if (ONE_ROW_DEBUG)
+ mov (8) ubDEST_RGBX(0,(DBG_ROWNUM_BASE+3)*64 )<4> REG2(r,nTEMP8, 0)<0;8,1>:ub
+ mov (8) ubDEST_RGBX(0,(DBG_ROWNUM_BASE+3)*64+32)<4> REG2(r,nTEMP8, 8)<0;8,1>:ub
+ #endif
+
+ add.sat (16) REG2(r, nTEMP8, 0)<1>:uw REG2(r, nTEMP8, 0)<0; 16, 1>:uw 0x80:uw // saturation
+ shl.sat (16) REG2(r, nTEMP8, 0)<1>:uw REG2(r, nTEMP8, 0)<0; 16, 1>:uw 1:w
+ mov (16) REG2(r, nTEMP3, 0)<1>:ub REG2(r, nTEMP8, 1)<0; 16, 2>:ub
+
+ #if (ONE_ROW_DEBUG)
+ mov (8) ubDEST_RGBX(0,(DBG_ROWNUM_BASE+4)*64 )<4> REG2(r,nTEMP8, 0)<0;8,1>:ub
+ mov (8) ubDEST_RGBX(0,(DBG_ROWNUM_BASE+4)*64+32)<4> REG2(r,nTEMP8, 8)<0;8,1>:ub
+ #endif
+
+ // B
+ mov (8) ubDEST_RGBX(0,ROW_NUM*64 )<4> REG2(r,nTEMP3, 0)<0;8,1>:ub
+ mov (8) ubDEST_RGBX(0,ROW_NUM*64+32)<4> REG2(r,nTEMP3, 8)<0;8,1>:ub
+ // G
+ mov (8) ubDEST_RGBX(0,ROW_NUM*64+1 )<4> REG2(r,nTEMP2, 0)<0;8,1>:ub
+ mov (8) ubDEST_RGBX(0,ROW_NUM*64+1+32)<4> REG2(r,nTEMP2, 8)<0;8,1>:ub
+ // R
+ mov (8) ubDEST_RGBX(0,ROW_NUM*64+2 )<4> REG2(r,nTEMP1, 0)<0;8,1>:ub
+ mov (8) ubDEST_RGBX(0,ROW_NUM*64+2+32)<4> REG2(r,nTEMP1, 8)<0;8,1>:ub
+#if (!ONE_ROW_DEBUG)
+ }
+#endif
diff --git a/src/shaders/post_processing/gen5_6/Common/YUVX_Save_RGBX_Float.asm b/src/shaders/post_processing/gen5_6/Common/YUVX_Save_RGBX_Float.asm
new file mode 100755
index 0000000..f7585d4
--- /dev/null
+++ b/src/shaders/post_processing/gen5_6/Common/YUVX_Save_RGBX_Float.asm
@@ -0,0 +1,287 @@
+/*
+ * All Video Processing kernels
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ * Authors:
+ * Halley Zhao <halley.zhao at intel.com>
+ */
+
+// Module name: YUVX_Save_RGBX_Float.asm
+//----------------------------------------------------------------
+
+#include "RGBX_Load_16x8.inc"
+
+#if (0)
+ // 8 grf reg for one row of pixel (2 pixel per grf)
+ #define nTEMP0 34
+ #define nTEMP1 35
+ #define nTEMP2 36
+ #define nTEMP3 37
+ #define nTEMP4 38
+ #define nTEMP5 39
+ #define nTEMP6 40
+ #define nTEMP7 41
+
+ #define nTEMP8 42 // transformation coefficient
+ #define nTEMP10 44 // transformation coefficient
+
+ #define nTEMP12 46 // save Y/U/V in ub format
+ #define nTEMP14 48 // save YUV in ud format
+ #define nTEMP16 50 // dp4 result
+ #define nTEMP17 51
+ #define nTEMP18 52 // temp used for repeat U/V in NV12_Load_8x4.asm
+
+ #define nTEMP24 58 // it is not safe to use in my case. I try to use it for repeat U/V in NV12_Load_8x4.asm, Y data is taint in row 4/5
+#endif
+
+#define ONE_ROW_DEBUG 0
+
+#if (ONE_ROW_DEBUG)
+ // to debug a row other than the first one:
+ // 1. define ROW_NUM_READ as the row you want to debug
+ // 2. define ROW_NUM_WRITE equal to ROW_NUM_READ to overwrite the original YUV data, or as a different row to keep it
+ // 3. where (DBG_ROWNUM_BASE+?) equals ROW_NUM_READ or ROW_NUM_WRITE, change it to DBG_ROWNUM_0 so the debug dumps do not collide with them
+ #define ROW_NUM_READ 0
+ #define ROW_NUM_WRITE 0
+ #define DBG_ROWNUM_BASE 1
+ #define DBG_ROWNUM_0 0
+#else
+ #define ROW_NUM_READ %1
+ #define ROW_NUM_WRITE %1
+ $for(0; <nY_NUM_OF_ROWS; 1) {
+#endif
+ // YUVX | Y | U | V | X |
+ // XRGB | B | G | R | X |
+ // ###### save one row of pixel to temp grf with float format (required by dp4)
+ // C = Y' - 16 D = U - 128 E = V - 128
+
+ // the follow sentence doesn't work, I have to split it into two step
+ // add (4) REG(r, nTEMP0)<1>:f r[SRC_RGBA_OFFSET_1,ROW_NUM_READ*32 + 0]<4,1>:ub REG2(r, nTEMP10, 16)<0;4,1>:b
+
+ add (16) REG2(r,nTEMP12,0)<1>:w ubDEST_RGBX(0,ROW_NUM_READ*64 )<0;16,1> REG2(r,nTEMP10,16)<0;4,1>:b
+ add (16) REG2(r,nTEMP14,0)<1>:w ubDEST_RGBX(0,ROW_NUM_READ*64+16)<0;16,1> REG2(r,nTEMP10,16)<0;4,1>:b
+ add (16) REG2(r,nTEMP16,0)<1>:w ubDEST_RGBX(0,ROW_NUM_READ*64+32)<0;16,1> REG2(r,nTEMP10,16)<0;4,1>:b
+ add (16) REG2(r,nTEMP17,0)<1>:w ubDEST_RGBX(0,ROW_NUM_READ*64+48)<0;16,1> REG2(r,nTEMP10,16)<0;4,1>:b
+
+ mov (8) ufROW_YUVA(0,0)<1> REG2(r, nTEMP12, 0)<0;8,1>:w
+ mov (8) ufROW_YUVA(1,0)<1> REG2(r, nTEMP12, 8)<0;8,1>:w
+ mov (8) ufROW_YUVA(2,0)<1> REG2(r, nTEMP14, 0)<0;8,1>:w
+ mov (8) ufROW_YUVA(3,0)<1> REG2(r, nTEMP14, 8)<0;8,1>:w
+ mov (8) ufROW_YUVA(4,0)<1> REG2(r, nTEMP16, 0)<0;8,1>:w
+ mov (8) ufROW_YUVA(5,0)<1> REG2(r, nTEMP16, 8)<0;8,1>:w
+ mov (8) ufROW_YUVA(6,0)<1> REG2(r, nTEMP17, 0)<0;8,1>:w
+ mov (8) ufROW_YUVA(7,0)<1> REG2(r, nTEMP17, 8)<0;8,1>:w
+
+ #if (ONE_ROW_DEBUG)
+ mov.sat (8) REG2(r, nTEMP14, 0)<1>:ud ufROW_YUVA(0,0)<0;8,1>:f
+ mov (8) REG2(r, nTEMP12, 0)<1>:ub REG2(r, nTEMP14, 0)<0;8,4>:ub
+
+ // write Y-16, U-128, V-128 to the 2nd row of RGB (convert float to int first, write whole ud): 1st half, 2 pixels
+ mov (16) ubDEST_RGBX(0,(DBG_ROWNUM_BASE)*64 )<1> REG2(r,nTEMP14, 0)<0;16,1>:ub
+ mov (16) ubDEST_RGBX(0,(DBG_ROWNUM_BASE)*64+16)<1> REG2(r,nTEMP14, 16)<0;16,1>:ub
+
+ mov.sat (8) REG2(r, nTEMP14, 0)<1>:ud ufROW_YUVA(1,0)<0;8,1>:f
+ mov (8) REG2(r, nTEMP12, 8)<1>:ub REG2(r, nTEMP14, 0)<0;8,4>:ub
+
+ // write Y-16, U-128, V-128 to the 2nd row of RGB (convert float to int first, write whole ud): 2nd half, 2 pixels
+ mov (16) ubDEST_RGBX(0,(DBG_ROWNUM_BASE)*64+32)<1> REG2(r,nTEMP14, 0)<0;16,1>:ub
+ mov (16) ubDEST_RGBX(0,(DBG_ROWNUM_BASE)*64+48)<1> REG2(r,nTEMP14, 16)<0;16,1>:ub
+
+ mov.sat (8) REG2(r, nTEMP14, 0)<1>:ud ufROW_YUVA(2,0)<0;8,1>:f
+ mov (8) REG2(r, nTEMP12, 16)<1>:ub REG2(r, nTEMP14, 0)<0;8,4>:ub
+
+ mov.sat (8) REG2(r, nTEMP14, 0)<1>:ud ufROW_YUVA(3,0)<0;8,1>:f
+ mov (8) REG2(r, nTEMP12, 24)<1>:ub REG2(r, nTEMP14, 0)<0;8,4>:ub
+
+ // write Y-16, U-128, V-128 to the 3rd row of RGB (convert float to int first, only LSB is used): 1st half, 8 pixels
+ mov (16) ubDEST_RGBX(0,(DBG_ROWNUM_BASE+1)*64 )<1> REG2(r,nTEMP12, 0)<0;16,1>:ub
+ mov (16) ubDEST_RGBX(0,(DBG_ROWNUM_BASE+1)*64+16)<1> REG2(r,nTEMP12, 16)<0;16,1>:ub
+
+ mov.sat (8) REG2(r, nTEMP14, 0)<1>:ud ufROW_YUVA(4,0)<0;8,1>:f
+ mov (8) REG2(r, nTEMP12, 0)<1>:ub REG2(r, nTEMP14, 0)<0;8,4>:ub
+
+ mov.sat (8) REG2(r, nTEMP14, 0)<1>:ud ufROW_YUVA(5,0)<0;8,1>:f
+ mov (8) REG2(r, nTEMP12, 8)<1>:ub REG2(r, nTEMP14, 0)<0;8,4>:ub
+
+ mov.sat (8) REG2(r, nTEMP14, 0)<1>:ud ufROW_YUVA(6,0)<0;8,1>:f
+ mov (8) REG2(r, nTEMP12, 16)<1>:ub REG2(r, nTEMP14, 0)<0;8,4>:ub
+
+ mov.sat (8) REG2(r, nTEMP14, 0)<1>:ud ufROW_YUVA(7,0)<0;8,1>:f
+ mov (8) REG2(r, nTEMP12, 24)<1>:ub REG2(r, nTEMP14, 0)<0;8,4>:ub
+
+ // write Y-16, U-128, V-128 to the 3rd row of RGB (convert float to int first, only LSB is used): 2nd half, 8 pixels
+ mov (16) ubDEST_RGBX(0,(DBG_ROWNUM_BASE+1)*64+32)<1> REG2(r,nTEMP12, 0)<0;16,1>:ub
+ mov (16) ubDEST_RGBX(0,(DBG_ROWNUM_BASE+1)*64+48)<1> REG2(r,nTEMP12, 16)<0;16,1>:ub
+ #endif
+
+ // ######## do one row for Red ########
+ #define COEF_REG REG2(r, nTEMP8, 0) // reg for Red coefficient
+ #define CHANNEL 2
+ // ##### dp4(nTEMP16), convert to ud (nTEMP14) and keep the low byte of each result in ub format (nTEMP12)
+ dp4 (8) REG2(r, nTEMP16, 0)<1>:f ufROW_YUVA(0, 0)<0;8,1> COEF_REG<0;4,1>:f
+ mov.sat (2) REG2(r, nTEMP14, 0)<1>:ud REG2(r, nTEMP16, 0)<0;2,4>:f
+ mov (2) REG2(r, nTEMP12, 0)<1>:ub REG2(r, nTEMP14, 0)<0;2,4>:ub // todo for saturation
+
+ #if (ONE_ROW_DEBUG)
+ // write dp4 (raw float) of 2 pixel to the 4/5th row
+ mov (8) ubDEST_RGBX(0,(DBG_ROWNUM_BASE+2)*64+CHANNEL )<4> REG2(r,nTEMP16, 0)<0;8,1>:ub
+ mov (8) ubDEST_RGBX(0,(DBG_ROWNUM_BASE+2)*64+CHANNEL+32)<4> REG2(r,nTEMP16, 8)<0;8,1>:ub
+ mov (8) ubDEST_RGBX(0,(DBG_ROWNUM_BASE+3)*64+CHANNEL )<4> REG2(r,nTEMP16, 16)<0;8,1>:ub
+ mov (8) ubDEST_RGBX(0,(DBG_ROWNUM_BASE+3)*64+CHANNEL+32)<4> REG2(r,nTEMP16, 24)<0;8,1>:ub
+
+ // write dp4 (convert float to ud first, write whole ud) of 2 pixel to the 6/7th row
+ mov (8) REG2(r, nTEMP17, 0)<1>:d REG2(r, nTEMP16, 0)<0;8,1>:f
+ mov (8) ubDEST_RGBX(0,(DBG_ROWNUM_BASE+4)*64+CHANNEL )<4> REG2(r,nTEMP17, 0)<0;8,1>:ub
+ mov (8) ubDEST_RGBX(0,(DBG_ROWNUM_BASE+4)*64+CHANNEL+32)<4> REG2(r,nTEMP17, 8)<0;8,1>:ub
+ mov (8) ubDEST_RGBX(0,(DBG_ROWNUM_BASE+5)*64+CHANNEL )<4> REG2(r,nTEMP17, 16)<0;8,1>:ub
+ mov (8) ubDEST_RGBX(0,(DBG_ROWNUM_BASE+5)*64+CHANNEL+32)<4> REG2(r,nTEMP17, 24)<0;8,1>:ub
+ #endif
+
+ dp4 (8) REG2(r, nTEMP16, 0)<1>:f ufROW_YUVA(1, 0)<0;8,1> COEF_REG<0;4,1>:f
+ mov.sat (2) REG2(r, nTEMP14, 0)<1>:ud REG2(r, nTEMP16, 0)<0;2,4>:f
+ mov (2) REG2(r, nTEMP12, 2)<1>:ub REG2(r, nTEMP14, 0)<0;2,4>:ub
+
+ dp4 (8) REG2(r, nTEMP16, 0)<1>:f ufROW_YUVA(2, 0)<0;8,1> COEF_REG<0;4,1>:f
+ mov.sat (2) REG2(r, nTEMP14, 0)<1>:ud REG2(r, nTEMP16, 0)<0;2,4>:f
+ mov (2) REG2(r, nTEMP12, 4)<1>:ub REG2(r, nTEMP14, 0)<0;2,4>:ub
+
+ dp4 (8) REG2(r, nTEMP16, 0)<1>:f ufROW_YUVA(3, 0)<0;8,1> COEF_REG<0;4,1>:f
+ mov.sat (2) REG2(r, nTEMP14, 0)<1>:ud REG2(r, nTEMP16, 0)<0;2,4>:f
+ mov (2) REG2(r, nTEMP12, 6)<1>:ub REG2(r, nTEMP14, 0)<0;2,4>:ub
+
+ dp4 (8) REG2(r, nTEMP16, 0)<1>:f ufROW_YUVA(4, 0)<0;8,1> COEF_REG<0;4,1>:f
+ mov.sat (2) REG2(r, nTEMP14, 0)<1>:ud REG2(r, nTEMP16, 0)<0;2,4>:f
+ mov (2) REG2(r, nTEMP12, 8)<1>:ub REG2(r, nTEMP14, 0)<0;2,4>:ub
+
+ dp4 (8) REG2(r, nTEMP16, 0)<1>:f ufROW_YUVA(5, 0)<0;8,1> COEF_REG<0;4,1>:f // pixels 10-11; region is <0;8,1> like every other row (a vertical stride of 5 is not a valid value)
+ mov.sat (2) REG2(r, nTEMP14, 0)<1>:ud REG2(r, nTEMP16, 0)<0;2,4>:f
+ mov (2) REG2(r, nTEMP12, 10)<1>:ub REG2(r, nTEMP14, 0)<0;2,4>:ub
+
+ dp4 (8) REG2(r, nTEMP16, 0)<1>:f ufROW_YUVA(6, 0)<0;8,1> COEF_REG<0;4,1>:f // pixels 12-13; same region fix (was a vertical stride of 6)
+ mov.sat (2) REG2(r, nTEMP14, 0)<1>:ud REG2(r, nTEMP16, 0)<0;2,4>:f
+ mov (2) REG2(r, nTEMP12, 12)<1>:ub REG2(r, nTEMP14, 0)<0;2,4>:ub
+
+ dp4 (8) REG2(r, nTEMP16, 0)<1>:f ufROW_YUVA(7, 0)<0;8,1> COEF_REG<0;4,1>:f
+ mov.sat (2) REG2(r, nTEMP14, 0)<1>:ud REG2(r, nTEMP16, 0)<0;2,4>:f
+ mov (2) REG2(r, nTEMP12, 14)<1>:ub REG2(r, nTEMP14, 0)<0;2,4>:ub
+
+ // #### write this channel
+ mov (8) ubDEST_RGBX(0,ROW_NUM_WRITE*64+CHANNEL )<4> REG2(r,nTEMP12, 0)<0;8,1>:ub
+ mov (8) ubDEST_RGBX(0,ROW_NUM_WRITE*64+CHANNEL+32)<4> REG2(r,nTEMP12, 8)<0;8,1>:ub
+
+ // ######## do one row for Green ########
+ #define COEF_REG REG2(r, nTEMP8, 4) // reg for green coefficient
+ #define CHANNEL 1
+ // ##### dp4(nTEMP16), convert to ud (nTEMP14) and keep the low byte of each result in ub format (nTEMP12)
+ dp4 (8) REG2(r, nTEMP16, 0)<1>:f ufROW_YUVA(0, 0)<0;8,1> COEF_REG<0;4,1>:f
+ mov.sat (2) REG2(r, nTEMP14, 0)<1>:ud REG2(r, nTEMP16, 0)<0;2,4>:f
+ mov (2) REG2(r, nTEMP12, 0)<1>:ub REG2(r, nTEMP14, 0)<0;2,4>:ub // todo for saturation
+
+ #if (ONE_ROW_DEBUG)
+ // write dp4 (raw float) of 2 pixel to the 4/5th row
+ mov (8) ubDEST_RGBX(0,(DBG_ROWNUM_BASE+2)*64+CHANNEL )<4> REG2(r,nTEMP16, 0)<0;8,1>:ub
+ mov (8) ubDEST_RGBX(0,(DBG_ROWNUM_BASE+2)*64+CHANNEL+32)<4> REG2(r,nTEMP16, 8)<0;8,1>:ub
+ mov (8) ubDEST_RGBX(0,(DBG_ROWNUM_BASE+3)*64+CHANNEL )<4> REG2(r,nTEMP16, 16)<0;8,1>:ub
+ mov (8) ubDEST_RGBX(0,(DBG_ROWNUM_BASE+3)*64+CHANNEL+32)<4> REG2(r,nTEMP16, 24)<0;8,1>:ub
+
+ // write dp4 (convert float to ud first, write whole ud) of 2 pixel to the 6/7th row
+ mov (8) REG2(r, nTEMP17, 0)<1>:d REG2(r, nTEMP16, 0)<0;8,1>:f
+ mov (8) ubDEST_RGBX(0,(DBG_ROWNUM_BASE+4)*64+CHANNEL )<4> REG2(r,nTEMP17, 0)<0;8,1>:ub
+ mov (8) ubDEST_RGBX(0,(DBG_ROWNUM_BASE+4)*64+CHANNEL+32)<4> REG2(r,nTEMP17, 8)<0;8,1>:ub
+ mov (8) ubDEST_RGBX(0,(DBG_ROWNUM_BASE+5)*64+CHANNEL )<4> REG2(r,nTEMP17, 16)<0;8,1>:ub
+ mov (8) ubDEST_RGBX(0,(DBG_ROWNUM_BASE+5)*64+CHANNEL+32)<4> REG2(r,nTEMP17, 24)<0;8,1>:ub
+ #endif
+
+ dp4 (8) REG2(r, nTEMP16, 0)<1>:f ufROW_YUVA(1, 0)<0;8,1> COEF_REG<0;4,1>:f
+ mov.sat (2) REG2(r, nTEMP14, 0)<1>:ud REG2(r, nTEMP16, 0)<0;2,4>:f
+ mov (2) REG2(r, nTEMP12, 2)<1>:ub REG2(r, nTEMP14, 0)<0;2,4>:ub
+
+ dp4 (8) REG2(r, nTEMP16, 0)<1>:f ufROW_YUVA(2, 0)<0;8,1> COEF_REG<0;4,1>:f
+ mov.sat (2) REG2(r, nTEMP14, 0)<1>:ud REG2(r, nTEMP16, 0)<0;2,4>:f
+ mov (2) REG2(r, nTEMP12, 4)<1>:ub REG2(r, nTEMP14, 0)<0;2,4>:ub
+
+ dp4 (8) REG2(r, nTEMP16, 0)<1>:f ufROW_YUVA(3, 0)<0;8,1> COEF_REG<0;4,1>:f
+ mov.sat (2) REG2(r, nTEMP14, 0)<1>:ud REG2(r, nTEMP16, 0)<0;2,4>:f
+ mov (2) REG2(r, nTEMP12, 6)<1>:ub REG2(r, nTEMP14, 0)<0;2,4>:ub
+
+ dp4 (8) REG2(r, nTEMP16, 0)<1>:f ufROW_YUVA(4, 0)<0;8,1> COEF_REG<0;4,1>:f
+ mov.sat (2) REG2(r, nTEMP14, 0)<1>:ud REG2(r, nTEMP16, 0)<0;2,4>:f
+ mov (2) REG2(r, nTEMP12, 8)<1>:ub REG2(r, nTEMP14, 0)<0;2,4>:ub
+
+ dp4 (8) REG2(r, nTEMP16, 0)<1>:f ufROW_YUVA(5, 0)<0;8,1> COEF_REG<0;4,1>:f // pixels 10-11; region is <0;8,1> like every other row (a vertical stride of 5 is not a valid value)
+ mov.sat (2) REG2(r, nTEMP14, 0)<1>:ud REG2(r, nTEMP16, 0)<0;2,4>:f
+ mov (2) REG2(r, nTEMP12, 10)<1>:ub REG2(r, nTEMP14, 0)<0;2,4>:ub
+
+ dp4 (8) REG2(r, nTEMP16, 0)<1>:f ufROW_YUVA(6, 0)<0;8,1> COEF_REG<0;4,1>:f // pixels 12-13; same region fix (was a vertical stride of 6)
+ mov.sat (2) REG2(r, nTEMP14, 0)<1>:ud REG2(r, nTEMP16, 0)<0;2,4>:f
+ mov (2) REG2(r, nTEMP12, 12)<1>:ub REG2(r, nTEMP14, 0)<0;2,4>:ub
+
+ dp4 (8) REG2(r, nTEMP16, 0)<1>:f ufROW_YUVA(7, 0)<0;8,1> COEF_REG<0;4,1>:f
+ mov.sat (2) REG2(r, nTEMP14, 0)<1>:ud REG2(r, nTEMP16, 0)<0;2,4>:f
+ mov (2) REG2(r, nTEMP12, 14)<1>:ub REG2(r, nTEMP14, 0)<0;2,4>:ub
+
+ // #### write this channel
+ mov (8) ubDEST_RGBX(0,ROW_NUM_WRITE*64+CHANNEL )<4> REG2(r,nTEMP12, 0)<0;8,1>:ub
+ mov (8) ubDEST_RGBX(0,ROW_NUM_WRITE*64+CHANNEL+32)<4> REG2(r,nTEMP12, 8)<0;8,1>:ub
+
+ // ###### do one row for Blue channel
+ #define COEF_REG REG2(r, nTEMP10, 0) // reg for Blue coefficient
+ #define CHANNEL 0
+ // ##### dp4(nTEMP16), convert to ud (nTEMP14) and keep the low byte of each result in ub format (nTEMP12)
+ dp4 (8) REG2(r, nTEMP16, 0)<1>:f ufROW_YUVA(0, 0)<0;8,1> COEF_REG<0;4,1>:f
+ mov.sat (2) REG2(r, nTEMP14, 0)<1>:ud REG2(r, nTEMP16, 0)<0;2,4>:f
+ mov (2) REG2(r, nTEMP12, 0)<1>:ub REG2(r, nTEMP14, 0)<0;2,4>:ub // todo for saturation
+
+ #if (ONE_ROW_DEBUG)
+ // write dp4 (raw float) of 2 pixel to the 4/5th row
+ mov (8) ubDEST_RGBX(0,(DBG_ROWNUM_BASE+2)*64+CHANNEL )<4> REG2(r,nTEMP16, 0)<0;8,1>:ub
+ mov (8) ubDEST_RGBX(0,(DBG_ROWNUM_BASE+2)*64+CHANNEL+32)<4> REG2(r,nTEMP16, 8)<0;8,1>:ub
+ mov (8) ubDEST_RGBX(0,(DBG_ROWNUM_BASE+3)*64+CHANNEL )<4> REG2(r,nTEMP16, 16)<0;8,1>:ub
+ mov (8) ubDEST_RGBX(0,(DBG_ROWNUM_BASE+3)*64+CHANNEL+32)<4> REG2(r,nTEMP16, 24)<0;8,1>:ub
+
+ // write dp4 (convert float to ud first, write whole ud) of 2 pixel to the 6/7th row
+ mov (8) REG2(r, nTEMP17, 0)<1>:d REG2(r, nTEMP16, 0)<0;8,1>:f
+ mov (8) ubDEST_RGBX(0,(DBG_ROWNUM_BASE+4)*64+CHANNEL )<4> REG2(r,nTEMP17, 0)<0;8,1>:ub
+ mov (8) ubDEST_RGBX(0,(DBG_ROWNUM_BASE+4)*64+CHANNEL+32)<4> REG2(r,nTEMP17, 8)<0;8,1>:ub
+ mov (8) ubDEST_RGBX(0,(DBG_ROWNUM_BASE+5)*64+CHANNEL )<4> REG2(r,nTEMP17, 16)<0;8,1>:ub
+ mov (8) ubDEST_RGBX(0,(DBG_ROWNUM_BASE+5)*64+CHANNEL+32)<4> REG2(r,nTEMP17, 24)<0;8,1>:ub
+ #endif
+
+ dp4 (8) REG2(r, nTEMP16, 0)<1>:f ufROW_YUVA(1, 0)<0;8,1> COEF_REG<0;4,1>:f
+ mov.sat (2) REG2(r, nTEMP14, 0)<1>:ud REG2(r, nTEMP16, 0)<0;2,4>:f
+ mov (2) REG2(r, nTEMP12, 2)<1>:ub REG2(r, nTEMP14, 0)<0;2,4>:ub
+
+ dp4 (8) REG2(r, nTEMP16, 0)<1>:f ufROW_YUVA(2, 0)<0;8,1> COEF_REG<0;4,1>:f
+ mov.sat (2) REG2(r, nTEMP14, 0)<1>:ud REG2(r, nTEMP16, 0)<0;2,4>:f
+ mov (2) REG2(r, nTEMP12, 4)<1>:ub REG2(r, nTEMP14, 0)<0;2,4>:ub
+
+ dp4 (8) REG2(r, nTEMP16, 0)<1>:f ufROW_YUVA(3, 0)<0;8,1> COEF_REG<0;4,1>:f
+ mov.sat (2) REG2(r, nTEMP14, 0)<1>:ud REG2(r, nTEMP16, 0)<0;2,4>:f
+ mov (2) REG2(r, nTEMP12, 6)<1>:ub REG2(r, nTEMP14, 0)<0;2,4>:ub
+
+ dp4 (8) REG2(r, nTEMP16, 0)<1>:f ufROW_YUVA(4, 0)<0;8,1> COEF_REG<0;4,1>:f
+ mov.sat (2) REG2(r, nTEMP14, 0)<1>:ud REG2(r, nTEMP16, 0)<0;2,4>:f
+ mov (2) REG2(r, nTEMP12, 8)<1>:ub REG2(r, nTEMP14, 0)<0;2,4>:ub
+
+ dp4 (8) REG2(r, nTEMP16, 0)<1>:f ufROW_YUVA(5, 0)<0;8,1> COEF_REG<0;4,1>:f // pixels 10-11; region is <0;8,1> like every other row (a vertical stride of 5 is not a valid value)
+ mov.sat (2) REG2(r, nTEMP14, 0)<1>:ud REG2(r, nTEMP16, 0)<0;2,4>:f
+ mov (2) REG2(r, nTEMP12, 10)<1>:ub REG2(r, nTEMP14, 0)<0;2,4>:ub
+
+ dp4 (8) REG2(r, nTEMP16, 0)<1>:f ufROW_YUVA(6, 0)<0;8,1> COEF_REG<0;4,1>:f // pixels 12-13; same region fix (was a vertical stride of 6)
+ mov.sat (2) REG2(r, nTEMP14, 0)<1>:ud REG2(r, nTEMP16, 0)<0;2,4>:f
+ mov (2) REG2(r, nTEMP12, 12)<1>:ub REG2(r, nTEMP14, 0)<0;2,4>:ub
+
+ dp4 (8) REG2(r, nTEMP16, 0)<1>:f ufROW_YUVA(7, 0)<0;8,1> COEF_REG<0;4,1>:f
+ mov.sat (2) REG2(r, nTEMP14, 0)<1>:ud REG2(r, nTEMP16, 0)<0;2,4>:f
+ mov (2) REG2(r, nTEMP12, 14)<1>:ub REG2(r, nTEMP14, 0)<0;2,4>:ub
+
+ // #### write this channel
+ mov (8) ubDEST_RGBX(0,ROW_NUM_WRITE*64+CHANNEL )<4> REG2(r,nTEMP12, 0)<0;8,1>:ub
+ mov (8) ubDEST_RGBX(0,ROW_NUM_WRITE*64+CHANNEL+32)<4> REG2(r,nTEMP12, 8)<0;8,1>:ub
+#if (!ONE_ROW_DEBUG)
+ }
+#endif
diff --git a/src/shaders/post_processing/gen5_6/Common/YUV_to_RGBX_Coef.asm b/src/shaders/post_processing/gen5_6/Common/YUV_to_RGBX_Coef.asm
new file mode 100755
index 0000000..50c0a39
--- /dev/null
+++ b/src/shaders/post_processing/gen5_6/Common/YUV_to_RGBX_Coef.asm
@@ -0,0 +1,65 @@
+/*
+ * All Video Processing kernels
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ * Authors:
+ * Halley Zhao <halley.zhao at intel.com>
+ */
+
+// Module name: YUV_to_RGBX_Coef.asm
+//----------------------------------------------------------------
+#define ubDEST_RGBX ubTOP_Y // I'd like use them for color conversion
+
+#ifdef FIX_POINT_CONVERSION
+ // ###### set up transformation coefficient
+ // R = clip(( 298 * C + 0 * D + 409 * E + 128) >> 8)
+ // R = clip((0x012A * C + 0 * D + 0x0199 * E + 128) >> 8)
+ mov (1) REG2(r, nTEMP0, 0):ud 0x0000012A:ud
+ mov (1) REG2(r, nTEMP0, 1):ud 0x00000199:ud
+ // G = clip(( 298 * C - 100 * D - 208 * E + 128) >> 8)
+ // G = clip(( 0x012A * C - 0x64 * D - 0xD0 * E + 128) >> 8)
+ // G = clip(( 0x012A * C + 0xFF9C * D + 0xFF30 * E + 128) >> 8)
+ mov (1) REG2(r, nTEMP0, 2):ud 0xFF9C012A:ud
+ mov (1) REG2(r, nTEMP0, 3):ud 0x0000FF30:ud
+ // B = clip(( 298 * C + 516 * D + 0 * E + 128) >> 8)
+ // B = clip((0x012A* C + 0x0204 * D + 0 * E + 128) >> 8)
+ mov (1) REG2(r, nTEMP0, 4):ud 0x0204012A:ud
+ mov (1) REG2(r, nTEMP0, 5):ud 0x00000000:ud
+
+ // asr.sat (24) REG2(r,nTEMP0,0)<1> REG2(r,nTEMP0,0)<0;24,1> 1:w
+ asr.sat (8) REG2(r,nTEMP0, 0)<1>:w REG2(r,nTEMP0, 0)<0;8,1>:w 1:w
+ asr.sat (4) REG2(r,nTEMP0,8)<1>:w REG2(r,nTEMP0,8)<0;4,1>:w 1:w
+
+ // C = Y' - 16 D = U - 128 E = V - 128
+ mov (1) REG2(r, nTEMP0, 6):ud 0x008080F0:ud
+#else
+ // Limited-range BT.601 matrix, matching the fixed-point branch above (298/409/100/208/516 over 256)
+ // R = clip(1.164*(Y-16) + 1.596*(V-128))
+ mov (1) REG2(r, nTEMP8, 3):f 0.000f // A coef
+ mov (1) REG2(r, nTEMP8, 2):f 1.596f // V coef
+ mov (1) REG2(r, nTEMP8, 1):f 0.0f // U coef
+ mov (1) REG2(r, nTEMP8, 0):f 1.164f // Y coef
+
+ // the -16/-128/-128 offsets are applied by the save stage before dp4, so Y must be scaled by 1.164 here
+ // G = clip(1.164*(Y-16) - 0.391*(U-128) - 0.813*(V-128))
+ mov (1) REG2(r, nTEMP8, 7):f 0.000f // A coef
+ mov (1) REG2(r, nTEMP8, 6):f -0.813f // V coef
+ mov (1) REG2(r, nTEMP8, 5):f -0.391f // U coef
+ mov (1) REG2(r, nTEMP8, 4):f 1.164f // Y coef
+
+ // Blue coefficients live in nTEMP10 because nTEMP8 only holds two 4-float sets
+ // B = clip(1.164*(Y-16) + 2.018*(U-128))
+ mov (1) REG2(r, nTEMP10, 3):f 0.000f // A coef
+ mov (1) REG2(r, nTEMP10, 2):f 0.0f // V coef
+ mov (1) REG2(r, nTEMP10, 1):f 2.018f // U coef
+ mov (1) REG2(r, nTEMP10, 0):f 1.164f // Y coef
+
+ mov (1) REG2(r, nTEMP10, 4):ud 0x008080F0:ud
+
+ .declare ufROW_YUVA Base=REG(r,nTEMP0) ElementSize=4 SrcRegion=REGION(8,8) Type=f // r nTEMP0 - r nTEMP7
+
+#endif
diff --git a/src/shaders/post_processing/gen5_6/Makefile.am b/src/shaders/post_processing/gen5_6/Makefile.am
index 8658938..052ee53 100755
--- a/src/shaders/post_processing/gen5_6/Makefile.am
+++ b/src/shaders/post_processing/gen5_6/Makefile.am
@@ -21,6 +21,7 @@ INTEL_PP_G4B_GEN5 = \
pa_load_save_nv12.g4b.gen5 \
pa_load_save_pl3.g4b.gen5 \
rgbx_load_save_nv12.g4b.gen5 \
+ nv12_load_save_rgbx.g4b.gen5 \
$(NULL)
INTEL_PP_G6B = \
@@ -37,6 +38,7 @@ INTEL_PP_G6B = \
pa_load_save_nv12.g6b \
pa_load_save_pl3.g6b \
rgbx_load_save_nv12.g6b \
+ nv12_load_save_rgbx.g6b \
$(NULL)
INTEL_PP_ASM = \
@@ -53,6 +55,7 @@ INTEL_PP_ASM = \
pa_load_save_nv12.asm \
pa_load_save_pl3.asm \
rgbx_load_save_nv12.asm \
+ nv12_load_save_rgbx.asm \
$(NULL)
INTEL_PP_ASM += \
@@ -93,6 +96,9 @@ INTEL_PP_ASM += \
Common/RGBX_to_YUV_Coef.asm \
Common/RGBX_Save_YUV_Fix.asm \
Common/RGBX_Save_YUV_Float.asm \
+ Common/YUV_to_RGBX_Coef.asm \
+ Common/YUVX_Save_RGBX_Fix.asm \
+ Common/YUVX_Save_RGBX_Float.asm \
Common/SetupVPKernel.asm \
Common/readSampler16x1.asm \
Core_Kernels/AVS_SetupFirstBlock.asm \
diff --git a/src/shaders/post_processing/gen5_6/nv12_load_save_rgbx.asm b/src/shaders/post_processing/gen5_6/nv12_load_save_rgbx.asm
new file mode 100755
index 0000000..72cd96b
--- /dev/null
+++ b/src/shaders/post_processing/gen5_6/nv12_load_save_rgbx.asm
@@ -0,0 +1,25 @@
+// Module name: NV12_LOAD_SAVE_RGBX -- loads an NV12 block, converts it to RGBX and saves it
+.kernel NV12_LOAD_SAVE_RGBX
+.code
+#define FIX_POINT_CONVERSION
+// #define FLOAT_POINT_CONVERSION // alternative: define this instead of FIX_POINT_CONVERSION to include the float (dp4) save stage below
+
+#include "SetupVPKernel.asm"
+#include "YUV_to_RGBX_Coef.asm"
+#include "Multiple_Loop_Head.asm"
+#include "NV12_Load_8x4.asm"
+#ifdef FIX_POINT_CONVERSION
+ #include "YUVX_Save_RGBX_Fix.asm"
+#else
+ #include "YUVX_Save_RGBX_Float.asm"
+#endif
+#include "RGB16x8_Save_RGB.asm"
+#include "Multiple_Loop.asm"
+
+END_THREAD // End of Thread
+
+.end_code
+
+.end_kernel
+
+// end of nv12_load_save_rgbx.asm
--
1.7.4.1
More information about the Libva
mailing list