[Libva] [PATCH 3/4] add GPU shader for NV12->RGBX conversion

Zhao Halley halley.zhao at intel.com
Thu Jul 12 23:33:29 PDT 2012


---
 .../gen5_6/Common/NV12_Load_8x4.asm                |   32 +-
 .../post_processing/gen5_6/Common/PL2_Load.inc     |    3 +
 .../gen5_6/Common/YUVX_Save_RGBX_Fix.asm           |  178 +++++
 .../gen5_6/Common/YUVX_Save_RGBX_Float.asm         |  287 ++++++++
 .../gen5_6/Common/YUV_to_RGBX_Coef.asm             |   65 ++
 src/shaders/post_processing/gen5_6/Makefile.am     |    6 +
 .../post_processing/gen5_6/nv12_load_save_rgbx.asm |   25 +
 .../gen5_6/nv12_load_save_rgbx.g4b.gen5            |  648 ++++++++++++++++++
 .../post_processing/gen5_6/nv12_load_save_rgbx.g6b |  721 ++++++++++++++++++++
 9 files changed, 1963 insertions(+), 2 deletions(-)
 mode change 100644 => 100755 src/shaders/post_processing/gen5_6/Common/NV12_Load_8x4.asm
 mode change 100644 => 100755 src/shaders/post_processing/gen5_6/Common/PL2_Load.inc
 create mode 100755 src/shaders/post_processing/gen5_6/Common/YUVX_Save_RGBX_Fix.asm
 create mode 100755 src/shaders/post_processing/gen5_6/Common/YUVX_Save_RGBX_Float.asm
 create mode 100755 src/shaders/post_processing/gen5_6/Common/YUV_to_RGBX_Coef.asm
 create mode 100755 src/shaders/post_processing/gen5_6/nv12_load_save_rgbx.asm
 create mode 100644 src/shaders/post_processing/gen5_6/nv12_load_save_rgbx.g4b.gen5
 create mode 100644 src/shaders/post_processing/gen5_6/nv12_load_save_rgbx.g6b

diff --git a/src/shaders/post_processing/gen5_6/Common/NV12_Load_8x4.asm b/src/shaders/post_processing/gen5_6/Common/NV12_Load_8x4.asm
old mode 100644
new mode 100755
index dbc47d4..5d16a1b
--- a/src/shaders/post_processing/gen5_6/Common/NV12_Load_8x4.asm
+++ b/src/shaders/post_processing/gen5_6/Common/NV12_Load_8x4.asm
@@ -29,14 +29,42 @@
     send (8) udSRC_U(0)<1>      mMSGHDRU    udDUMMY_NULL    nDATAPORT_READ    nDPMR_MSGDSC+nDPR_MSG_SIZE_UV+nBI_CURRENT_SRC_UV:ud
 
 // Convert to word-aligned format ----------------------------------------------
-#if !defined(LOAD_UV_ONLY)
+#if defined(FIX_POINT_CONVERSION) || defined(FLOAT_POINT_CONVERSION)
+    // Color-conversion path: expand NV12 into packed 4-byte pixels in dst, with Y, U, V in bytes 0, 1, 2 (64 bytes per 16-pixel row)
+
+    $for (nY_NUM_OF_ROWS-1; >-1; -1) {
+        // #### Y: spread the 16 luma bytes of row %1 to byte 0 of each 4-byte pixel (8 pixels per mov)
+        mov (8)  ubDEST_Y(0,%1*16*4)<4>             ubSRC_Y(0,%1*16)<0;8,1>
+        mov (8)  ubDEST_Y(0,(%1*16+8)*4)<4>         ubSRC_Y(0,%1*16+8)<0;8,1>
+
+        // #### U/V: luma row %1 takes its chroma from UV row %1/2
+        // The direct form below is rejected by the assembler ("Invalid horiz size 8"), so the UV pairs are duplicated first.
+        // mov (4)  ubDEST_Y(0,%1*16*4+1)<8>                   ubSRC_U(0,%1/2*16)<0;4,2>
+        // mov (4)  ubDEST_Y(0,%1*16*4+1+32)<8>                ubSRC_U(0,%1/2*16+8)<0;4,2>
+	
+        // Duplicate each UV word into two adjacent words of nTEMP18: UV0,UV0,UV1,UV1,... (16 words)
+        mov (8)     REG2(r,nTEMP18,0)<2>:uw	            uwSRC_U(0,%1/2*8)<0;8,1>
+        mov (8)     REG2(r,nTEMP18,1)<2>:uw	            uwSRC_U(0,%1/2*8)<0;8,1>
+        
+        // Even bytes of nTEMP18 (U, assuming U-then-V byte order) -> byte 1 of each pixel; the second mov covers pixels 8..15
+        mov (8)    ubDEST_Y(0,%1*16*4+1)<4>             REG2(r,nTEMP18,0)<0;8,2>:ub
+        mov (8)    ubDEST_Y(0,%1*16*4+1+32)<4>          REG2(r,nTEMP18,16)<0;8,2>:ub
+        // Odd bytes of nTEMP18 (V) -> byte 2 of each pixel
+        mov (8)    ubDEST_Y(0,%1*16*4+2)<4>             REG2(r,nTEMP18,1)<0;8,2>:ub
+        mov (8)    ubDEST_Y(0,%1*16*4+2+32)<4>          REG2(r,nTEMP18,17)<0;8,2>:ub
+    }
+#else
+  #if !defined(LOAD_UV_ONLY)
     $for (nY_NUM_OF_ROWS-1; >-1; -1) {
         mov (16)  uwDEST_Y(0,%1*16)<1>      ubSRC_Y(0,%1*16)
     }
-#endif
+  #endif
     $for (nUV_NUM_OF_ROWS/2-1; >-1; -1) {
+        // TODO(review): why is the execution size 16 here? Should it be 8?
         mov (16)  uwDEST_U(0,%1*16)<1>      ubSRC_U(0,%1*32)<32;16,2>
         mov (16)  uwDEST_V(0,%1*16)<1>      ubSRC_U(0,%1*32+1)<32;16,2>
     }
 
+#endif    
+
 // End of NV12_Load_8x4
diff --git a/src/shaders/post_processing/gen5_6/Common/PL2_Load.inc b/src/shaders/post_processing/gen5_6/Common/PL2_Load.inc
old mode 100644
new mode 100755
index 9feeba6..0003dae
--- a/src/shaders/post_processing/gen5_6/Common/PL2_Load.inc
+++ b/src/shaders/post_processing/gen5_6/Common/PL2_Load.inc
@@ -73,6 +73,9 @@
 #define uwDEST_U          uwTOP_U
 #define uwDEST_V          uwTOP_V
 
+#define ubDEST_Y          ubTOP_Y       // byte-typed alias of TOP_Y; destination of the packed pixels written by the color-conversion path in NV12_Load_8x4.asm
+#define uwSRC_U           uwBOT_U       // word-typed alias of BOT_U; NV12_Load_8x4.asm reads one U/V pair per word through it
+
 #define nSRC_REGION       nREGION_1     // REGION_1 will be the source region for first kernel
 
 // End of PL2_Load.inc
diff --git a/src/shaders/post_processing/gen5_6/Common/YUVX_Save_RGBX_Fix.asm b/src/shaders/post_processing/gen5_6/Common/YUVX_Save_RGBX_Fix.asm
new file mode 100755
index 0000000..6f4a881
--- /dev/null
+++ b/src/shaders/post_processing/gen5_6/Common/YUVX_Save_RGBX_Fix.asm
@@ -0,0 +1,178 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ * Authors:
+ *    Halley Zhao <halley.zhao at intel.com>
+ */
+
+// Module name: YUVX_Save_RGBX_Fix.asm
+//----------------------------------------------------------------
+
+#include "RGBX_Load_16x8.inc"
+
+#if (0)
+    #define nTEMP0          34        // transformation coefficients and the Y/U/V bias bytes (loaded by YUV_to_RGBX_Coef.asm)
+    #define nTEMP1          35        // one row of R (first half register is used)
+    #define nTEMP2          36        // one row of G (first half register is used)
+    #define nTEMP3          37        // one row of B (first half register is used)
+    #define nTEMP4          38        // mul and add
+    #define nTEMP5          39        // mul and add
+    #define nTEMP6          40        // mul and add
+    #define nTEMP7          41        // mul and add
+    #define nTEMP8          42        // sum of mul
+    #define nTEMP10         44        // NOTE: duplicate of the definition on the next line
+    #define nTEMP10         44        // split ub pixel to word width 1st quarter
+    #define nTEMP12         46        // split ub pixel to word width 2nd quarter
+    #define nTEMP14         48        // split ub pixel to word width 3rd quarter
+    #define nTEMP16         50        // split ub pixel to word width 4th quarter
+    #define nTEMP17         51
+    #define nTEMP18         52        // temp used to duplicate U/V in NV12_Load_8x4.asm
+    
+    #define nTEMP24         58        // not referenced in this file; NV12_Load_8x4.asm uses nTEMP18 for the U/V duplication
+#endif
+
+#define ONE_ROW_DEBUG                      0   // set to 1 to convert only row 0 and dump intermediate values into the following rows
+
+#if (ONE_ROW_DEBUG)
+    #define ROW_NUM                        0
+    #define DBG_ROWNUM_BASE                1
+#else
+    #define ROW_NUM                        %1
+    $for(0; <nY_NUM_OF_ROWS; 1) {
+#endif    
+    // C = Y' - 16          D = U - 128         E = V - 128
+    // Each add widens 16 bytes (4 pixels) to signed words and adds the 4 bias bytes stored at byte 24 of nTEMP0.
+    add (16)     REG2(r,nTEMP10,0)<1>:w           ubDEST_RGBX(0,ROW_NUM*64   )<0;16,1>           REG2(r,nTEMP0,24)<0;4,1>:b
+    add (16)     REG2(r,nTEMP12,0)<1>:w           ubDEST_RGBX(0,ROW_NUM*64+16)<0;16,1>           REG2(r,nTEMP0,24)<0;4,1>:b
+    add (16)     REG2(r,nTEMP14,0)<1>:w           ubDEST_RGBX(0,ROW_NUM*64+32)<0;16,1>           REG2(r,nTEMP0,24)<0;4,1>:b
+    add (16)     REG2(r,nTEMP16,0)<1>:w           ubDEST_RGBX(0,ROW_NUM*64+48)<0;16,1>           REG2(r,nTEMP0,24)<0;4,1>:b
+
+#if (ONE_ROW_DEBUG)
+    mov (16) ubDEST_RGBX(0,(DBG_ROWNUM_BASE)*64   )<1>  REG2(r,nTEMP10, 0)<0;16,2>:ub   // debug: dump the low byte of each biased word
+    mov (16) ubDEST_RGBX(0,(DBG_ROWNUM_BASE)*64+16)<1>  REG2(r,nTEMP12, 0)<0;16,2>:ub
+    mov (16) ubDEST_RGBX(0,(DBG_ROWNUM_BASE)*64+32)<1>  REG2(r,nTEMP14, 0)<0;16,2>:ub
+    mov (16) ubDEST_RGBX(0,(DBG_ROWNUM_BASE)*64+48)<1>  REG2(r,nTEMP16, 0)<0;16,2>:ub
+#endif
+    
+    // |Y|U|V|X| ==> |R|G|B|X| for one row; nTEMP10/12/14/16 hold the biased (C,D,E,X) words, 4 pixels each
+    // ###### do one row for R
+    // #### multiply every pixel's four words by the four R coefficient words at nTEMP0 word 0..3
+    mul.sat (16)  REG2(r, nTEMP4, 0)<1>:w      REG2(r,nTEMP10,0)<0;16,1>:w        REG2(r, nTEMP0, 0)<0;4,1>:w
+    mul.sat (16)  REG2(r, nTEMP5, 0)<1>:w      REG2(r,nTEMP12,0)<0;16,1>:w        REG2(r, nTEMP0, 0)<0;4,1>:w
+    mul.sat (16)  REG2(r, nTEMP6, 0)<1>:w      REG2(r,nTEMP14,0)<0;16,1>:w        REG2(r, nTEMP0, 0)<0;4,1>:w
+    mul.sat (16)  REG2(r, nTEMP7, 0)<1>:w      REG2(r,nTEMP16,0)<0;16,1>:w        REG2(r, nTEMP0, 0)<0;4,1>:w
+
+  #if (ONE_ROW_DEBUG)
+    mov (8) ubDEST_RGBX(0,(DBG_ROWNUM_BASE+1)*64+2   )<4>  REG2(r,nTEMP0, 0)<0;8,1>:ub
+    mov (8) ubDEST_RGBX(0,(DBG_ROWNUM_BASE+1)*64+2+32)<4>  REG2(r,nTEMP0, 0)<0;8,1>:ub
+
+    mov (8) ubDEST_RGBX(0,(DBG_ROWNUM_BASE+2)*64+2   )<4>  REG2(r,nTEMP4, 0)<0;8,1>:ub
+    mov (8) ubDEST_RGBX(0,(DBG_ROWNUM_BASE+2)*64+2+32)<4>  REG2(r,nTEMP4, 8)<0;8,1>:ub
+  #endif
+    // per pixel (stride 4 words): word 0 + word 2, i.e. the C and E products (the D product is not used for R); saturated into word 0 as uw
+    add.sat (4)   REG2(r, nTEMP4, 0)<4>:uw      REG2(r, nTEMP4, 0)<0;4,4>:w      REG2(r, nTEMP4, 2)<0;4,4>:w      
+    add.sat (4)   REG2(r, nTEMP5, 0)<4>:uw      REG2(r, nTEMP5, 0)<0;4,4>:w      REG2(r, nTEMP5, 2)<0;4,4>:w      
+    add.sat (4)   REG2(r, nTEMP6, 0)<4>:uw      REG2(r, nTEMP6, 0)<0;4,4>:w      REG2(r, nTEMP6, 2)<0;4,4>:w      
+    add.sat (4)   REG2(r, nTEMP7, 0)<4>:uw      REG2(r, nTEMP7, 0)<0;4,4>:w      REG2(r, nTEMP7, 2)<0;4,4>:w      
+
+    // ####  gather the 16 per-pixel sums (4 from each of nTEMP4..7) into nTEMP8
+    mov (4)  REG2(r, nTEMP8,  0)<1>:uw    REG2(r, nTEMP4, 0)<0; 4, 4>:uw
+    mov (4)  REG2(r, nTEMP8,  4)<1>:uw    REG2(r, nTEMP5, 0)<0; 4, 4>:uw
+    mov (4)  REG2(r, nTEMP8,  8)<1>:uw    REG2(r, nTEMP6, 0)<0; 4, 4>:uw
+    mov (4)  REG2(r, nTEMP8, 12)<1>:uw    REG2(r, nTEMP7, 0)<0; 4, 4>:uw
+
+  #if (ONE_ROW_DEBUG)
+    mov (8) ubDEST_RGBX(0,(DBG_ROWNUM_BASE+3)*64+2   )<4>  REG2(r,nTEMP8, 0)<0;8,1>:ub
+    mov (8) ubDEST_RGBX(0,(DBG_ROWNUM_BASE+3)*64+2+32)<4>  REG2(r,nTEMP8, 8)<0;8,1>:ub
+  #endif    
+    // add 0x80, double with saturation, then keep byte 1 of every word as the 8-bit R value in nTEMP1
+    add.sat (16) REG2(r, nTEMP8,  0)<1>:uw    REG2(r, nTEMP8, 0)<0; 16, 1>:uw    0x80:uw // todo, combine mov and add
+    shl.sat (16) REG2(r, nTEMP8,  0)<1>:uw    REG2(r, nTEMP8, 0)<0; 16, 1>:uw    1:w
+    mov (16) REG2(r, nTEMP1,  0)<1>:ub   REG2(r, nTEMP8, 1)<0; 16, 2>:ub
+    
+  #if (ONE_ROW_DEBUG)
+    mov (8) ubDEST_RGBX(0,(DBG_ROWNUM_BASE+4)*64+2   )<4>  REG2(r,nTEMP8, 0)<0;8,1>:ub
+    mov (8) ubDEST_RGBX(0,(DBG_ROWNUM_BASE+4)*64+2+32)<4>  REG2(r,nTEMP8, 8)<0;8,1>:ub
+  #endif    
+    // ###### do one row for G: all three products (C, D, E) contribute
+    // #### multiply every pixel's four words by the four G coefficient words at nTEMP0 word 4..7
+    mul.sat (16)  REG2(r, nTEMP4, 0)<1>:w      REG2(r,nTEMP10,0)<0;16,1>:w        REG2(r, nTEMP0, 4)<0;4,1>:w
+    mul.sat (16)  REG2(r, nTEMP5, 0)<1>:w      REG2(r,nTEMP12,0)<0;16,1>:w        REG2(r, nTEMP0, 4)<0;4,1>:w
+    mul.sat (16)  REG2(r, nTEMP6, 0)<1>:w      REG2(r,nTEMP14,0)<0;16,1>:w        REG2(r, nTEMP0, 4)<0;4,1>:w
+    mul.sat (16)  REG2(r, nTEMP7, 0)<1>:w      REG2(r,nTEMP16,0)<0;16,1>:w        REG2(r, nTEMP0, 4)<0;4,1>:w
+    // Sum the D and E products as signed words first (word 1 += word 2), then add the C product with unsigned saturation; clamping a negative C+D partial sum to 0 before adding E would give a wrong G
+    add (4)       REG2(r, nTEMP4, 1)<4>:w       REG2(r, nTEMP4, 1)<0;4,4>:w      REG2(r, nTEMP4, 2)<0;4,4>:w
+    add.sat (4)   REG2(r, nTEMP4, 0)<4>:uw      REG2(r, nTEMP4, 0)<0;4,4>:w      REG2(r, nTEMP4, 1)<0;4,4>:w
+    add (4)       REG2(r, nTEMP5, 1)<4>:w       REG2(r, nTEMP5, 1)<0;4,4>:w      REG2(r, nTEMP5, 2)<0;4,4>:w
+    add.sat (4)   REG2(r, nTEMP5, 0)<4>:uw      REG2(r, nTEMP5, 0)<0;4,4>:w      REG2(r, nTEMP5, 1)<0;4,4>:w
+    add (4)       REG2(r, nTEMP6, 1)<4>:w       REG2(r, nTEMP6, 1)<0;4,4>:w      REG2(r, nTEMP6, 2)<0;4,4>:w
+    add.sat (4)   REG2(r, nTEMP6, 0)<4>:uw      REG2(r, nTEMP6, 0)<0;4,4>:w      REG2(r, nTEMP6, 1)<0;4,4>:w
+    add (4)       REG2(r, nTEMP7, 1)<4>:w       REG2(r, nTEMP7, 1)<0;4,4>:w      REG2(r, nTEMP7, 2)<0;4,4>:w
+    add.sat (4)   REG2(r, nTEMP7, 0)<4>:uw      REG2(r, nTEMP7, 0)<0;4,4>:w      REG2(r, nTEMP7, 1)<0;4,4>:w
+
+    // ####  gather the 16 per-pixel sums (4 from each of nTEMP4..7) into nTEMP8
+    mov (4)  REG2(r, nTEMP8,  0)<1>:uw    REG2(r, nTEMP4, 0)<0; 4, 4>:uw
+    mov (4)  REG2(r, nTEMP8,  4)<1>:uw    REG2(r, nTEMP5, 0)<0; 4, 4>:uw
+    mov (4)  REG2(r, nTEMP8,  8)<1>:uw    REG2(r, nTEMP6, 0)<0; 4, 4>:uw
+    mov (4)  REG2(r, nTEMP8, 12)<1>:uw    REG2(r, nTEMP7, 0)<0; 4, 4>:uw
+    // add 0x80 (saturating, as in the R and B rows), double with saturation, then keep byte 1 of every word as the 8-bit G value in nTEMP2
+    add.sat (16) REG2(r, nTEMP8,  0)<1>:uw    REG2(r, nTEMP8, 0)<0; 16, 1>:uw    0x80:uw // saturation
+    shl.sat (16) REG2(r, nTEMP8,  0)<1>:uw    REG2(r, nTEMP8, 0)<0; 16, 1>:uw    1:w
+    mov (16) REG2(r, nTEMP2,  0)<1>:ub   REG2(r, nTEMP8, 1)<0; 16, 2>:ub
+
+    // ###### do one row for B
+    // #### multiply every pixel's four words by the four B coefficient words at nTEMP0 word 8..11
+    mul.sat (16)  REG2(r, nTEMP4, 0)<1>:w      REG2(r,nTEMP10,0)<0;16,1>:w        REG2(r, nTEMP0, 8)<0;4,1>:w
+    mul.sat (16)  REG2(r, nTEMP5, 0)<1>:w      REG2(r,nTEMP12,0)<0;16,1>:w        REG2(r, nTEMP0, 8)<0;4,1>:w
+    mul.sat (16)  REG2(r, nTEMP6, 0)<1>:w      REG2(r,nTEMP14,0)<0;16,1>:w        REG2(r, nTEMP0, 8)<0;4,1>:w
+    mul.sat (16)  REG2(r, nTEMP7, 0)<1>:w      REG2(r,nTEMP16,0)<0;16,1>:w        REG2(r, nTEMP0, 8)<0;4,1>:w
+
+  #if (ONE_ROW_DEBUG)
+    mov (8) ubDEST_RGBX(0,(DBG_ROWNUM_BASE+1)*64   )<4>  REG2(r,nTEMP0, 16)<0;8,1>:ub
+    mov (8) ubDEST_RGBX(0,(DBG_ROWNUM_BASE+1)*64+32)<4>  REG2(r,nTEMP0, 16)<0;8,1>:ub
+
+    mov (8) ubDEST_RGBX(0,(DBG_ROWNUM_BASE+2)*64   )<4>  REG2(r,nTEMP4, 0)<0;8,1>:ub
+    mov (8) ubDEST_RGBX(0,(DBG_ROWNUM_BASE+2)*64+32)<4>  REG2(r,nTEMP4, 8)<0;8,1>:ub
+  #endif
+    // per pixel (stride 4 words): word 0 + word 1, i.e. the C and D products (the E product is not used for B); saturated into word 0 as uw
+    add.sat (4)   REG2(r, nTEMP4, 0)<4>:uw      REG2(r, nTEMP4, 0)<0;4,4>:w      REG2(r, nTEMP4, 1)<0;4,4>:w      
+    add.sat (4)   REG2(r, nTEMP5, 0)<4>:uw      REG2(r, nTEMP5, 0)<0;4,4>:w      REG2(r, nTEMP5, 1)<0;4,4>:w      
+    add.sat (4)   REG2(r, nTEMP6, 0)<4>:uw      REG2(r, nTEMP6, 0)<0;4,4>:w      REG2(r, nTEMP6, 1)<0;4,4>:w      
+    add.sat (4)   REG2(r, nTEMP7, 0)<4>:uw      REG2(r, nTEMP7, 0)<0;4,4>:w      REG2(r, nTEMP7, 1)<0;4,4>:w      
+
+    // ####  gather the 16 per-pixel sums (4 from each of nTEMP4..7) into nTEMP8
+    mov (4)  REG2(r, nTEMP8,  0)<1>:uw    REG2(r, nTEMP4, 0)<0; 4, 4>:uw
+    mov (4)  REG2(r, nTEMP8,  4)<1>:uw    REG2(r, nTEMP5, 0)<0; 4, 4>:uw
+    mov (4)  REG2(r, nTEMP8,  8)<1>:uw    REG2(r, nTEMP6, 0)<0; 4, 4>:uw
+    mov (4)  REG2(r, nTEMP8, 12)<1>:uw    REG2(r, nTEMP7, 0)<0; 4, 4>:uw
+
+  #if (ONE_ROW_DEBUG)
+    mov (8) ubDEST_RGBX(0,(DBG_ROWNUM_BASE+3)*64   )<4>  REG2(r,nTEMP8, 0)<0;8,1>:ub
+    mov (8) ubDEST_RGBX(0,(DBG_ROWNUM_BASE+3)*64+32)<4>  REG2(r,nTEMP8, 8)<0;8,1>:ub
+  #endif    
+    // add 0x80, double with saturation, then keep byte 1 of every word as the 8-bit B value in nTEMP3
+    add.sat (16) REG2(r, nTEMP8,  0)<1>:uw    REG2(r, nTEMP8, 0)<0; 16, 1>:uw    0x80:uw // saturation
+    shl.sat (16) REG2(r, nTEMP8,  0)<1>:uw    REG2(r, nTEMP8, 0)<0; 16, 1>:uw    1:w
+    mov (16) REG2(r, nTEMP3,  0)<1>:ub   REG2(r, nTEMP8, 1)<0; 16, 2>:ub
+
+  #if (ONE_ROW_DEBUG)
+    mov (8) ubDEST_RGBX(0,(DBG_ROWNUM_BASE+4)*64   )<4>  REG2(r,nTEMP8, 0)<0;8,1>:ub
+    mov (8) ubDEST_RGBX(0,(DBG_ROWNUM_BASE+4)*64+32)<4>  REG2(r,nTEMP8, 8)<0;8,1>:ub
+  #endif    
+    // Write the converted row back over the source pixels: byte 0 = B, byte 1 = G, byte 2 = R (byte 3 is not written)
+    // B
+    mov (8) ubDEST_RGBX(0,ROW_NUM*64   )<4>  REG2(r,nTEMP3, 0)<0;8,1>:ub
+    mov (8) ubDEST_RGBX(0,ROW_NUM*64+32)<4>  REG2(r,nTEMP3, 8)<0;8,1>:ub
+    // G
+    mov (8) ubDEST_RGBX(0,ROW_NUM*64+1   )<4>  REG2(r,nTEMP2, 0)<0;8,1>:ub
+    mov (8) ubDEST_RGBX(0,ROW_NUM*64+1+32)<4>  REG2(r,nTEMP2, 8)<0;8,1>:ub
+    // R
+    mov (8) ubDEST_RGBX(0,ROW_NUM*64+2   )<4>  REG2(r,nTEMP1, 0)<0;8,1>:ub
+    mov (8) ubDEST_RGBX(0,ROW_NUM*64+2+32)<4>  REG2(r,nTEMP1, 8)<0;8,1>:ub
+#if (!ONE_ROW_DEBUG)    
+    }
+#endif
diff --git a/src/shaders/post_processing/gen5_6/Common/YUVX_Save_RGBX_Float.asm b/src/shaders/post_processing/gen5_6/Common/YUVX_Save_RGBX_Float.asm
new file mode 100755
index 0000000..f7585d4
--- /dev/null
+++ b/src/shaders/post_processing/gen5_6/Common/YUVX_Save_RGBX_Float.asm
@@ -0,0 +1,287 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ * Authors:
+ *    Halley Zhao <halley.zhao at intel.com>
+ */
+
+// Module name: YUVX_Save_RGBX_Float.asm
+//----------------------------------------------------------------
+
+#include "RGBX_Load_16x8.inc"
+
+#if (0)
+    // 8 GRF registers hold one row of pixels as floats (2 pixels per GRF)
+    #define nTEMP0          34
+    #define nTEMP1          35
+    #define nTEMP2          36
+    #define nTEMP3          37
+    #define nTEMP4          38
+    #define nTEMP5          39
+    #define nTEMP6          40
+    #define nTEMP7          41
+    
+    #define nTEMP8          42        // transformation coefficient
+    #define nTEMP10         44        // transformation coefficient
+    
+    #define nTEMP12         46        // save Y/U/V in ub format
+    #define nTEMP14         48        // save YUV in ud format
+    #define nTEMP16         50        // dp4 result
+    #define nTEMP17         51        
+    #define nTEMP18         52       // temp used to duplicate U/V in NV12_Load_8x4.asm
+    
+    #define nTEMP24         58       // not safe to use here: when it was tried for the U/V duplication in NV12_Load_8x4.asm, the Y data of rows 4/5 got corrupted
+#endif
+
+#define ONE_ROW_DEBUG                      0
+
+#if (ONE_ROW_DEBUG)
+    // To debug a row other than the first one:
+    // 1. define ROW_NUM_READ as the row you want to debug
+    // 2. ROW_NUM_WRITE can be the same as ROW_NUM_READ (overwrites the original YUV data), or a different row
+    // 3. if a dump row (DBG_ROWNUM_BASE+n) would equal ROW_NUM_READ or ROW_NUM_WRITE, point that dump at DBG_ROWNUM_0 instead so the rows do not collide
+    #define ROW_NUM_READ                   0
+    #define ROW_NUM_WRITE                  0
+    #define DBG_ROWNUM_BASE                1
+    #define DBG_ROWNUM_0                   0
+#else
+    #define ROW_NUM_READ                   %1
+    #define ROW_NUM_WRITE                  %1
+    $for(0; <nY_NUM_OF_ROWS; 1) {
+#endif    
+    // YUVX | Y | U | V | X |
+    // XRGB | B | G | R | X |
+    // ###### save one row of pixels to temp GRFs in float format (required by dp4)
+    // C = Y' - 16          D = U - 128         E = V - 128
+
+    // The single-instruction form below does not work, so the conversion is split into two steps (byte -> word with bias, then word -> float)
+    // add (4) REG(r, nTEMP0)<1>:f       r[SRC_RGBA_OFFSET_1,ROW_NUM_READ*32 +  0]<4,1>:ub           REG2(r, nTEMP10,  16)<0;4,1>:b
+    // step 1: widen 16 bytes (4 pixels) per add to signed words and add the 4 bias bytes stored at byte 16 of nTEMP10
+    add (16)     REG2(r,nTEMP12,0)<1>:w           ubDEST_RGBX(0,ROW_NUM_READ*64   )<0;16,1>           REG2(r,nTEMP10,16)<0;4,1>:b
+    add (16)     REG2(r,nTEMP14,0)<1>:w           ubDEST_RGBX(0,ROW_NUM_READ*64+16)<0;16,1>           REG2(r,nTEMP10,16)<0;4,1>:b
+    add (16)     REG2(r,nTEMP16,0)<1>:w           ubDEST_RGBX(0,ROW_NUM_READ*64+32)<0;16,1>           REG2(r,nTEMP10,16)<0;4,1>:b
+    add (16)     REG2(r,nTEMP17,0)<1>:w           ubDEST_RGBX(0,ROW_NUM_READ*64+48)<0;16,1>           REG2(r,nTEMP10,16)<0;4,1>:b
+    // step 2: move 8 words (2 pixels) at a time into ufROW_YUVA(0..7); presumably ufROW_YUVA is float-typed (declared outside this file) -- TODO confirm
+    mov (8)      ufROW_YUVA(0,0)<1>            REG2(r, nTEMP12, 0)<0;8,1>:w
+    mov (8)      ufROW_YUVA(1,0)<1>            REG2(r, nTEMP12, 8)<0;8,1>:w
+    mov (8)      ufROW_YUVA(2,0)<1>            REG2(r, nTEMP14, 0)<0;8,1>:w
+    mov (8)      ufROW_YUVA(3,0)<1>            REG2(r, nTEMP14, 8)<0;8,1>:w
+    mov (8)      ufROW_YUVA(4,0)<1>            REG2(r, nTEMP16, 0)<0;8,1>:w
+    mov (8)      ufROW_YUVA(5,0)<1>            REG2(r, nTEMP16, 8)<0;8,1>:w
+    mov (8)      ufROW_YUVA(6,0)<1>            REG2(r, nTEMP17, 0)<0;8,1>:w
+    mov (8)      ufROW_YUVA(7,0)<1>            REG2(r, nTEMP17, 8)<0;8,1>:w
+
+  #if (ONE_ROW_DEBUG)
+    mov.sat (8)  REG2(r, nTEMP14,  0)<1>:ud     ufROW_YUVA(0,0)<0;8,1>:f
+    mov (8)  REG2(r, nTEMP12,   0)<1>:ub        REG2(r, nTEMP14, 0)<0;8,4>:ub
+    
+        // write Y-16, U-128, V-128 to the 2nd row of RGB (convert float to int first, write whole ud): 1st half, 2 pixels
+        mov (16) ubDEST_RGBX(0,(DBG_ROWNUM_BASE)*64   )<1>  REG2(r,nTEMP14,  0)<0;16,1>:ub
+        mov (16) ubDEST_RGBX(0,(DBG_ROWNUM_BASE)*64+16)<1>  REG2(r,nTEMP14, 16)<0;16,1>:ub
+
+    mov.sat (8)  REG2(r, nTEMP14,  0)<1>:ud     ufROW_YUVA(1,0)<0;8,1>:f
+    mov (8)  REG2(r, nTEMP12,   8)<1>:ub        REG2(r, nTEMP14, 0)<0;8,4>:ub
+    
+        // write Y-16, U-128, V-128 to the 2nd row of RGB (convert float to int first, write whole ud): 2nd half, 2 pixels
+        mov (16) ubDEST_RGBX(0,(DBG_ROWNUM_BASE)*64+32)<1>  REG2(r,nTEMP14,  0)<0;16,1>:ub
+        mov (16) ubDEST_RGBX(0,(DBG_ROWNUM_BASE)*64+48)<1>  REG2(r,nTEMP14, 16)<0;16,1>:ub
+
+    mov.sat (8)  REG2(r, nTEMP14,  0)<1>:ud     ufROW_YUVA(2,0)<0;8,1>:f
+    mov (8)  REG2(r, nTEMP12,  16)<1>:ub        REG2(r, nTEMP14, 0)<0;8,4>:ub
+    
+    mov.sat (8)  REG2(r, nTEMP14,  0)<1>:ud     ufROW_YUVA(3,0)<0;8,1>:f
+    mov (8)  REG2(r, nTEMP12,  24)<1>:ub        REG2(r, nTEMP14, 0)<0;8,4>:ub
+
+        // write Y-16, U-128, V-128 to the 3rd row of RGB (convert float to int first, only LSB is used): 1st half, 8 pixels
+        mov (16) ubDEST_RGBX(0,(DBG_ROWNUM_BASE+1)*64   )<1>  REG2(r,nTEMP12,  0)<0;16,1>:ub
+        mov (16) ubDEST_RGBX(0,(DBG_ROWNUM_BASE+1)*64+16)<1>  REG2(r,nTEMP12, 16)<0;16,1>:ub
+    
+    mov.sat (8)  REG2(r, nTEMP14,  0)<1>:ud     ufROW_YUVA(4,0)<0;8,1>:f
+    mov (8)  REG2(r, nTEMP12,   0)<1>:ub        REG2(r, nTEMP14, 0)<0;8,4>:ub
+    
+    mov.sat (8)  REG2(r, nTEMP14,  0)<1>:ud     ufROW_YUVA(5,0)<0;8,1>:f
+    mov (8)  REG2(r, nTEMP12,   8)<1>:ub        REG2(r, nTEMP14, 0)<0;8,4>:ub
+
+    mov.sat (8)  REG2(r, nTEMP14,  0)<1>:ud     ufROW_YUVA(6,0)<0;8,1>:f
+    mov (8)  REG2(r, nTEMP12,  16)<1>:ub        REG2(r, nTEMP14, 0)<0;8,4>:ub
+    
+    mov.sat (8)  REG2(r, nTEMP14,  0)<1>:ud     ufROW_YUVA(7,0)<0;8,1>:f
+    mov (8)  REG2(r, nTEMP12,  24)<1>:ub        REG2(r, nTEMP14, 0)<0;8,4>:ub
+
+        // write Y-16, U-128, V-128 to the 3rd row of RGB (convert float to int first, only LSB is used): 2nd half, 8 pixels
+        mov (16) ubDEST_RGBX(0,(DBG_ROWNUM_BASE+1)*64+32)<1>  REG2(r,nTEMP12,  0)<0;16,1>:ub
+        mov (16) ubDEST_RGBX(0,(DBG_ROWNUM_BASE+1)*64+48)<1>  REG2(r,nTEMP12, 16)<0;16,1>:ub
+  #endif
+
+        // ######## do one row for Red: dot every pixel's four floats with the red coefficients ########
+    #define COEF_REG  REG2(r, nTEMP8, 0)  // reg for Red coefficient
+    #define CHANNEL   2
+    // ##### each dp4 (result in nTEMP16) covers one ufROW_YUVA register = 2 pixels; results are packed as bytes (ub) into nTEMP12
+    dp4 (8)  REG2(r, nTEMP16,  0)<1>:f          ufROW_YUVA(0, 0)<0;8,1>   COEF_REG<0;4,1>:f 
+    mov.sat (2)  REG2(r, nTEMP14,  0)<1>:ud     REG2(r, nTEMP16, 0)<0;2,4>:f
+    mov (2)  REG2(r, nTEMP12,  0)<1>:ub         REG2(r, nTEMP14, 0)<0;2,4>:ub      // todo for saturation
+    // NOTE(review): mov.sat clamps to the :ud range only and the byte move keeps just byte 0 of each ud, so results above 255 are not clamped
+  #if (ONE_ROW_DEBUG)
+    // write dp4 (raw float) of 2 pixel to the 4/5th row
+    mov (8) ubDEST_RGBX(0,(DBG_ROWNUM_BASE+2)*64+CHANNEL   )<4>  REG2(r,nTEMP16,  0)<0;8,1>:ub
+    mov (8) ubDEST_RGBX(0,(DBG_ROWNUM_BASE+2)*64+CHANNEL+32)<4>  REG2(r,nTEMP16,  8)<0;8,1>:ub
+    mov (8) ubDEST_RGBX(0,(DBG_ROWNUM_BASE+3)*64+CHANNEL   )<4>  REG2(r,nTEMP16, 16)<0;8,1>:ub
+    mov (8) ubDEST_RGBX(0,(DBG_ROWNUM_BASE+3)*64+CHANNEL+32)<4>  REG2(r,nTEMP16, 24)<0;8,1>:ub
+
+    // write dp4 (convert float to ud first, write whole ud) of 2 pixel to the 6/7th row
+    mov (8)  REG2(r, nTEMP17,  0)<1>:d     REG2(r, nTEMP16, 0)<0;8,1>:f
+    mov (8) ubDEST_RGBX(0,(DBG_ROWNUM_BASE+4)*64+CHANNEL   )<4>  REG2(r,nTEMP17,  0)<0;8,1>:ub
+    mov (8) ubDEST_RGBX(0,(DBG_ROWNUM_BASE+4)*64+CHANNEL+32)<4>  REG2(r,nTEMP17,  8)<0;8,1>:ub
+    mov (8) ubDEST_RGBX(0,(DBG_ROWNUM_BASE+5)*64+CHANNEL   )<4>  REG2(r,nTEMP17, 16)<0;8,1>:ub
+    mov (8) ubDEST_RGBX(0,(DBG_ROWNUM_BASE+5)*64+CHANNEL+32)<4>  REG2(r,nTEMP17, 24)<0;8,1>:ub
+  #endif    
+
+    dp4 (8)  REG2(r, nTEMP16,  0)<1>:f          ufROW_YUVA(1, 0)<0;8,1>         COEF_REG<0;4,1>:f 
+    mov.sat (2)  REG2(r, nTEMP14,  0)<1>:ud     REG2(r, nTEMP16, 0)<0;2,4>:f
+    mov (2)  REG2(r, nTEMP12,  2)<1>:ub         REG2(r, nTEMP14, 0)<0;2,4>:ub
+    
+    dp4 (8)  REG2(r, nTEMP16,  0)<1>:f          ufROW_YUVA(2, 0)<0;8,1>         COEF_REG<0;4,1>:f 
+    mov.sat (2)  REG2(r, nTEMP14,  0)<1>:ud     REG2(r, nTEMP16, 0)<0;2,4>:f
+    mov (2)  REG2(r, nTEMP12,  4)<1>:ub         REG2(r, nTEMP14, 0)<0;2,4>:ub
+    
+    dp4 (8)  REG2(r, nTEMP16,  0)<1>:f          ufROW_YUVA(3, 0)<0;8,1>         COEF_REG<0;4,1>:f 
+    mov.sat (2)  REG2(r, nTEMP14,  0)<1>:ud     REG2(r, nTEMP16, 0)<0;2,4>:f
+    mov (2)  REG2(r, nTEMP12,  6)<1>:ub         REG2(r, nTEMP14, 0)<0;2,4>:ub
+    
+    dp4 (8)  REG2(r, nTEMP16,  0)<1>:f          ufROW_YUVA(4, 0)<0;8,1>         COEF_REG<0;4,1>:f 
+    mov.sat (2)  REG2(r, nTEMP14,  0)<1>:ud     REG2(r, nTEMP16, 0)<0;2,4>:f
+    mov (2)  REG2(r, nTEMP12,  8)<1>:ub         REG2(r, nTEMP14, 0)<0;2,4>:ub
+    // registers 5 and 6 use the same <0;8,1> source region as every other register of the row
+    dp4 (8)  REG2(r, nTEMP16,  0)<1>:f          ufROW_YUVA(5, 0)<0;8,1>         COEF_REG<0;4,1>:f 
+    mov.sat (2)  REG2(r, nTEMP14,  0)<1>:ud     REG2(r, nTEMP16, 0)<0;2,4>:f
+    mov (2)  REG2(r, nTEMP12, 10)<1>:ub         REG2(r, nTEMP14, 0)<0;2,4>:ub
+    
+    dp4 (8)  REG2(r, nTEMP16,  0)<1>:f          ufROW_YUVA(6, 0)<0;8,1>         COEF_REG<0;4,1>:f 
+    mov.sat (2)  REG2(r, nTEMP14,  0)<1>:ud     REG2(r, nTEMP16, 0)<0;2,4>:f
+    mov (2)  REG2(r, nTEMP12, 12)<1>:ub         REG2(r, nTEMP14, 0)<0;2,4>:ub
+    
+    dp4 (8)  REG2(r, nTEMP16,  0)<1>:f          ufROW_YUVA(7, 0)<0;8,1>         COEF_REG<0;4,1>:f 
+    mov.sat (2)  REG2(r, nTEMP14,  0)<1>:ud     REG2(r, nTEMP16, 0)<0;2,4>:f
+    mov (2)  REG2(r, nTEMP12, 14)<1>:ub         REG2(r, nTEMP14, 0)<0;2,4>:ub
+
+    // ####  write the 16 red bytes to byte CHANNEL of each 4-byte pixel of the output row
+    mov (8) ubDEST_RGBX(0,ROW_NUM_WRITE*64+CHANNEL   )<4>  REG2(r,nTEMP12, 0)<0;8,1>:ub
+    mov (8) ubDEST_RGBX(0,ROW_NUM_WRITE*64+CHANNEL+32)<4>  REG2(r,nTEMP12, 8)<0;8,1>:ub
+    
+        // ######## do one row for Green: dot every pixel's four floats with the green coefficients ########
+    #define COEF_REG  REG2(r, nTEMP8, 4)  // reg for green coefficient
+    #define CHANNEL   1
+    // ##### each dp4 (result in nTEMP16) covers one ufROW_YUVA register = 2 pixels; results are packed as bytes (ub) into nTEMP12
+    dp4 (8)  REG2(r, nTEMP16,  0)<1>:f          ufROW_YUVA(0, 0)<0;8,1>   COEF_REG<0;4,1>:f 
+    mov.sat (2)  REG2(r, nTEMP14,  0)<1>:ud     REG2(r, nTEMP16, 0)<0;2,4>:f
+    mov (2)  REG2(r, nTEMP12,  0)<1>:ub         REG2(r, nTEMP14, 0)<0;2,4>:ub      // todo for saturation
+    // NOTE(review): mov.sat clamps to the :ud range only and the byte move keeps just byte 0 of each ud, so results above 255 are not clamped
+  #if (ONE_ROW_DEBUG)
+    // write dp4 (raw float) of 2 pixel to the 4/5th row
+    mov (8) ubDEST_RGBX(0,(DBG_ROWNUM_BASE+2)*64+CHANNEL   )<4>  REG2(r,nTEMP16,  0)<0;8,1>:ub
+    mov (8) ubDEST_RGBX(0,(DBG_ROWNUM_BASE+2)*64+CHANNEL+32)<4>  REG2(r,nTEMP16,  8)<0;8,1>:ub
+    mov (8) ubDEST_RGBX(0,(DBG_ROWNUM_BASE+3)*64+CHANNEL   )<4>  REG2(r,nTEMP16, 16)<0;8,1>:ub
+    mov (8) ubDEST_RGBX(0,(DBG_ROWNUM_BASE+3)*64+CHANNEL+32)<4>  REG2(r,nTEMP16, 24)<0;8,1>:ub
+
+    // write dp4 (convert float to ud first, write whole ud) of 2 pixel to the 6/7th row
+    mov (8)  REG2(r, nTEMP17,  0)<1>:d     REG2(r, nTEMP16, 0)<0;8,1>:f
+    mov (8) ubDEST_RGBX(0,(DBG_ROWNUM_BASE+4)*64+CHANNEL   )<4>  REG2(r,nTEMP17,  0)<0;8,1>:ub
+    mov (8) ubDEST_RGBX(0,(DBG_ROWNUM_BASE+4)*64+CHANNEL+32)<4>  REG2(r,nTEMP17,  8)<0;8,1>:ub
+    mov (8) ubDEST_RGBX(0,(DBG_ROWNUM_BASE+5)*64+CHANNEL   )<4>  REG2(r,nTEMP17, 16)<0;8,1>:ub
+    mov (8) ubDEST_RGBX(0,(DBG_ROWNUM_BASE+5)*64+CHANNEL+32)<4>  REG2(r,nTEMP17, 24)<0;8,1>:ub
+  #endif    
+
+    dp4 (8)  REG2(r, nTEMP16,  0)<1>:f          ufROW_YUVA(1, 0)<0;8,1>         COEF_REG<0;4,1>:f 
+    mov.sat (2)  REG2(r, nTEMP14,  0)<1>:ud     REG2(r, nTEMP16, 0)<0;2,4>:f
+    mov (2)  REG2(r, nTEMP12,  2)<1>:ub         REG2(r, nTEMP14, 0)<0;2,4>:ub
+    
+    dp4 (8)  REG2(r, nTEMP16,  0)<1>:f          ufROW_YUVA(2, 0)<0;8,1>         COEF_REG<0;4,1>:f 
+    mov.sat (2)  REG2(r, nTEMP14,  0)<1>:ud     REG2(r, nTEMP16, 0)<0;2,4>:f
+    mov (2)  REG2(r, nTEMP12,  4)<1>:ub         REG2(r, nTEMP14, 0)<0;2,4>:ub
+    
+    dp4 (8)  REG2(r, nTEMP16,  0)<1>:f          ufROW_YUVA(3, 0)<0;8,1>         COEF_REG<0;4,1>:f 
+    mov.sat (2)  REG2(r, nTEMP14,  0)<1>:ud     REG2(r, nTEMP16, 0)<0;2,4>:f
+    mov (2)  REG2(r, nTEMP12,  6)<1>:ub         REG2(r, nTEMP14, 0)<0;2,4>:ub
+    
+    dp4 (8)  REG2(r, nTEMP16,  0)<1>:f          ufROW_YUVA(4, 0)<0;8,1>         COEF_REG<0;4,1>:f 
+    mov.sat (2)  REG2(r, nTEMP14,  0)<1>:ud     REG2(r, nTEMP16, 0)<0;2,4>:f
+    mov (2)  REG2(r, nTEMP12,  8)<1>:ub         REG2(r, nTEMP14, 0)<0;2,4>:ub
+    // registers 5 and 6 use the same <0;8,1> source region as every other register of the row
+    dp4 (8)  REG2(r, nTEMP16,  0)<1>:f          ufROW_YUVA(5, 0)<0;8,1>         COEF_REG<0;4,1>:f 
+    mov.sat (2)  REG2(r, nTEMP14,  0)<1>:ud     REG2(r, nTEMP16, 0)<0;2,4>:f
+    mov (2)  REG2(r, nTEMP12, 10)<1>:ub         REG2(r, nTEMP14, 0)<0;2,4>:ub
+    
+    dp4 (8)  REG2(r, nTEMP16,  0)<1>:f          ufROW_YUVA(6, 0)<0;8,1>         COEF_REG<0;4,1>:f 
+    mov.sat (2)  REG2(r, nTEMP14,  0)<1>:ud     REG2(r, nTEMP16, 0)<0;2,4>:f
+    mov (2)  REG2(r, nTEMP12, 12)<1>:ub         REG2(r, nTEMP14, 0)<0;2,4>:ub
+    
+    dp4 (8)  REG2(r, nTEMP16,  0)<1>:f          ufROW_YUVA(7, 0)<0;8,1>         COEF_REG<0;4,1>:f 
+    mov.sat (2)  REG2(r, nTEMP14,  0)<1>:ud     REG2(r, nTEMP16, 0)<0;2,4>:f
+    mov (2)  REG2(r, nTEMP12, 14)<1>:ub         REG2(r, nTEMP14, 0)<0;2,4>:ub
+
+    // ####  write the 16 green bytes to byte CHANNEL of each 4-byte pixel of the output row
+    mov (8) ubDEST_RGBX(0,ROW_NUM_WRITE*64+CHANNEL   )<4>  REG2(r,nTEMP12, 0)<0;8,1>:ub
+    mov (8) ubDEST_RGBX(0,ROW_NUM_WRITE*64+CHANNEL+32)<4>  REG2(r,nTEMP12, 8)<0;8,1>:ub
+    
+    // ###### do one row for Blue channel: dot every pixel's four floats with the blue coefficients
+    #define COEF_REG  REG2(r, nTEMP10, 0)  // reg for Blue coefficient
+    #define CHANNEL   0
+    // ##### each dp4 (result in nTEMP16) covers one ufROW_YUVA register = 2 pixels; results are packed as bytes (ub) into nTEMP12
+    dp4 (8)  REG2(r, nTEMP16,  0)<1>:f          ufROW_YUVA(0, 0)<0;8,1>   COEF_REG<0;4,1>:f 
+    mov.sat (2)  REG2(r, nTEMP14,  0)<1>:ud     REG2(r, nTEMP16, 0)<0;2,4>:f
+    mov (2)  REG2(r, nTEMP12,  0)<1>:ub         REG2(r, nTEMP14, 0)<0;2,4>:ub      // todo for saturation
+    // NOTE(review): mov.sat clamps to the :ud range only and the byte move keeps just byte 0 of each ud, so results above 255 are not clamped
+  #if (ONE_ROW_DEBUG)
+    // write dp4 (raw float) of 2 pixel to the 4/5th row
+    mov (8) ubDEST_RGBX(0,(DBG_ROWNUM_BASE+2)*64+CHANNEL   )<4>  REG2(r,nTEMP16,  0)<0;8,1>:ub
+    mov (8) ubDEST_RGBX(0,(DBG_ROWNUM_BASE+2)*64+CHANNEL+32)<4>  REG2(r,nTEMP16,  8)<0;8,1>:ub
+    mov (8) ubDEST_RGBX(0,(DBG_ROWNUM_BASE+3)*64+CHANNEL   )<4>  REG2(r,nTEMP16, 16)<0;8,1>:ub
+    mov (8) ubDEST_RGBX(0,(DBG_ROWNUM_BASE+3)*64+CHANNEL+32)<4>  REG2(r,nTEMP16, 24)<0;8,1>:ub
+
+    // write dp4 (convert float to ud first, write whole ud) of 2 pixel to the 6/7th row
+    mov (8)  REG2(r, nTEMP17,  0)<1>:d     REG2(r, nTEMP16, 0)<0;8,1>:f
+    mov (8) ubDEST_RGBX(0,(DBG_ROWNUM_BASE+4)*64+CHANNEL   )<4>  REG2(r,nTEMP17,  0)<0;8,1>:ub
+    mov (8) ubDEST_RGBX(0,(DBG_ROWNUM_BASE+4)*64+CHANNEL+32)<4>  REG2(r,nTEMP17,  8)<0;8,1>:ub
+    mov (8) ubDEST_RGBX(0,(DBG_ROWNUM_BASE+5)*64+CHANNEL   )<4>  REG2(r,nTEMP17, 16)<0;8,1>:ub
+    mov (8) ubDEST_RGBX(0,(DBG_ROWNUM_BASE+5)*64+CHANNEL+32)<4>  REG2(r,nTEMP17, 24)<0;8,1>:ub
+  #endif    
+
+    dp4 (8)  REG2(r, nTEMP16,  0)<1>:f          ufROW_YUVA(1, 0)<0;8,1>         COEF_REG<0;4,1>:f 
+    mov.sat (2)  REG2(r, nTEMP14,  0)<1>:ud     REG2(r, nTEMP16, 0)<0;2,4>:f
+    mov (2)  REG2(r, nTEMP12,  2)<1>:ub         REG2(r, nTEMP14, 0)<0;2,4>:ub
+    
+    dp4 (8)  REG2(r, nTEMP16,  0)<1>:f          ufROW_YUVA(2, 0)<0;8,1>         COEF_REG<0;4,1>:f 
+    mov.sat (2)  REG2(r, nTEMP14,  0)<1>:ud     REG2(r, nTEMP16, 0)<0;2,4>:f
+    mov (2)  REG2(r, nTEMP12,  4)<1>:ub         REG2(r, nTEMP14, 0)<0;2,4>:ub
+    
+    dp4 (8)  REG2(r, nTEMP16,  0)<1>:f          ufROW_YUVA(3, 0)<0;8,1>         COEF_REG<0;4,1>:f 
+    mov.sat (2)  REG2(r, nTEMP14,  0)<1>:ud     REG2(r, nTEMP16, 0)<0;2,4>:f
+    mov (2)  REG2(r, nTEMP12,  6)<1>:ub         REG2(r, nTEMP14, 0)<0;2,4>:ub
+    
+    dp4 (8)  REG2(r, nTEMP16,  0)<1>:f          ufROW_YUVA(4, 0)<0;8,1>         COEF_REG<0;4,1>:f 
+    mov.sat (2)  REG2(r, nTEMP14,  0)<1>:ud     REG2(r, nTEMP16, 0)<0;2,4>:f
+    mov (2)  REG2(r, nTEMP12,  8)<1>:ub         REG2(r, nTEMP14, 0)<0;2,4>:ub
+    // registers 5 and 6 use the same <0;8,1> source region as every other register of the row
+    dp4 (8)  REG2(r, nTEMP16,  0)<1>:f          ufROW_YUVA(5, 0)<0;8,1>         COEF_REG<0;4,1>:f 
+    mov.sat (2)  REG2(r, nTEMP14,  0)<1>:ud     REG2(r, nTEMP16, 0)<0;2,4>:f
+    mov (2)  REG2(r, nTEMP12, 10)<1>:ub         REG2(r, nTEMP14, 0)<0;2,4>:ub
+    
+    dp4 (8)  REG2(r, nTEMP16,  0)<1>:f          ufROW_YUVA(6, 0)<0;8,1>         COEF_REG<0;4,1>:f 
+    mov.sat (2)  REG2(r, nTEMP14,  0)<1>:ud     REG2(r, nTEMP16, 0)<0;2,4>:f
+    mov (2)  REG2(r, nTEMP12, 12)<1>:ub         REG2(r, nTEMP14, 0)<0;2,4>:ub
+    
+    dp4 (8)  REG2(r, nTEMP16,  0)<1>:f          ufROW_YUVA(7, 0)<0;8,1>         COEF_REG<0;4,1>:f 
+    mov.sat (2)  REG2(r, nTEMP14,  0)<1>:ud     REG2(r, nTEMP16, 0)<0;2,4>:f
+    mov (2)  REG2(r, nTEMP12, 14)<1>:ub         REG2(r, nTEMP14, 0)<0;2,4>:ub
+
+    // ####  write the 16 blue bytes to byte CHANNEL of each 4-byte pixel of the output row
+    mov (8) ubDEST_RGBX(0,ROW_NUM_WRITE*64+CHANNEL   )<4>  REG2(r,nTEMP12, 0)<0;8,1>:ub
+    mov (8) ubDEST_RGBX(0,ROW_NUM_WRITE*64+CHANNEL+32)<4>  REG2(r,nTEMP12, 8)<0;8,1>:ub
+#if (!ONE_ROW_DEBUG)
+    }
+#endif    
diff --git a/src/shaders/post_processing/gen5_6/Common/YUV_to_RGBX_Coef.asm b/src/shaders/post_processing/gen5_6/Common/YUV_to_RGBX_Coef.asm
new file mode 100755
index 0000000..50c0a39
--- /dev/null
+++ b/src/shaders/post_processing/gen5_6/Common/YUV_to_RGBX_Coef.asm
@@ -0,0 +1,65 @@
+/*
+ * All Video Processing kernels 
+ * Copyright © <2010>, Intel Corporation.
+ *
+ * This program is licensed under the terms and conditions of the
+ * Eclipse Public License (EPL), version 1.0.  The full text of the EPL is at
+ * http://www.opensource.org/licenses/eclipse-1.0.php.
+ *
+ * Authors:
+ *    Halley Zhao <halley.zhao at intel.com>
+ */
+
+// Module name: YUV_to_RGBX_Coef.asm
+//----------------------------------------------------------------
+#define ubDEST_RGBX          ubTOP_Y       // reuse the ubTOP_Y region as the RGBX destination for color conversion
+
+#ifdef FIX_POINT_CONVERSION
+    // ###### set up transformation coefficients: 16-bit words packed as [C, D, E, 0] per channel in nTEMP0
+        // R = clip((   298 * C +   0 * D +    409 * E + 128) >> 8)
+        // R = clip((0x012A * C +   0 * D + 0x0199 * E + 128) >> 8)
+    mov (1) REG2(r, nTEMP0, 0):ud       0x0000012A:ud      // R: low word C = 298, high word D = 0
+    mov (1) REG2(r, nTEMP0, 1):ud       0x00000199:ud      // R: low word E = 409, high word unused (0)
+        // G = clip((    298 * C -    100 * D -    208 * E + 128) >> 8)
+        // G = clip(( 0x012A * C -   0x64 * D -   0xD0 * E + 128) >> 8)
+        // G = clip(( 0x012A * C + 0xFF9C * D + 0xFF30 * E + 128) >> 8)
+    mov (1) REG2(r, nTEMP0, 2):ud       0xFF9C012A:ud      // G: low word C = 298, high word D = -100 (0xFF9C)
+    mov (1) REG2(r, nTEMP0, 3):ud       0x0000FF30:ud      // G: low word E = -208 (0xFF30), high word unused (0)
+        // B = clip((  298 * C +    516 * D +   0 * E + 128) >> 8) 
+        // B = clip((0x012A* C + 0x0204 * D +   0 * E + 128) >> 8) 
+    mov (1) REG2(r, nTEMP0, 4):ud       0x0204012A:ud      // B: low word C = 298, high word D = 516
+    mov (1) REG2(r, nTEMP0, 5):ud       0x00000000:ud      // B: low word E = 0, high word unused (0)
+
+    // asr.sat (24) REG2(r,nTEMP0,0)<1>    REG2(r,nTEMP0,0)<0;24,1>    1:w
+    asr.sat (8) REG2(r,nTEMP0, 0)<1>:w    REG2(r,nTEMP0, 0)<0;8,1>:w    1:w    // arithmetic shift right by 1 of words 0..7 (the R and G coefficients)
+    asr.sat (4)  REG2(r,nTEMP0,8)<1>:w    REG2(r,nTEMP0,8)<0;4,1>:w    1:w    // same for words 8..11 (the B coefficients)
+    // NOTE(review): after the shifts the table holds coefficient/2; presumably the consumer compensates -- confirm in YUVX_Save_RGBX_Fix.asm
+        // C = Y' - 16          D = U - 128         E = V - 128
+    mov (1) REG2(r, nTEMP0, 6):ud       0x008080F0:ud      // bytes low->high: 0xF0, 0x80, 0x80, 0x00 = -16, -128, -128, 0 as signed bytes
+#else
+        // R = Y             + 1.13983*V                    (analog YUV form, kept for reference)
+        // R = clip( Y                  + 1.402*(Cr-128))  // ITU-R (these are the coefficients loaded below)
+    mov (1) REG2(r, nTEMP8, 3):f       0.000f       // A coef
+    mov (1) REG2(r, nTEMP8, 2):f       1.402f       // V coef
+    mov (1) REG2(r, nTEMP8, 1):f       0.0f         // U coef
+    mov (1) REG2(r, nTEMP8, 0):f       1.0f         // Y coef
+    
+        // G = Y - 0.39465*U - 0.58060*V                    (analog YUV form, kept for reference)
+        // G = clip( Y - 0.344*(Cb-128) - 0.714*(Cr-128))   (coefficients loaded below)
+    mov (1) REG2(r, nTEMP8, 7):f       0.000f       // A coef
+    mov (1) REG2(r, nTEMP8, 6):f      -0.714f       // V coef
+    mov (1) REG2(r, nTEMP8, 5):f      -0.344f       // U coef
+    mov (1) REG2(r, nTEMP8, 4):f       1.0f         // Y coef
+
+        // B = Y + 2.03211*U                                (analog YUV form, kept for reference)
+        // B = clip( Y + 1.772*(Cb-128))                    (coefficients loaded below)
+    mov (1) REG2(r, nTEMP10, 3):f       0.000f      // A coef
+    mov (1) REG2(r, nTEMP10, 2):f       0.0f        // V coef
+    mov (1) REG2(r, nTEMP10, 1):f       1.772f      // U coef
+    mov (1) REG2(r, nTEMP10, 0):f       1.0f        // Y coef
+
+    mov (1) REG2(r, nTEMP10,  4):ud         0x008080F0:ud      // bytes low->high: 0xF0, 0x80, 0x80, 0x00 = -16, -128, -128, 0 as signed bytes (same constant as the fixed-point path)
+    
+    .declare ufROW_YUVA       Base=REG(r,nTEMP0) ElementSize=4 SrcRegion=REGION(8,8) Type=f    // float view of the working rows: r nTEMP0 - r nTEMP7
+
+#endif
diff --git a/src/shaders/post_processing/gen5_6/Makefile.am b/src/shaders/post_processing/gen5_6/Makefile.am
index 8658938..052ee53 100755
--- a/src/shaders/post_processing/gen5_6/Makefile.am
+++ b/src/shaders/post_processing/gen5_6/Makefile.am
@@ -21,6 +21,7 @@ INTEL_PP_G4B_GEN5 = \
 	pa_load_save_nv12.g4b.gen5				\
 	pa_load_save_pl3.g4b.gen5				\
 	rgbx_load_save_nv12.g4b.gen5				\
+	nv12_load_save_rgbx.g4b.gen5				\
 	$(NULL)
 
 INTEL_PP_G6B = \
@@ -37,6 +38,7 @@ INTEL_PP_G6B = \
 	pa_load_save_nv12.g6b				\
 	pa_load_save_pl3.g6b				\
 	rgbx_load_save_nv12.g6b				\
+	nv12_load_save_rgbx.g6b				\
 	$(NULL)
 
 INTEL_PP_ASM = \
@@ -53,6 +55,7 @@ INTEL_PP_ASM = \
 	pa_load_save_nv12.asm				\
 	pa_load_save_pl3.asm				\
 	rgbx_load_save_nv12.asm				\
+	nv12_load_save_rgbx.asm				\
 	$(NULL)
 
 INTEL_PP_ASM += \
@@ -93,6 +96,9 @@ INTEL_PP_ASM += \
 	Common/RGBX_to_YUV_Coef.asm			\
 	Common/RGBX_Save_YUV_Fix.asm		\
 	Common/RGBX_Save_YUV_Float.asm		\
+	Common/YUV_to_RGBX_Coef.asm			\
+	Common/YUVX_Save_RGBX_Fix.asm       \
+	Common/YUVX_Save_RGBX_Float.asm     \
 	Common/SetupVPKernel.asm			\
 	Common/readSampler16x1.asm			\
 	Core_Kernels/AVS_SetupFirstBlock.asm		\
diff --git a/src/shaders/post_processing/gen5_6/nv12_load_save_rgbx.asm b/src/shaders/post_processing/gen5_6/nv12_load_save_rgbx.asm
new file mode 100755
index 0000000..72cd96b
--- /dev/null
+++ b/src/shaders/post_processing/gen5_6/nv12_load_save_rgbx.asm
@@ -0,0 +1,25 @@
+// Module name: NV12_LOAD_SAVE_RGBX -- kernel assembled from the load, conversion and save include files below
+.kernel NV12_LOAD_SAVE_RGBX
+.code
+#define FIX_POINT_CONVERSION
+// #define FLOAT_POINT_CONVERSION   (never tested below: the float path is taken whenever FIX_POINT_CONVERSION is undefined)
+
+#include "SetupVPKernel.asm"
+#include "YUV_to_RGBX_Coef.asm"       // coefficient setup; its contents also switch on FIX_POINT_CONVERSION
+#include "Multiple_Loop_Head.asm"
+#include "NV12_Load_8x4.asm"
+#ifdef FIX_POINT_CONVERSION
+  #include "YUVX_Save_RGBX_Fix.asm"   // fixed-point conversion body
+#else
+  #include "YUVX_Save_RGBX_Float.asm" // floating-point conversion body
+#endif
+#include "RGB16x8_Save_RGB.asm"
+#include "Multiple_Loop.asm"
+
+END_THREAD  // End of Thread
+
+.end_code  
+
+.end_kernel
+
+// end of nv12_load_save_rgbx.asm
-- 
1.7.4.1



More information about the Libva mailing list