[Libva] [PATCH 1/4] add GPU shader for RGB->NV12 conversion
Xiang, Haihao
haihao.xiang at intel.com
Mon Jul 16 18:45:09 PDT 2012
This shader only handles BGR->NV12, is that right? Is there a way to
reuse this shader for other RGB formats, for example by passing the RGB
format to the thread?
+ mul (16) REG2(r, nTEMP4, 0)<1>:uw r[SRC_RGBA_OFFSET_1, %1*32 + 0]<0; 16,1>:ub REG2(r, nTEMP0, 0)<0;4,1>:ub
+ mul (16) REG2(r, nTEMP5, 0)<1>:uw r[SRC_RGBA_OFFSET_1, %1*32 + 16]<0; 16,1>:ub REG2(r, nTEMP0, 0)<0;4,1>:ub
Could you give names to the coefficient registers, such as REG2(r, nTEMP0, 0), so the code is more readable?
Thanks
Haihao
> ---
> .../gen5_6/Common/RGBX_Load_16x8.asm | 57 ++
> .../gen5_6/Common/RGBX_Load_16x8.inc | 48 ++
> .../gen5_6/Common/RGBX_Save_YUV_Fix.asm | 115 ++++
> .../gen5_6/Common/RGBX_Save_YUV_Float.asm | 152 +++++
> .../gen5_6/Common/RGBX_to_YUV_Coef.asm | 43 ++
> src/shaders/post_processing/gen5_6/Makefile.am | 8 +
> .../post_processing/gen5_6/rgbx_load_save_nv12.asm | 26 +
> .../gen5_6/rgbx_load_save_nv12.g4b.gen5 | 562 +++++++++++++++++
> .../post_processing/gen5_6/rgbx_load_save_nv12.g6b | 635 ++++++++++++++++++++
> 9 files changed, 1646 insertions(+), 0 deletions(-)
> create mode 100755 src/shaders/post_processing/gen5_6/Common/RGBX_Load_16x8.asm
> create mode 100755 src/shaders/post_processing/gen5_6/Common/RGBX_Load_16x8.inc
> create mode 100755 src/shaders/post_processing/gen5_6/Common/RGBX_Save_YUV_Fix.asm
> create mode 100755 src/shaders/post_processing/gen5_6/Common/RGBX_Save_YUV_Float.asm
> create mode 100755 src/shaders/post_processing/gen5_6/Common/RGBX_to_YUV_Coef.asm
> create mode 100755 src/shaders/post_processing/gen5_6/rgbx_load_save_nv12.asm
> create mode 100644 src/shaders/post_processing/gen5_6/rgbx_load_save_nv12.g4b.gen5
> create mode 100644 src/shaders/post_processing/gen5_6/rgbx_load_save_nv12.g6b
>
> diff --git a/src/shaders/post_processing/gen5_6/Common/RGBX_Load_16x8.asm b/src/shaders/post_processing/gen5_6/Common/RGBX_Load_16x8.asm
> new file mode 100755
> index 0000000..958308a
> --- /dev/null
> +++ b/src/shaders/post_processing/gen5_6/Common/RGBX_Load_16x8.asm
> @@ -0,0 +1,57 @@
> +/*
> + * All Video Processing kernels
> + * Copyright © <2010>, Intel Corporation.
> + *
> + * This program is licensed under the terms and conditions of the
> + * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
> + * http://www.opensource.org/licenses/eclipse-1.0.php.
> + *
> + */
> +
> +// Module name: RGBX_Load_16x8.asm (copied from AYUV_Load_16x8.asm)
> +//----------------------------------------------------------------
> +
> +
> +#include "RGBX_Load_16x8.inc"
> +
> +// In order to load 64x8 RGBA data (16x8 pixels), we need to divide the data
> +// into two regions and load them separately.
> +//
> +// 32 byte 32 byte
> +//|----------------|----------------|
> +//| | |
> +//| A | B |8
> +//| | |
> +//| | |
> +//|----------------|----------------|
> +
> +// Load the first 32x8 data block (block A in the diagram above)
> +// Packed data block is loaded as a 32-byte x 8-row block, i.e. 8 four-byte pixels per row
> + add (2) rMSGSRC.0<1>:d wORIX<2;2,1>:w wSRC_H_ORI_OFFSET<2;2,1>:w // Source block origin (X and Y) = block origin + source origin offset
> + shl (1) rMSGSRC.0<1>:d rMSGSRC.0<0;1,0>:w 2:w { NoDDClr } // Horizontal origin is multiplied by 4 (shift left by 2): the source is 4 bytes per pixel
> + mov (1) rMSGSRC.2<1>:ud nDPR_BLOCK_SIZE_RGBA:ud { NoDDChk } // Block width and height (32x8)
> + mov (8) mMSGHDRY<1>:ud rMSGSRC<8;8,1>:ud
> + send (8) udSRC_RGBA(0)<1> mMSGHDRY udDUMMY_NULL nDATAPORT_READ nDPMR_MSGDSC+nDPR_MSG_SIZE_RGBA+nBI_CURRENT_SRC_YUV:ud
> +
> +// Load the second 32x8 data block (block B in the diagram above)
> +// Offset the origin X - move to the next 32 byte columns
> + add (1) rMSGSRC.0<1>:d rMSGSRC.0<0;1,0>:d 32:w // Advance X origin by 32 bytes (8 pixels)
> +
> +// Size stays the same - 32x8
> + mov (8) mMSGHDRY<1>:ud rMSGSRC<8;8,1>:ud // Copy message description to message header
> + send (8) udSRC_RGBA(8)<1> mMSGHDRY udDUMMY_NULL nDATAPORT_READ nDPMR_MSGDSC+nDPR_MSG_SIZE_RGBA+nBI_CURRENT_SRC_YUV:ud
> +
> +// Give the RGBA source region addresses to the address register (wording inherited from the AYUV loader)
> + // a0.0 is 0x38*32, a0.1 is 0x40*32. 0x40-0x38=8, presumably the 8 registers between the two halves (second half is read into udSRC_RGBA(8)) - TODO confirm
> + mov (1) SRC_RGBA_OFFSET<1>:ud 0x00400038*32:ud // One dword write sets both a0.0 (low word) and a0.1 (high word): starting addresses of the two halves
> +
> +#if !defined(FIX_POINT_CONVERSION) && !defined(FLOAT_POINT_CONVERSION)
> + // No conversion selected: directly move the source bytes to the destination planes
> + $for(0; <nY_NUM_OF_ROWS; 1) {
> + // 8 means 8 elements, not 2=8/2 element per row.
> + mov (16) uwDEST_Y(%1)<1> r[SRC_RGBA_OFFSET,%1*32+3]<8,4>:ub // byte 3 of each 4-byte pixel (original tag: A/R; NOTE(review): byte 3 is X for BGRX input - confirm)
> + mov (16) uwDEST_U(%1)<1> r[SRC_RGBA_OFFSET,%1*32+2]<8,4>:ub // byte 2 of each 4-byte pixel (original tag: Y/G)
> + mov (16) uwDEST_V(%1)<1> r[SRC_RGBA_OFFSET,%1*32+1]<8,4>:ub // byte 1 of each 4-byte pixel (original tag: U/B)
> + }
> +#endif
> +
> diff --git a/src/shaders/post_processing/gen5_6/Common/RGBX_Load_16x8.inc b/src/shaders/post_processing/gen5_6/Common/RGBX_Load_16x8.inc
> new file mode 100755
> index 0000000..7199d64
> --- /dev/null
> +++ b/src/shaders/post_processing/gen5_6/Common/RGBX_Load_16x8.inc
> @@ -0,0 +1,48 @@
> +/*
> + * All Video Processing kernels
> + * Copyright © <2010>, Intel Corporation.
> + *
> + * This program is licensed under the terms and conditions of the
> + * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
> + * http://www.opensource.org/licenses/eclipse-1.0.php.
> + *
> + */
> +#ifndef RGBA_LOAD_16X8_INC
> +#define RGBA_LOAD_16X8_INC
> +
> +// Module name: RGBX_Load_16x8.inc
> +//
> +// RGBA data is first loaded into the bottom I/O REGION_2, then converted from RGB to YUV;
> +// finally, the YUV data is stored in the top I/O REGION_1 in planar format
> +
> +#undef nY_NUM_OF_ROWS
> +
> +#define nY_NUM_OF_ROWS 8 // Number of Y rows per block
> +
> +#define nDPR_BLOCK_SIZE_RGBA nBLOCK_WIDTH_32+nBLOCK_HEIGHT_8 // RGBA block size 32x8 (half of the 64-byte-wide row; two reads are issued)
> +#define nDPR_MSG_SIZE_RGBA nRESLEN_8 // # of MRF's to hold RGBA block data (8)
> +
> +// Temporary storage for unpacked data (inherited from the AYUV loader).
> +// NOTE(review): this aliases nTEMP0, which RGBX_to_YUV_Coef.asm also uses for the fixed-point coefficients - confirm it is unused in that path
> +#define rUNPACK_TEMP REG(r,nTEMP0)
> +.declare udUNPACK_TEMP Base=rUNPACK_TEMP ElementSize=4 SrcRegion=<8;8,1> Type=ud //1 GRF
> +.declare ubUNPACK_TEMP Base=rUNPACK_TEMP ElementSize=1 SrcRegion=<32;32,1> Type=ub //1 GRF
> +
> +.declare ubBOT_Y_IO Base=REG(r,nBOT_Y) ElementSize=1 SrcRegion=REGION(32,1) Type=ub
> +
> +
> +#define udSRC_RGBA udBOT_Y_IO
> +#define ubSRC_RGBA ubBOT_Y_IO
> +#define nSRC_RGBA_REG nBOT_Y
> +
> +#define uwDEST_Y uwTOP_Y
> +#define uwDEST_U uwTOP_U
> +#define uwDEST_V uwTOP_V
> +
> +#define SRC_RGBA_OFFSET a0.0 // written as one dword, so it sets a0.0 and a0.1 together
> +#define SRC_RGBA_OFFSET_1 a0.0 // start of the first 32-byte half of each row
> +#define SRC_RGBA_OFFSET_2 a0.1 // start of the second 32-byte half of each row
> +
> +#define nSRC_REGION nREGION_1 // REGION_1 will be the source region for first kernel
> +
> +// End of RGBX_Load_16x8.inc
> +#endif
> \ No newline at end of file
> diff --git a/src/shaders/post_processing/gen5_6/Common/RGBX_Save_YUV_Fix.asm b/src/shaders/post_processing/gen5_6/Common/RGBX_Save_YUV_Fix.asm
> new file mode 100755
> index 0000000..f60a2a0
> --- /dev/null
> +++ b/src/shaders/post_processing/gen5_6/Common/RGBX_Save_YUV_Fix.asm
> @@ -0,0 +1,115 @@
> +/*
> + * All Video Processing kernels
> + * Copyright © <2010>, Intel Corporation.
> + *
> + * This program is licensed under the terms and conditions of the
> + * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
> + * http://www.opensource.org/licenses/eclipse-1.0.php.
> + *
> + * Authors:
> + * Halley Zhao <halley.zhao at intel.com>
> + */
> +
> +// Module name: RGBX_Save_YUV_Fix.asm
> +//----------------------------------------------------------------
> +
> +#include "RGBX_Load_16x8.inc"
> +
> +#if (0)
> + #define nTEMP0 34 // transformation coefficient
> + #define nTEMP1 35 // one row of Y (first half register is used)
> + #define nTEMP2 36 // first half of one row
> + #define nTEMP3 37 // second half of one row
> + #define nTEMP4 38 // mul and add
> + #define nTEMP5 39 // mul and add
> + #define nTEMP6 40 // mul and add
> + #define nTEMP7 41 // mul and add
> + #define nTEMP8 42 // sum of mul
> + #define nTEMP10 44
> + #define nTEMP12 46
> + #define nTEMP14 48
> + #define nTEMP16 50
> + #define nTEMP17 51
> + #define nTEMP18 52
> +
> + #define nTEMP24 58
> +#endif
> +
> +$for(0; <nY_NUM_OF_ROWS; 1) {
> + // Source pixel byte order (BGRX): | B | G | R | X |
> + // ###### do one row for Y
> + // #### mul and add: multiply every byte of 4 pixels by the 4 coefficient bytes (repeated per pixel), then sum the B, G and R products of each pixel
> + mul (16) REG2(r, nTEMP4, 0)<1>:uw r[SRC_RGBA_OFFSET_1, %1*32 + 0]<0; 16,1>:ub REG2(r, nTEMP0, 0)<0;4,1>:ub // pixels 0-3 (first half, bytes 0-15)
> + mul (16) REG2(r, nTEMP5, 0)<1>:uw r[SRC_RGBA_OFFSET_1, %1*32 + 16]<0; 16,1>:ub REG2(r, nTEMP0, 0)<0;4,1>:ub // pixels 4-7 (first half, bytes 16-31)
> + mul (16) REG2(r, nTEMP6, 0)<1>:uw r[SRC_RGBA_OFFSET_2, %1*32 + 0]<0; 16,1>:ub REG2(r, nTEMP0, 0)<0;4,1>:ub // pixels 8-11 (second half, bytes 0-15)
> + mul (16) REG2(r, nTEMP7, 0)<1>:uw r[SRC_RGBA_OFFSET_2, %1*32 + 16]<0; 16,1>:ub REG2(r, nTEMP0, 0)<0;4,1>:ub // pixels 12-15 (second half, bytes 16-31)
> +
> + add (4) REG2(r, nTEMP4, 0)<4>:uw REG2(r, nTEMP4, 0)<0;4,4>:uw REG2(r, nTEMP4, 1)<0;4,4>:uw // word 0 of each pixel += word 1 (B product + G product)
> + add (4) REG2(r, nTEMP4, 0)<4>:uw REG2(r, nTEMP4, 0)<0;4,4>:uw REG2(r, nTEMP4, 2)<0;4,4>:uw // word 0 of each pixel += word 2 (+ R product); word 3 (X) is never added
> + add (4) REG2(r, nTEMP5, 0)<4>:uw REG2(r, nTEMP5, 0)<0;4,4>:uw REG2(r, nTEMP5, 1)<0;4,4>:uw
> + add (4) REG2(r, nTEMP5, 0)<4>:uw REG2(r, nTEMP5, 0)<0;4,4>:uw REG2(r, nTEMP5, 2)<0;4,4>:uw
> + add (4) REG2(r, nTEMP6, 0)<4>:uw REG2(r, nTEMP6, 0)<0;4,4>:uw REG2(r, nTEMP6, 1)<0;4,4>:uw
> + add (4) REG2(r, nTEMP6, 0)<4>:uw REG2(r, nTEMP6, 0)<0;4,4>:uw REG2(r, nTEMP6, 2)<0;4,4>:uw
> + add (4) REG2(r, nTEMP7, 0)<4>:uw REG2(r, nTEMP7, 0)<0;4,4>:uw REG2(r, nTEMP7, 1)<0;4,4>:uw
> + add (4) REG2(r, nTEMP7, 0)<4>:uw REG2(r, nTEMP7, 0)<0;4,4>:uw REG2(r, nTEMP7, 2)<0;4,4>:uw
> +
> + // #### write Y to the 1 row: gather the 16 per-pixel sums into nTEMP8, add rounding/offset, keep the high byte
> + mov (4) REG2(r, nTEMP8, 0)<1>:uw REG2(r, nTEMP4, 0)<0; 4, 4>:uw
> + mov (4) REG2(r, nTEMP8, 4)<1>:uw REG2(r, nTEMP5, 0)<0; 4, 4>:uw
> + mov (4) REG2(r, nTEMP8, 8)<1>:uw REG2(r, nTEMP6, 0)<0; 4, 4>:uw
> + mov (4) REG2(r, nTEMP8, 12)<1>:uw REG2(r, nTEMP7, 0)<0; 4, 4>:uw
> + add (16) REG2(r, nTEMP8, 0)<1>:uw REG2(r, nTEMP8, 0)<0; 16, 1>:uw 0x1080:uw // 0x1080 = (16 << 8) + 128: the +128 rounding term and the +16 offset applied before the >> 8
> + mov (16) REG2(r, nTEMP8, 0)<1>:ub REG2(r, nTEMP8, 1)<0; 16, 2>:ub // pack byte 1 of every word, presumably the high byte, i.e. the >> 8 of each sum
> + mov (16) uwDEST_Y(%1)<1> REG2(r,nTEMP8, 0)<0;16,1>:ub
> +
> + // ###### do one row for U
> + // #### mul and add: same scheme as Y, with the coefficient bytes at nTEMP0 byte offset 4 read as signed bytes
> + mul (16) REG2(r, nTEMP4, 0)<1>:w r[SRC_RGBA_OFFSET_1, %1*32 + 0]<0; 16,1>:ub REG2(r, nTEMP0, 4)<0;4,1>:b
> + mul (16) REG2(r, nTEMP5, 0)<1>:w r[SRC_RGBA_OFFSET_1, %1*32 + 16]<0; 16,1>:ub REG2(r, nTEMP0, 4)<0;4,1>:b
> + mul (16) REG2(r, nTEMP6, 0)<1>:w r[SRC_RGBA_OFFSET_2, %1*32 + 0]<0; 16,1>:ub REG2(r, nTEMP0, 4)<0;4,1>:b
> + mul (16) REG2(r, nTEMP7, 0)<1>:w r[SRC_RGBA_OFFSET_2, %1*32 + 16]<0; 16,1>:ub REG2(r, nTEMP0, 4)<0;4,1>:b
> +
> + add (4) REG2(r, nTEMP4, 0)<4>:w REG2(r, nTEMP4, 0)<0;4,4>:w REG2(r, nTEMP4, 1)<0;4,4>:w
> + add (4) REG2(r, nTEMP4, 0)<4>:w REG2(r, nTEMP4, 0)<0;4,4>:w REG2(r, nTEMP4, 2)<0;4,4>:w
> + add (4) REG2(r, nTEMP5, 0)<4>:w REG2(r, nTEMP5, 0)<0;4,4>:w REG2(r, nTEMP5, 1)<0;4,4>:w
> + add (4) REG2(r, nTEMP5, 0)<4>:w REG2(r, nTEMP5, 0)<0;4,4>:w REG2(r, nTEMP5, 2)<0;4,4>:w
> + add (4) REG2(r, nTEMP6, 0)<4>:w REG2(r, nTEMP6, 0)<0;4,4>:w REG2(r, nTEMP6, 1)<0;4,4>:w
> + add (4) REG2(r, nTEMP6, 0)<4>:w REG2(r, nTEMP6, 0)<0;4,4>:w REG2(r, nTEMP6, 2)<0;4,4>:w
> + add (4) REG2(r, nTEMP7, 0)<4>:w REG2(r, nTEMP7, 0)<0;4,4>:w REG2(r, nTEMP7, 1)<0;4,4>:w
> + add (4) REG2(r, nTEMP7, 0)<4>:w REG2(r, nTEMP7, 0)<0;4,4>:w REG2(r, nTEMP7, 2)<0;4,4>:w
> +
> + // #### write U to the 1 row
> + mov (4) REG2(r, nTEMP8, 0)<1>:w REG2(r, nTEMP4, 0)<0; 4, 4>:w
> + mov (4) REG2(r, nTEMP8, 4)<1>:w REG2(r, nTEMP5, 0)<0; 4, 4>:w
> + mov (4) REG2(r, nTEMP8, 8)<1>:w REG2(r, nTEMP6, 0)<0; 4, 4>:w
> + mov (4) REG2(r, nTEMP8, 12)<1>:w REG2(r, nTEMP7, 0)<0; 4, 4>:w
> + add (16) REG2(r, nTEMP8, 0)<1>:uw REG2(r, nTEMP8, 0)<0; 16, 1>:w 0x8080:uw // ok? (author's open question: a signed :w source is added into an unsigned :uw destination); 0x8080 = (128 << 8) + 128
> + mov (16) REG2(r, nTEMP8, 0)<1>:ub REG2(r, nTEMP8, 1)<0; 16, 2>:ub
> + mov (16) uwDEST_U(%1)<1> REG2(r,nTEMP8, 0)<0;16,1>:ub
> +
> + // ###### do one row for V
> + // #### mul and add: same scheme as U, with the coefficient bytes at nTEMP0 byte offset 8
> + mul (16) REG2(r, nTEMP4, 0)<1>:w r[SRC_RGBA_OFFSET_1, %1*32 + 0]<0; 16,1>:ub REG2(r, nTEMP0, 8)<0;4,1>:b
> + mul (16) REG2(r, nTEMP5, 0)<1>:w r[SRC_RGBA_OFFSET_1, %1*32 + 16]<0; 16,1>:ub REG2(r, nTEMP0, 8)<0;4,1>:b
> + mul (16) REG2(r, nTEMP6, 0)<1>:w r[SRC_RGBA_OFFSET_2, %1*32 + 0]<0; 16,1>:ub REG2(r, nTEMP0, 8)<0;4,1>:b
> + mul (16) REG2(r, nTEMP7, 0)<1>:w r[SRC_RGBA_OFFSET_2, %1*32 + 16]<0; 16,1>:ub REG2(r, nTEMP0, 8)<0;4,1>:b
> +
> + add (4) REG2(r, nTEMP4, 0)<4>:w REG2(r, nTEMP4, 0)<0;4,4>:w REG2(r, nTEMP4, 1)<0;4,4>:w
> + add (4) REG2(r, nTEMP4, 0)<4>:w REG2(r, nTEMP4, 0)<0;4,4>:w REG2(r, nTEMP4, 2)<0;4,4>:w
> + add (4) REG2(r, nTEMP5, 0)<4>:w REG2(r, nTEMP5, 0)<0;4,4>:w REG2(r, nTEMP5, 1)<0;4,4>:w
> + add (4) REG2(r, nTEMP5, 0)<4>:w REG2(r, nTEMP5, 0)<0;4,4>:w REG2(r, nTEMP5, 2)<0;4,4>:w
> + add (4) REG2(r, nTEMP6, 0)<4>:w REG2(r, nTEMP6, 0)<0;4,4>:w REG2(r, nTEMP6, 1)<0;4,4>:w
> + add (4) REG2(r, nTEMP6, 0)<4>:w REG2(r, nTEMP6, 0)<0;4,4>:w REG2(r, nTEMP6, 2)<0;4,4>:w
> + add (4) REG2(r, nTEMP7, 0)<4>:w REG2(r, nTEMP7, 0)<0;4,4>:w REG2(r, nTEMP7, 1)<0;4,4>:w
> + add (4) REG2(r, nTEMP7, 0)<4>:w REG2(r, nTEMP7, 0)<0;4,4>:w REG2(r, nTEMP7, 2)<0;4,4>:w
> +
> + // #### write V to the 1 row
> + mov (4) REG2(r, nTEMP8, 0)<1>:w REG2(r, nTEMP4, 0)<0; 4, 4>:w
> + mov (4) REG2(r, nTEMP8, 4)<1>:w REG2(r, nTEMP5, 0)<0; 4, 4>:w
> + mov (4) REG2(r, nTEMP8, 8)<1>:w REG2(r, nTEMP6, 0)<0; 4, 4>:w
> + mov (4) REG2(r, nTEMP8, 12)<1>:w REG2(r, nTEMP7, 0)<0; 4, 4>:w
> + add (16) REG2(r, nTEMP8, 0)<1>:uw REG2(r, nTEMP8, 0)<0; 16, 1>:w 0x8080:uw // ok? (same open question as for U)
> + mov (16) REG2(r, nTEMP8, 0)<1>:ub REG2(r, nTEMP8, 1)<0; 16, 2>:ub
> + mov (16) uwDEST_V(%1)<1> REG2(r,nTEMP8, 0)<0;16,1>:ub
> +}
> +
> diff --git a/src/shaders/post_processing/gen5_6/Common/RGBX_Save_YUV_Float.asm b/src/shaders/post_processing/gen5_6/Common/RGBX_Save_YUV_Float.asm
> new file mode 100755
> index 0000000..a771187
> --- /dev/null
> +++ b/src/shaders/post_processing/gen5_6/Common/RGBX_Save_YUV_Float.asm
> @@ -0,0 +1,152 @@
> +/*
> + * All Video Processing kernels
> + * Copyright © <2010>, Intel Corporation.
> + *
> + * This program is licensed under the terms and conditions of the
> + * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
> + * http://www.opensource.org/licenses/eclipse-1.0.php.
> + *
> + * Authors:
> + * Halley Zhao <halley.zhao at intel.com>
> + */
> +
> +// Module name: RGBX_Save_YUV_Float.asm
> +//----------------------------------------------------------------
> +
> +#include "RGBX_Load_16x8.inc"
> +
> +#if (0)
> + // 8 grf reg for one row of pixel (2 pixel per grf)
> + #define nTEMP0 34
> + #define nTEMP1 35
> + #define nTEMP2 36
> + #define nTEMP3 37
> + #define nTEMP4 38
> + #define nTEMP5 39
> + #define nTEMP6 40
> + #define nTEMP7 41
> +
> + #define nTEMP8 42 // transformation coefficient
> + #define nTEMP10 44 // transformation coefficient
> +
> + #define nTEMP12 46 // save Y/U/V in ub format
> + #define nTEMP14 48 // save YUV in ud format
> + #define nTEMP16 50 // dp4 result
> + #define nTEMP17 51
> + #define nTEMP18 52
> +
> + #define nTEMP24 58
> +#endif
> +
> +$for(0; <nY_NUM_OF_ROWS; 1) {
> + // Source pixel byte order (BGRX): | B | G | R | X |
> + // ###### save one row of pixel to temp grf with float format (required by dp4)
> + // nTEMP0..nTEMP3 take pixels 0-7 from the first 32-byte half (SRC_RGBA_OFFSET_1),
> + // nTEMP4..nTEMP7 take pixels 8-15 from the second 32-byte half (SRC_RGBA_OFFSET_2).
> + // GRF k of each half holds two pixels: floats 0-3 come from byte offset 8*k,
> + // floats 4-7 from byte offset 8*k + 4.
> + // mov (8) doesn't work, puzzle
> + mov (4) REG(r, nTEMP0)<1>:f r[SRC_RGBA_OFFSET_1,%1*32 + 0]<4,1>:ub
> + mov (4) REG(r, nTEMP1)<1>:f r[SRC_RGBA_OFFSET_1,%1*32 + 8]<4,1>:ub
> + mov (4) REG(r, nTEMP2)<1>:f r[SRC_RGBA_OFFSET_1,%1*32 + 16]<4,1>:ub
> + mov (4) REG(r, nTEMP3)<1>:f r[SRC_RGBA_OFFSET_1,%1*32 + 24]<4,1>:ub
> + mov (4) REG(r, nTEMP4)<1>:f r[SRC_RGBA_OFFSET_2,%1*32 + 0]<4,1>:ub
> + mov (4) REG(r, nTEMP5)<1>:f r[SRC_RGBA_OFFSET_2,%1*32 + 8]<4,1>:ub
> + mov (4) REG(r, nTEMP6)<1>:f r[SRC_RGBA_OFFSET_2,%1*32 + 16]<4,1>:ub
> + mov (4) REG(r, nTEMP7)<1>:f r[SRC_RGBA_OFFSET_2,%1*32 + 24]<4,1>:ub
> + mov (4) REG2(r, nTEMP0, 4)<1>:f r[SRC_RGBA_OFFSET_1,%1*32 + 4]<4,1>:ub
> + mov (4) REG2(r, nTEMP1, 4)<1>:f r[SRC_RGBA_OFFSET_1,%1*32 + 12]<4,1>:ub
> + mov (4) REG2(r, nTEMP2, 4)<1>:f r[SRC_RGBA_OFFSET_1,%1*32 + 20]<4,1>:ub
> + mov (4) REG2(r, nTEMP3, 4)<1>:f r[SRC_RGBA_OFFSET_1,%1*32 + 28]<4,1>:ub
> + mov (4) REG2(r, nTEMP4, 4)<1>:f r[SRC_RGBA_OFFSET_2,%1*32 + 4]<4,1>:ub
> + mov (4) REG2(r, nTEMP5, 4)<1>:f r[SRC_RGBA_OFFSET_2,%1*32 + 12]<4,1>:ub
> + mov (4) REG2(r, nTEMP6, 4)<1>:f r[SRC_RGBA_OFFSET_2,%1*32 + 20]<4,1>:ub
> + mov (4) REG2(r, nTEMP7, 4)<1>:f r[SRC_RGBA_OFFSET_2,%1*32 + 28]<4,1>:ub // last pixel of the row: byte offset 28, mirroring nTEMP3 above
> +
> + // ###### do one row for Y
> + // ##### dp4(nTEMP16) on two pixels at a time, convert to integer (nTEMP14) and save the low bytes in ub format (nTEMP12)
> + dp4 (8) REG2(r, nTEMP16, 0)<1>:f REG2(r, nTEMP0, 0)<0;8,1>:f REG2(r, nTEMP8, 0)<0;4,1>:f
> + mov (2) REG2(r, nTEMP14, 0)<1>:ud REG2(r, nTEMP16, 0)<0;2,4>:f
> + mov (2) REG2(r, nTEMP12, 0)<1>:ub REG2(r, nTEMP14, 0)<0;2,4>:ub
> + dp4 (8) REG2(r, nTEMP16, 0)<1>:f REG2(r, nTEMP1, 0)<0;8,1>:f REG2(r, nTEMP8, 0)<0;4,1>:f
> + mov (2) REG2(r, nTEMP14, 0)<1>:ud REG2(r, nTEMP16, 0)<0;2,4>:f
> + mov (2) REG2(r, nTEMP12, 2)<1>:ub REG2(r, nTEMP14, 0)<0;2,4>:ub
> + dp4 (8) REG2(r, nTEMP16, 0)<1>:f REG2(r, nTEMP2, 0)<0;8,1>:f REG2(r, nTEMP8, 0)<0;4,1>:f
> + mov (2) REG2(r, nTEMP14, 0)<1>:ud REG2(r, nTEMP16, 0)<0;2,4>:f
> + mov (2) REG2(r, nTEMP12, 4)<1>:ub REG2(r, nTEMP14, 0)<0;2,4>:ub
> + dp4 (8) REG2(r, nTEMP16, 0)<1>:f REG2(r, nTEMP3, 0)<0;8,1>:f REG2(r, nTEMP8, 0)<0;4,1>:f
> + mov (2) REG2(r, nTEMP14, 0)<1>:ud REG2(r, nTEMP16, 0)<0;2,4>:f
> + mov (2) REG2(r, nTEMP12, 6)<1>:ub REG2(r, nTEMP14, 0)<0;2,4>:ub
> + dp4 (8) REG2(r, nTEMP16, 0)<1>:f REG2(r, nTEMP4, 0)<0;8,1>:f REG2(r, nTEMP8, 0)<0;4,1>:f
> + mov (2) REG2(r, nTEMP14, 0)<1>:ud REG2(r, nTEMP16, 0)<0;2,4>:f
> + mov (2) REG2(r, nTEMP12, 8)<1>:ub REG2(r, nTEMP14, 0)<0;2,4>:ub
> + dp4 (8) REG2(r, nTEMP16, 0)<1>:f REG2(r, nTEMP5, 0)<0;8,1>:f REG2(r, nTEMP8, 0)<0;4,1>:f
> + mov (2) REG2(r, nTEMP14, 0)<1>:ud REG2(r, nTEMP16, 0)<0;2,4>:f
> + mov (2) REG2(r, nTEMP12, 10)<1>:ub REG2(r, nTEMP14, 0)<0;2,4>:ub
> + dp4 (8) REG2(r, nTEMP16, 0)<1>:f REG2(r, nTEMP6, 0)<0;8,1>:f REG2(r, nTEMP8, 0)<0;4,1>:f
> + mov (2) REG2(r, nTEMP14, 0)<1>:ud REG2(r, nTEMP16, 0)<0;2,4>:f
> + mov (2) REG2(r, nTEMP12, 12)<1>:ub REG2(r, nTEMP14, 0)<0;2,4>:ub
> + dp4 (8) REG2(r, nTEMP16, 0)<1>:f REG2(r, nTEMP7, 0)<0;8,1>:f REG2(r, nTEMP8, 0)<0;4,1>:f
> + mov (2) REG2(r, nTEMP14, 0)<1>:ud REG2(r, nTEMP16, 0)<0;2,4>:f
> + mov (2) REG2(r, nTEMP12, 14)<1>:ub REG2(r, nTEMP14, 0)<0;2,4>:ub
> +
> + // #### write Y to the 1 row
> + mov (16) uwDEST_Y(%1)<1> REG2(r,nTEMP12, 0)<0;16,1>:ub
> +
> + // ###### do one row for U
> + // ##### dp4(nTEMP16) with the U coefficients (nTEMP8 floats 4-7), convert to signed integer (nTEMP14) and save in w format (nTEMP12)
> + dp4 (8) REG2(r, nTEMP16, 0)<1>:f REG2(r, nTEMP0, 0)<0;8,1>:f REG2(r, nTEMP8, 4)<0;4,1>:f
> + mov (2) REG2(r, nTEMP14, 0)<1>:d REG2(r, nTEMP16, 0)<0;2,4>:f
> + mov (2) REG2(r, nTEMP12, 0)<1>:w REG2(r, nTEMP14, 0)<0;2,2>:w
> + dp4 (8) REG2(r, nTEMP16, 0)<1>:f REG2(r, nTEMP1, 0)<0;8,1>:f REG2(r, nTEMP8, 4)<0;4,1>:f
> + mov (2) REG2(r, nTEMP14, 0)<1>:d REG2(r, nTEMP16, 0)<0;2,4>:f
> + mov (2) REG2(r, nTEMP12, 2)<1>:w REG2(r, nTEMP14, 0)<0;2,2>:w
> + dp4 (8) REG2(r, nTEMP16, 0)<1>:f REG2(r, nTEMP2, 0)<0;8,1>:f REG2(r, nTEMP8, 4)<0;4,1>:f
> + mov (2) REG2(r, nTEMP14, 0)<1>:d REG2(r, nTEMP16, 0)<0;2,4>:f
> + mov (2) REG2(r, nTEMP12, 4)<1>:w REG2(r, nTEMP14, 0)<0;2,2>:w
> + dp4 (8) REG2(r, nTEMP16, 0)<1>:f REG2(r, nTEMP3, 0)<0;8,1>:f REG2(r, nTEMP8, 4)<0;4,1>:f
> + mov (2) REG2(r, nTEMP14, 0)<1>:d REG2(r, nTEMP16, 0)<0;2,4>:f
> + mov (2) REG2(r, nTEMP12, 6)<1>:w REG2(r, nTEMP14, 0)<0;2,2>:w
> + dp4 (8) REG2(r, nTEMP16, 0)<1>:f REG2(r, nTEMP4, 0)<0;8,1>:f REG2(r, nTEMP8, 4)<0;4,1>:f
> + mov (2) REG2(r, nTEMP14, 0)<1>:d REG2(r, nTEMP16, 0)<0;2,4>:f
> + mov (2) REG2(r, nTEMP12, 8)<1>:w REG2(r, nTEMP14, 0)<0;2,2>:w
> + dp4 (8) REG2(r, nTEMP16, 0)<1>:f REG2(r, nTEMP5, 0)<0;8,1>:f REG2(r, nTEMP8, 4)<0;4,1>:f
> + mov (2) REG2(r, nTEMP14, 0)<1>:d REG2(r, nTEMP16, 0)<0;2,4>:f
> + mov (2) REG2(r, nTEMP12, 10)<1>:w REG2(r, nTEMP14, 0)<0;2,2>:w
> + dp4 (8) REG2(r, nTEMP16, 0)<1>:f REG2(r, nTEMP6, 0)<0;8,1>:f REG2(r, nTEMP8, 4)<0;4,1>:f
> + mov (2) REG2(r, nTEMP14, 0)<1>:d REG2(r, nTEMP16, 0)<0;2,4>:f
> + mov (2) REG2(r, nTEMP12, 12)<1>:w REG2(r, nTEMP14, 0)<0;2,2>:w
> + dp4 (8) REG2(r, nTEMP16, 0)<1>:f REG2(r, nTEMP7, 0)<0;8,1>:f REG2(r, nTEMP8, 4)<0;4,1>:f
> + mov (2) REG2(r, nTEMP14, 0)<1>:d REG2(r, nTEMP16, 0)<0;2,4>:f
> + mov (2) REG2(r, nTEMP12, 14)<1>:w REG2(r, nTEMP14, 0)<0;2,2>:w
> + add (16) REG2(r, nTEMP12, 0)<1>:w REG2(r, nTEMP12, 0)<0;16,1>:w 128:w // apply the +128 chroma offset to all 16 results
> + // #### write U to the 1 row (low byte of each word)
> + mov (16) uwDEST_U(%1)<1> REG2(r,nTEMP12, 0)<0;16,2>:ub
> +
> + // ###### do one row for V
> + // ##### dp4(nTEMP16) with the V coefficients (nTEMP10 floats 0-3), convert to signed integer (nTEMP14) and save in w format (nTEMP12)
> + dp4 (8) REG2(r, nTEMP16, 0)<1>:f REG2(r, nTEMP0, 0)<0;8,1>:f REG2(r, nTEMP10, 0)<0;4,1>:f
> + mov (2) REG2(r, nTEMP14, 0)<1>:d REG2(r, nTEMP16, 0)<0;2,4>:f
> + mov (2) REG2(r, nTEMP12, 0)<1>:w REG2(r, nTEMP14, 0)<0;2,2>:w
> + dp4 (8) REG2(r, nTEMP16, 0)<1>:f REG2(r, nTEMP1, 0)<0;8,1>:f REG2(r, nTEMP10, 0)<0;4,1>:f
> + mov (2) REG2(r, nTEMP14, 0)<1>:d REG2(r, nTEMP16, 0)<0;2,4>:f
> + mov (2) REG2(r, nTEMP12, 2)<1>:w REG2(r, nTEMP14, 0)<0;2,2>:w
> + dp4 (8) REG2(r, nTEMP16, 0)<1>:f REG2(r, nTEMP2, 0)<0;8,1>:f REG2(r, nTEMP10, 0)<0;4,1>:f
> + mov (2) REG2(r, nTEMP14, 0)<1>:d REG2(r, nTEMP16, 0)<0;2,4>:f
> + mov (2) REG2(r, nTEMP12, 4)<1>:w REG2(r, nTEMP14, 0)<0;2,2>:w
> + dp4 (8) REG2(r, nTEMP16, 0)<1>:f REG2(r, nTEMP3, 0)<0;8,1>:f REG2(r, nTEMP10, 0)<0;4,1>:f
> + mov (2) REG2(r, nTEMP14, 0)<1>:d REG2(r, nTEMP16, 0)<0;2,4>:f
> + mov (2) REG2(r, nTEMP12, 6)<1>:w REG2(r, nTEMP14, 0)<0;2,2>:w
> + dp4 (8) REG2(r, nTEMP16, 0)<1>:f REG2(r, nTEMP4, 0)<0;8,1>:f REG2(r, nTEMP10, 0)<0;4,1>:f
> + mov (2) REG2(r, nTEMP14, 0)<1>:d REG2(r, nTEMP16, 0)<0;2,4>:f
> + mov (2) REG2(r, nTEMP12, 8)<1>:w REG2(r, nTEMP14, 0)<0;2,2>:w
> + dp4 (8) REG2(r, nTEMP16, 0)<1>:f REG2(r, nTEMP5, 0)<0;8,1>:f REG2(r, nTEMP10, 0)<0;4,1>:f
> + mov (2) REG2(r, nTEMP14, 0)<1>:d REG2(r, nTEMP16, 0)<0;2,4>:f
> + mov (2) REG2(r, nTEMP12, 10)<1>:w REG2(r, nTEMP14, 0)<0;2,2>:w
> + dp4 (8) REG2(r, nTEMP16, 0)<1>:f REG2(r, nTEMP6, 0)<0;8,1>:f REG2(r, nTEMP10, 0)<0;4,1>:f
> + mov (2) REG2(r, nTEMP14, 0)<1>:d REG2(r, nTEMP16, 0)<0;2,4>:f
> + mov (2) REG2(r, nTEMP12, 12)<1>:w REG2(r, nTEMP14, 0)<0;2,2>:w
> + dp4 (8) REG2(r, nTEMP16, 0)<1>:f REG2(r, nTEMP7, 0)<0;8,1>:f REG2(r, nTEMP10, 0)<0;4,1>:f
> + mov (2) REG2(r, nTEMP14, 0)<1>:d REG2(r, nTEMP16, 0)<0;2,4>:f
> + mov (2) REG2(r, nTEMP12, 14)<1>:w REG2(r, nTEMP14, 0)<0;2,2>:w
> + add (16) REG2(r, nTEMP12, 0)<1>:w REG2(r, nTEMP12, 0)<0;16,1>:w 128:w // apply the +128 chroma offset to all 16 results
> +
> + // #### write V to the 1 row (low byte of each word)
> + mov (16) uwDEST_V(%1)<1> REG2(r,nTEMP12, 0)<0;16,2>:ub
> +}
> diff --git a/src/shaders/post_processing/gen5_6/Common/RGBX_to_YUV_Coef.asm b/src/shaders/post_processing/gen5_6/Common/RGBX_to_YUV_Coef.asm
> new file mode 100755
> index 0000000..1f58643
> --- /dev/null
> +++ b/src/shaders/post_processing/gen5_6/Common/RGBX_to_YUV_Coef.asm
> @@ -0,0 +1,43 @@
> +/*
> + * All Video Processing kernels
> + * Copyright © <2010>, Intel Corporation.
> + *
> + * This program is licensed under the terms and conditions of the
> + * Eclipse Public License (EPL), version 1.0. The full text of the EPL is at
> + * http://www.opensource.org/licenses/eclipse-1.0.php.
> + *
> + * Authors:
> + * Halley Zhao <halley.zhao at intel.com>
> + */
> +
> +// Module name: RGBX_to_YUV_Coef.asm
> +//----------------------------------------------------------------
> +
> +#ifdef FIX_POINT_CONVERSION
> + // Fixed-point coefficients: one dword of nTEMP0 per output channel.
> + // Each dword packs four coefficient bytes in source pixel order (BGRX),
> + // lowest byte first: B, G, R, X (X coefficient is 0).
> + // Y = ( ( 66 * R + 129 * G + 25 * B + 128 ) >> 8) + 16
> + mov (1) REG2(r, nTEMP0, 0):ud 0x00428119:ud // used as unsigned byte: B=0x19 (25), G=0x81 (129), R=0x42 (66)
> + // U = ( ( -38 * R - 74 * G + 112 * B + 128 ) >> 8) + 128
> + mov (1) REG2(r, nTEMP0, 1):ud 0x00DAB670:ud // used as signed byte: B=0x70 (112), G=0xB6 (-74), R=0xDA (-38)
> + // V = ( ( 112 * R - 94 * G - 18 * B + 128 ) >> 8) + 128
> + mov (1) REG2(r, nTEMP0, 2):ud 0x0070A2EE:ud // used as signed byte: B=0xEE (-18), G=0xA2 (-94), R=0x70 (112)
> +#else
> + // Floating-point coefficients, stored in source pixel order (B, G, R, A) so that
> + // one 4-element dot product per pixel yields the channel value.
> + // Y = 0.299R + 0.587G + 0.114B
> + mov (1) REG2(r, nTEMP8, 0):f 0.114f // B coef
> + mov (1) REG2(r, nTEMP8, 1):f 0.587f // G coef
> + mov (1) REG2(r, nTEMP8, 2):f 0.299f // R coef
> + mov (1) REG2(r, nTEMP8, 3):f 0.000f // A coef
> +
> + // Two candidate formulas are listed; the values loaded below are the "U" set.
> + // The +128 offset is not part of the coefficients.
> + // Cb= -0.169R - 0.331G + 0.499B + 128
> + // U = -0.147R - 0.289G + 0.436B + 128
> + mov (1) REG2(r, nTEMP8, 4):f 0.436f // B coef
> + mov (1) REG2(r, nTEMP8, 5):f -0.289f // G coef
> + mov (1) REG2(r, nTEMP8, 6):f -0.147f // R coef
> + mov (1) REG2(r, nTEMP8, 7):f 0.000f // A coef
> + // As above, the values loaded below are the "V" set, not the "Cr" set.
> + // Cr= 0.499R - 0.418G - 0.0813B+ 128
> + // V = 0.615R - 0.515G - 0.100B + 128
> + mov (1) REG2(r, nTEMP10, 0):f -0.100f // B coef
> + mov (1) REG2(r, nTEMP10, 1):f -0.515f // G coef
> + mov (1) REG2(r, nTEMP10, 2):f 0.615f // R coef
> + mov (1) REG2(r, nTEMP10, 3):f 0.000f // A coef
> +#endif
> +
> diff --git a/src/shaders/post_processing/gen5_6/Makefile.am b/src/shaders/post_processing/gen5_6/Makefile.am
> index 1cc1ecb..8658938 100755
> --- a/src/shaders/post_processing/gen5_6/Makefile.am
> +++ b/src/shaders/post_processing/gen5_6/Makefile.am
> @@ -20,6 +20,7 @@ INTEL_PP_G4B_GEN5 = \
> pl3_load_save_pa.g4b.gen5 \
> pa_load_save_nv12.g4b.gen5 \
> pa_load_save_pl3.g4b.gen5 \
> + rgbx_load_save_nv12.g4b.gen5 \
> $(NULL)
>
> INTEL_PP_G6B = \
> @@ -35,6 +36,7 @@ INTEL_PP_G6B = \
> pl3_load_save_pa.g6b \
> pa_load_save_nv12.g6b \
> pa_load_save_pl3.g6b \
> + rgbx_load_save_nv12.g6b \
> $(NULL)
>
> INTEL_PP_ASM = \
> @@ -50,6 +52,7 @@ INTEL_PP_ASM = \
> pl3_load_save_pa.asm \
> pa_load_save_nv12.asm \
> pa_load_save_pl3.asm \
> + rgbx_load_save_nv12.asm \
> $(NULL)
>
> INTEL_PP_ASM += \
> @@ -86,6 +89,10 @@ INTEL_PP_ASM += \
> Common/RGB16x8_Save_RGB16.asm \
> Common/RGB16x8_Save_Y416.asm \
> Common/RGB_Pack.asm \
> + Common/RGBX_Load_16x8.asm \
> + Common/RGBX_to_YUV_Coef.asm \
> + Common/RGBX_Save_YUV_Fix.asm \
> + Common/RGBX_Save_YUV_Float.asm \
> Common/SetupVPKernel.asm \
> Common/readSampler16x1.asm \
> Core_Kernels/AVS_SetupFirstBlock.asm \
> @@ -145,6 +152,7 @@ INTEL_PP_INC = \
> Common/RGB16x8_Save_RGB.inc \
> Common/RGB16x8_Save_RGB16.inc \
> Common/RGB16x8_Save_Y416.inc \
> + Common/RGBX_Load_16x8.inc \
> Common/common.inc \
> Common/undefall.inc \
> Core_Kernels/AVS_IEF.inc \
> diff --git a/src/shaders/post_processing/gen5_6/rgbx_load_save_nv12.asm b/src/shaders/post_processing/gen5_6/rgbx_load_save_nv12.asm
> new file mode 100755
> index 0000000..4922cc7
> --- /dev/null
> +++ b/src/shaders/post_processing/gen5_6/rgbx_load_save_nv12.asm
> @@ -0,0 +1,26 @@
> +// Module name: RGBX_LOAD_SAVE_NV12 - kernel assembled from the include files below, in this order
> +.kernel RGBX_LOAD_SAVE_NV12
> +.code
> +#define FIX_POINT_CONVERSION // selects the fixed-point coefficients and RGBX_Save_YUV_Fix.asm
> +// #define FLOAT_POINT_CONVERSION // alternative path; note the #else below picks the float save whenever FIX_POINT_CONVERSION is undefined, whether or not this is defined
> +
> +#include "SetupVPKernel.asm"
> +#include "RGBX_to_YUV_Coef.asm"
> +#include "Multiple_Loop_Head.asm"
> +#include "RGBX_Load_16x8.asm"
> +#ifdef FIX_POINT_CONVERSION
> + #include "RGBX_Save_YUV_Fix.asm"
> +#else
> + #include "RGBX_Save_YUV_Float.asm"
> +#endif
> +#include "PL16x8_PL8x4.asm"
> +#include "PL8x4_Save_NV12.asm"
> +#include "Multiple_Loop.asm"
> +
> +END_THREAD // End of Thread
> +
> +.end_code
> +
> +.end_kernel
> +
> +// end of rgbx_load_save_nv12.asm
More information about the Libva
mailing list