[Libreoffice-commits] core.git: Branch 'feature/fixes36' - 4 commits - include/vcl vcl/opengl vcl/win

Thu Nov 3 22:22:15 UTC 2016

include/vcl/opengl/OpenGLContext.hxx    |   16 ++
 vcl/opengl/areaScaleFragmentShader.glsl |  176 +++++++++++++++++++++++++-------
 vcl/opengl/gdiimpl.cxx                  |   16 ++
 vcl/opengl/salbmp.cxx                   |   40 -------
 vcl/opengl/scale.cxx                    |   10 +
 vcl/opengl/win/gdiimpl.cxx              |    5 
 vcl/win/gdi/salgdi2.cxx                 |   67 +++++++++++-
 7 files changed, 244 insertions(+), 86 deletions(-)

New commits:
commit 8b03d8e70edcd8176f61a9ac0bb2c0a3d179bfb1
Author: Tomaž Vajngerl <tomaz.vajngerl at collabora.co.uk>
Date:   Thu Nov 3 23:11:18 2016 +0100

    opengl: reduced register areaScale shader and detection for intel
    
    Some Intel drivers crash when the areaScale shader with a "large"
    array is used. This adds a "reduced register" version of the
    areaScale shader. We still use the first version of the shader for
    other drivers and switch between the two implementations with
    runtime detection.
    
    Change-Id: I1860f898c03b40a600eb1b41f7262719382a7171

diff --git a/include/vcl/opengl/OpenGLContext.hxx b/include/vcl/opengl/OpenGLContext.hxx
index 6863467..a7cc2d6 100644
--- a/include/vcl/opengl/OpenGLContext.hxx
+++ b/include/vcl/opengl/OpenGLContext.hxx
@@ -52,6 +52,15 @@ struct VCL_DLLPUBLIC GLWindow
     virtual ~GLWindow();
 };
 
+struct VCL_DLLPUBLIC OpenGLCapabilitySwitch
+{
+    bool mbLimitedShaderRegisters;
+
+    OpenGLCapabilitySwitch()
+        : mbLimitedShaderRegisters(false)
+    {}
+};
+
 class VCL_DLLPUBLIC OpenGLContext
 {
     friend class OpenGLTests;
@@ -94,6 +103,11 @@ public:
         return mpRenderState;
     }
 
+    OpenGLCapabilitySwitch& getOpenGLCapabilitySwitch()
+    {
+        return maOpenGLCapabilitySwitch;
+    }
+
     /// Is this GL context the current context ?
     virtual bool isCurrent();
     /// Is any GL context the current context ?
@@ -165,6 +179,8 @@ protected:
     OpenGLFramebuffer* mpFirstFramebuffer;
     OpenGLFramebuffer* mpLastFramebuffer;
 
+    OpenGLCapabilitySwitch maOpenGLCapabilitySwitch;
+
 private:
     struct ProgramHash
     {
diff --git a/vcl/opengl/areaScaleFragmentShader.glsl b/vcl/opengl/areaScaleFragmentShader.glsl
index c83c5e0..e161336 100644
--- a/vcl/opengl/areaScaleFragmentShader.glsl
+++ b/vcl/opengl/areaScaleFragmentShader.glsl
@@ -7,13 +7,7 @@
  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
  */
 
-#version 120
-#if __VERSION__ < 130
-int min( int a, int b ) { return a < b ? a : b; }
-float min( float a, float b ) { return a < b ? a : b; }
-#endif
-
-/* TODO Use textureOffset for newest version of GLSL */
+#version 130
 
 uniform sampler2D sampler;
 uniform int swidth;
@@ -34,23 +28,108 @@ varying vec2 mask_coord;
 uniform sampler2D mask;
 #endif
 
+vec4 getTexel(int x, int y)
+{
+    vec2 offset = vec2(x * xsrcconvert, y * ysrcconvert);
+    vec4 texel = texture2D(sampler, offset);
+#ifdef MASKED
+    texel.a = 1.0 - texture2D(mask, offset).r;
+#endif
+    return texel;
+}
+
+#ifdef USE_REDUCED_REGISTER_VARIANT
+
+void main(void)
+{
+    // Convert to pixel coordinates again.
+    int dx = int(tex_coord.s * xdestconvert);
+    int dy = int(tex_coord.t * ydestconvert);
+
+    // Compute the range of source pixels which will make up this destination pixel.
+    float fsx1 = min(dx * xscale,   float(swidth - 1));
+    float fsx2 = min(fsx1 + xscale, float(swidth - 1));
+
+    float fsy1 = min(dy * yscale,   float(sheight - 1));
+    float fsy2 = min(fsy1 + yscale, float(sheight - 1));
+
+    // To whole pixel coordinates.
+    int xstart = int(floor(fsx1));
+    int xend   = int(floor(fsx2));
+
+    int ystart = int(floor(fsy1));
+    int yend   = int(floor(fsy2));
+
+    float xlength = fsx2 - fsx1;
+    float ylength = fsy2 - fsy1;
+
+    float xContribution[3];
+    xContribution[0] = (1.0 - max(0.0, fsx1 - xstart))     / xlength;
+    xContribution[1] =  1.0 / xlength;
+    xContribution[2] = (1.0 - max(0.0, (xend + 1) - fsx2)) / xlength;
+
+    float yContribution[3];
+    yContribution[0] = (1.0 - max(0.0, fsy1 - ystart))     / ylength;
+    yContribution[1] =  1.0 / ylength;
+    yContribution[2] = (1.0 - max(0.0, (yend + 1) - fsy2)) / ylength;
+
+    vec4 sumAll = vec4(0.0, 0.0, 0.0, 0.0);
+    vec4 texel;
+    // First Y pass
+    {
+        vec4 sumX = vec4(0.0, 0.0, 0.0, 0.0);
+
+        sumX += getTexel(xstart, ystart) * xContribution[0];
+        for (int x = xstart + 1; x < xend; ++x)
+        {
+           sumX += getTexel(x, ystart) * xContribution[1];
+        }
+        sumX += getTexel(xend, ystart) * xContribution[2];
+
+        sumAll += sumX * yContribution[0];
+    }
+
+    // Middle Y Passes
+    for (int y = ystart + 1; y < yend; ++y)
+    {
+        vec4 sumX = vec4(0.0, 0.0, 0.0, 0.0);
+
+        sumX += getTexel(xstart, y) * xContribution[0];
+        for (int x = xstart + 1; x < xend; ++x)
+        {
+            sumX += getTexel(x, y) * xContribution[1];
+        }
+        sumX += getTexel(xend, y) * xContribution[2];
+
+        sumAll += sumX * yContribution[1];
+    }
+
+    // Last Y pass
+    {
+        vec4 sumX = vec4(0.0, 0.0, 0.0, 0.0);
+
+        sumX += getTexel(xstart, yend) * xContribution[0];
+        for (int x = xstart + 1; x < xend; ++x)
+        {
+            sumX += getTexel(x, yend) * xContribution[1];
+        }
+        sumX += getTexel(xend, yend) * xContribution[2];
+
+        sumAll += sumX * yContribution[2];
+    }
+
+    gl_FragColor = sumAll;
+}
+#else
 void main(void)
 {
     // Convert to pixel coordinates again.
     int dx = int( tex_coord.s * xdestconvert );
     int dy = int( tex_coord.t * ydestconvert );
 
-    // Note: These values are always the same for the same X (or Y),
-    // so they could be precalculated in C++ and passed to the shader,
-    // but GLSL has limits on the size of uniforms passed to it,
-    // so it'd need something like texture buffer objects from newer
-    // GLSL versions, and it seems the hassle is not really worth it.
-
-    // How much each column/row will contribute to the resulting pixel.
-    // assert( xscale <= 100 ); assert( yscale <= 100 );
     float xratio[ 16 + 2 ];
     float yratio[ 16 + 2 ];
-    // For finding the first and last source pixel.
+
     int xpixel[ 16 + 2 ];
     int ypixel[ 16 + 2 ];
 
@@ -147,5 +226,5 @@ void main(void)
 
     gl_FragColor = sum;
 }
-
+#endif
 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/vcl/opengl/gdiimpl.cxx b/vcl/opengl/gdiimpl.cxx
index 6c91013..34a8b51 100644
--- a/vcl/opengl/gdiimpl.cxx
+++ b/vcl/opengl/gdiimpl.cxx
@@ -1259,7 +1259,11 @@ bool scaleTexture(const rtl::Reference< OpenGLContext > &xContext,
     int nNewWidth = nWidth / ixscale;
     int nNewHeight = nHeight / iyscale;
 
-    OpenGLProgram* pProgram = xContext->UseProgram("textureVertexShader", "areaScaleFragmentShader");
+    OString sUseReducedRegisterVariantDefine;
+    if (xContext->getOpenGLCapabilitySwitch().mbLimitedShaderRegisters)
+        sUseReducedRegisterVariantDefine = OString("#define USE_REDUCED_REGISTER_VARIANT\n");
+
+    OpenGLProgram* pProgram = xContext->UseProgram("textureVertexShader", "areaScaleFragmentShader", sUseReducedRegisterVariantDefine);
     if (pProgram == nullptr)
         return false;
 
@@ -1324,6 +1328,11 @@ void OpenGLSalGraphicsImpl::DrawTransformedTexture(
     // See OpenGLSalBitmap::ImplScaleArea().
     bool areaScaling = false;
     bool fastAreaScaling = false;
+
+    OString sUseReducedRegisterVariantDefine;
+    if (mpContext->getOpenGLCapabilitySwitch().mbLimitedShaderRegisters)
+        sUseReducedRegisterVariantDefine = OString("#define USE_REDUCED_REGISTER_VARIANT\n");
+
     OUString textureFragmentShader;
     if( ixscale >= 2 && iyscale >= 2 )  // scale ratio less than 50%
     {
@@ -1388,7 +1397,7 @@ void OpenGLSalGraphicsImpl::DrawTransformedTexture(
     {
         if( !UseProgram( "transformedTextureVertexShader",
                 textureFragmentShader.isEmpty() ? "maskedTextureFragmentShader" : textureFragmentShader,
-                "#define MASKED" ) )
+                "#define MASKED\n" + sUseReducedRegisterVariantDefine))
             return;
         mpProgram->SetTexture( "mask", aInMask );
         GLfloat aMaskCoord[8];
@@ -1400,7 +1409,8 @@ void OpenGLSalGraphicsImpl::DrawTransformedTexture(
     else
     {
         if( !UseProgram( "transformedTextureVertexShader",
-                textureFragmentShader.isEmpty() ? "textureFragmentShader" : textureFragmentShader ) )
+                textureFragmentShader.isEmpty() ? "textureFragmentShader" : textureFragmentShader,
+                sUseReducedRegisterVariantDefine))
             return;
     }
 
diff --git a/vcl/opengl/scale.cxx b/vcl/opengl/scale.cxx
index 9feb933..3e2b9d4 100644
--- a/vcl/opengl/scale.cxx
+++ b/vcl/opengl/scale.cxx
@@ -231,8 +231,14 @@ bool OpenGLSalBitmap::ImplScaleArea( const rtl::Reference< OpenGLContext > &xCon
 
     // TODO Make sure the framebuffer is alright
 
+    OString sUseReducedRegisterVariantDefine;
+    if (xContext->getOpenGLCapabilitySwitch().mbLimitedShaderRegisters)
+        sUseReducedRegisterVariantDefine = OString("#define USE_REDUCED_REGISTER_VARIANT\n");
+
     OpenGLProgram* pProgram = xContext->UseProgram( "textureVertexShader",
-        fast ? OUString( "areaScaleFastFragmentShader" ) : OUString( "areaScaleFragmentShader" ));
+        fast ? OUString( "areaScaleFastFragmentShader" ) : OUString( "areaScaleFragmentShader" ),
+        sUseReducedRegisterVariantDefine);
+
     if( pProgram == nullptr )
         return false;
 
@@ -281,7 +287,7 @@ bool OpenGLSalBitmap::ImplScaleArea( const rtl::Reference< OpenGLContext > &xCon
         ixscale = 1 / rScaleX;
         iyscale = 1 / rScaleY;
 
-        pProgram = xContext->UseProgram("textureVertexShader", "areaScaleFragmentShader");
+        pProgram = xContext->UseProgram("textureVertexShader", "areaScaleFragmentShader", sUseReducedRegisterVariantDefine);
         if (pProgram == nullptr)
             return false;
 
diff --git a/vcl/opengl/win/gdiimpl.cxx b/vcl/opengl/win/gdiimpl.cxx
index 310eb14..8d1fbea 100644
--- a/vcl/opengl/win/gdiimpl.cxx
+++ b/vcl/opengl/win/gdiimpl.cxx
@@ -633,6 +633,11 @@ bool WinOpenGLContext::ImplInit()
 
     bFirstCall = false;
 
+    static OString aVendor(reinterpret_cast<const char*>(glGetString(GL_VENDOR)));
+
+    if (aVendor.equalsIgnoreAsciiCase("intel"))
+        maOpenGLCapabilitySwitch.mbLimitedShaderRegisters = true;
+
     return true;
 }
 
commit f65f9ecf334fe7ca5ffcbfd3370909da9a88a5a6
Author: Tomaž Vajngerl <tomaz.vajngerl at collabora.co.uk>
Date:   Thu Nov 3 23:05:25 2016 +0100

    opengl: change from BGRA to RGBA color arrangement on Windows
    
    BGRA is the native color arrangement on Windows; however, some Intel
    drivers have problems with large textures if they read from a
    BGRA buffer. So with this commit we switch to the RGBA color
    arrangement. This shouldn't make much of a performance difference,
    but we need to convert from RGBA to BGRA when printing.
    
    Change-Id: Ic112dc6a6c5d8b70e96041d0de15a03bbbdc406f

diff --git a/vcl/opengl/salbmp.cxx b/vcl/opengl/salbmp.cxx
index ef081ee..3af6977 100644
--- a/vcl/opengl/salbmp.cxx
+++ b/vcl/opengl/salbmp.cxx
@@ -55,28 +55,16 @@ inline bool determineTextureFormat(sal_uInt16 nBits, GLenum& nFormat, GLenum& nT
         nType = GL_UNSIGNED_BYTE;
         return true;
     case 16:
-#ifdef _WIN32
-        nFormat = GL_BGR;
-#else
         nFormat = GL_RGB;
-#endif
         nType = GL_UNSIGNED_SHORT_5_6_5;
         return true;
     case 24:
-#ifdef _WIN32
-        nFormat = GL_BGR;
-#else
         nFormat = GL_RGB;
-#endif
         nType = GL_UNSIGNED_BYTE;
         return true;
     case 32:
-#ifdef _WIN32
-        nFormat = GL_BGRA;
-#else
         nFormat = GL_RGBA;
-#endif
-        nType = GL_UNSIGNED_BYTE;
+        nType = GL_UNSIGNED_INT_8_8_8_8;
         return true;
     default:
         break;
@@ -816,16 +804,6 @@ BitmapBuffer* OpenGLSalBitmap::AcquireBuffer( BitmapAccessMode nMode )
             break;
         case 16:
         {
-#ifdef _WIN32
-            pBuffer->mnFormat = ScanlineFormat::N16BitTcLsbMask;
-            ColorMaskElement aRedMask(0x00007c00);
-            aRedMask.CalcMaskShift();
-            ColorMaskElement aGreenMask(0x000003e0);
-            aGreenMask.CalcMaskShift();
-            ColorMaskElement aBlueMask(0x0000001f);
-            aBlueMask.CalcMaskShift();
-            pBuffer->maColorMask = ColorMask(aRedMask, aGreenMask, aBlueMask);
-#else
             pBuffer->mnFormat = ScanlineFormat::N16BitTcMsbMask;
             ColorMaskElement aRedMask(0x0000f800);
             aRedMask.CalcMaskShift();
@@ -834,30 +812,15 @@ BitmapBuffer* OpenGLSalBitmap::AcquireBuffer( BitmapAccessMode nMode )
             ColorMaskElement aBlueMask(0x0000001f);
             aBlueMask.CalcMaskShift();
             pBuffer->maColorMask  = ColorMask(aRedMask, aGreenMask, aBlueMask);
-#endif
             break;
         }
         case 24:
         {
-#ifdef _WIN32
-            pBuffer->mnFormat = ScanlineFormat::N24BitTcBgr;
-#else
             pBuffer->mnFormat = ScanlineFormat::N24BitTcRgb;
-#endif
             break;
         }
         case 32:
         {
-#ifdef _WIN32
-            pBuffer->mnFormat = ScanlineFormat::N32BitTcBgra;
-            ColorMaskElement aRedMask(0x00ff0000);
-            aRedMask.CalcMaskShift();
-            ColorMaskElement aGreenMask(0x0000ff00);
-            aGreenMask.CalcMaskShift();
-            ColorMaskElement aBlueMask(0x000000ff);
-            aBlueMask.CalcMaskShift();
-            pBuffer->maColorMask = ColorMask(aRedMask, aGreenMask, aBlueMask);
-#else
             pBuffer->mnFormat = ScanlineFormat::N32BitTcRgba;
             ColorMaskElement aRedMask(0xff000000);
             aRedMask.CalcMaskShift();
@@ -866,7 +829,6 @@ BitmapBuffer* OpenGLSalBitmap::AcquireBuffer( BitmapAccessMode nMode )
             ColorMaskElement aBlueMask(0x0000ff00);
             aBlueMask.CalcMaskShift();
             pBuffer->maColorMask  = ColorMask(aRedMask, aGreenMask, aBlueMask);
-#endif
             break;
         }
     }
diff --git a/vcl/win/gdi/salgdi2.cxx b/vcl/win/gdi/salgdi2.cxx
index 8514e73..751b8c9 100644
--- a/vcl/win/gdi/salgdi2.cxx
+++ b/vcl/win/gdi/salgdi2.cxx
@@ -73,6 +73,45 @@ void WinSalGraphics::copyArea( long nDestX, long nDestY,
 namespace
 {
 
+class ColorScanlineConverter
+{
+public:
+    ScanlineFormat meSourceFormat;
+    ScanlineFormat meDestinationFormat;
+
+    int mnComponentSize;
+    int mnComponentExchangeIndex;
+
+    long mnScanlineSize;
+
+    ColorScanlineConverter(ScanlineFormat eSourceFormat, ScanlineFormat eDestinationFormat, int nComponentSize, long nScanlineSize)
+        : meSourceFormat(eSourceFormat)
+        , meDestinationFormat(eDestinationFormat)
+        , mnComponentSize(nComponentSize)
+        , mnComponentExchangeIndex(0)
+        , mnScanlineSize(nScanlineSize)
+    {
+        if (meSourceFormat == ScanlineFormat::N32BitTcAbgr ||
+            meSourceFormat == ScanlineFormat::N32BitTcArgb)
+        {
+            mnComponentExchangeIndex = 1;
+        }
+    }
+
+    void convertScanline(sal_uInt8* pSource, sal_uInt8* pDestination)
+    {
+        for (int x = 0; x < mnScanlineSize; x += mnComponentSize)
+        {
+            for (int i = 0; i < mnComponentSize; ++i)
+            {
+                pDestination[x + i] = pSource[x + i];
+            }
+            pDestination[x + mnComponentExchangeIndex + 0] = pSource[x + mnComponentExchangeIndex + 2];
+            pDestination[x + mnComponentExchangeIndex + 2] = pSource[x + mnComponentExchangeIndex + 0];
+        }
+    }
+};
+
 void convertToWinSalBitmap(SalBitmap& rSalBitmap, WinSalBitmap& rWinSalBitmap)
 {
          BitmapPalette aBitmapPalette;
@@ -90,11 +129,31 @@ void convertToWinSalBitmap(SalBitmap& rSalBitmap, WinSalBitmap& rWinSalBitmap)
         sal_uInt8* pSource(pRead->mpBits);
         sal_uInt8* pDestination(pWrite->mpBits);
 
-        for (long y = 0; y < pRead->mnHeight; y++)
+        std::unique_ptr<ColorScanlineConverter> pConverter;
+
+        if (pRead->mnFormat == ScanlineFormat::N24BitTcRgb)
+            pConverter.reset(new ColorScanlineConverter(ScanlineFormat::N24BitTcRgb, ScanlineFormat::N24BitTcBgr,
+                                                        3, pRead->mnScanlineSize));
+        else if (pRead->mnFormat == ScanlineFormat::N32BitTcRgba)
+            pConverter.reset(new ColorScanlineConverter(ScanlineFormat::N32BitTcRgba, ScanlineFormat::N32BitTcBgra,
+                                                        4, pRead->mnScanlineSize));
+        if (pConverter)
+        {
+            for (long y = 0; y < pRead->mnHeight; y++)
+            {
+                pConverter->convertScanline(pSource, pDestination);
+                pSource += pRead->mnScanlineSize;
+                pDestination += pWrite->mnScanlineSize;
+            }
+        }
+        else
         {
-            memcpy(pDestination, pSource, pRead->mnScanlineSize);
-            pSource += pRead->mnScanlineSize;
-            pDestination += pWrite->mnScanlineSize;
+            for (long y = 0; y < pRead->mnHeight; y++)
+            {
+                memcpy(pDestination, pSource, pRead->mnScanlineSize);
+                pSource += pRead->mnScanlineSize;
+                pDestination += pWrite->mnScanlineSize;
+            }
         }
         rWinSalBitmap.ReleaseBuffer(pWrite, BitmapAccessMode::Write);
 
commit dc9db7e3a4fc06d8d8587d2ec4ea94134eb3ea3c
Author: Tomaž Vajngerl <tomaz.vajngerl at collabora.co.uk>
Date:   Thu Nov 3 23:20:35 2016 +0100

    Revert "opengl: array-less area scale fragment shader"
    
    This reverts commit 20bef6e93403c2b207b055870b3985af93a74d8a.

diff --git a/vcl/opengl/areaScaleFragmentShader.glsl b/vcl/opengl/areaScaleFragmentShader.glsl
index 714cb7d..c83c5e0 100644
--- a/vcl/opengl/areaScaleFragmentShader.glsl
+++ b/vcl/opengl/areaScaleFragmentShader.glsl
@@ -13,6 +13,8 @@ int min( int a, int b ) { return a < b ? a : b; }
 float min( float a, float b ) { return a < b ? a : b; }
 #endif
 
+/* TODO Use textureOffset for newest version of GLSL */
+
 uniform sampler2D sampler;
 uniform int swidth;
 uniform int sheight;
@@ -32,76 +34,118 @@ varying vec2 mask_coord;
 uniform sampler2D mask;
 #endif
 
-float calculateContribution(float fLow, float fHigh, int value)
-{
-    float start = max(0.0, fLow - value);
-    float end   = max(0.0, (value + 1) - fHigh);
-    return (1.0 - start - end) / (fHigh - fLow);
-}
-
 void main(void)
 {
     // Convert to pixel coordinates again.
-    int dx = int(tex_coord.s * xdestconvert);
-    int dy = int(tex_coord.t * ydestconvert);
+    int dx = int( tex_coord.s * xdestconvert );
+    int dy = int( tex_coord.t * ydestconvert );
+
+    // Note: These values are always the same for the same X (or Y),
+    // so they could be precalculated in C++ and passed to the shader,
+    // but GLSL has limits on the size of uniforms passed to it,
+    // so it'd need something like texture buffer objects from newer
+    // GLSL versions, and it seems the hassle is not really worth it.
+
+    // How much each column/row will contribute to the resulting pixel.
+    // assert( xscale <= 100 ); assert( yscale <= 100 );
+    float xratio[ 16 + 2 ];
+    float yratio[ 16 + 2 ];
+    // For finding the first and last source pixel.
+    int xpixel[ 16 + 2 ];
+    int ypixel[ 16 + 2 ];
+
+    int xpos = 0;
+    int ypos = 0;
 
     // Compute the range of source pixels which will make up this destination pixel.
-    float fsx1 = min(dx * xscale,   float(swidth - 1));
-    float fsx2 = min(fsx1 + xscale, float(swidth - 1));
-
-    float fsy1 = min(dy * yscale,   float(sheight - 1));
-    float fsy2 = min(fsy1 + yscale, float(sheight - 1));
-
+    float fsx1 = dx * xscale;
+    float fsx2 = fsx1 + xscale;
     // To whole pixel coordinates.
-    int xstart = int(floor(fsx1));
-    int xend   = int(floor(fsx2));
+    int sx1 = int( ceil( fsx1 ) );
+    int sx2 = int( floor( fsx2 ) );
+    // Range checking.
+    sx2 = min( sx2, swidth - 1 );
+    sx1 = min( sx1, sx2 );
+
+    // How much one full column contributes to the resulting pixel.
+    float width = min( xscale, swidth - fsx1 );
+
+    if( sx1 - fsx1 > 0.001 )
+    {   // The first column contributes only partially.
+        xpixel[ xpos ] = sx1 - 1;
+        xratio[ xpos ] = ( sx1 - fsx1 ) / width;
+        ++xpos;
+    }
+    for( int sx = sx1; sx < sx2; ++sx )
+    {   // Columns that fully contribute to the resulting pixel.
+        xpixel[ xpos ] = sx;
+        xratio[ xpos ] = 1.0 / width;
+        ++xpos;
+    }
+    if( fsx2 - sx2 > 0.001 )
+    {   // The last column contributes only partially.
+        xpixel[ xpos ] = sx2;
+        xratio[ xpos ] = min( min( fsx2 - sx2, 1.0 ) / width, 1.0 );
+        ++xpos;
+    }
 
-    int ystart = int(floor(fsy1));
-    int yend   = int(floor(fsy2));
+    // The same for Y.
+    float fsy1 = dy * yscale;
+    float fsy2 = fsy1 + yscale;
+    int sy1 = int( ceil( fsy1 ) );
+    int sy2 = int( floor( fsy2 ) );
+    sy2 = min( sy2, sheight - 1 );
+    sy1 = min( sy1, sy2 );
 
-#ifdef ARRAY_BASED
-    int posX = 0;
-    float ratio[16];
+    float height = min( yscale, sheight - fsy1 );
 
-    for (int x = xstart; x <= xend; ++x)
+    if( sy1 - fsy1 > 0.001 )
     {
-        float contributionX = calculateContribution(fsx1, fsx2, x);
-        ratio[posX] = contributionX;
-        posX++;
+        ypixel[ ypos ] = sy1 - 1;
+        yratio[ ypos ] = ( sy1 - fsy1 ) / height;
+        ++ypos;
+    }
+    for( int sy = sy1; sy < sy2; ++sy )
+    {
+        ypixel[ ypos ] = sy;
+        yratio[ ypos ] = 1.0 / height;
+        ++ypos;
+    }
+    if( fsy2 - sy2 > 0.001 )
+    {
+        ypixel[ ypos ] = sy2;
+        yratio[ ypos ] = min( min( fsy2 - sy2, 1.0 ) / height, 1.0 );
+        ++ypos;
     }
-#endif
 
-    vec4 sumAll = vec4(0.0, 0.0, 0.0, 0.0);
+    int xstart = xpixel[ 0 ];
+    int xend = xpixel[ xpos - 1 ];
+    int ystart = ypixel[ 0 ];
+    int yend = ypixel[ ypos - 1 ];
 
-    for (int y = ystart; y <= yend; ++y)
-    {
-        vec4 sumX = vec4(0.0, 0.0, 0.0, 0.0);
+    vec4 sum = vec4( 0.0, 0.0, 0.0, 0.0 );
 
-#ifdef ARRAY_BASED
-        posX = 0;
-#endif
-        for (int x = xstart; x <= xend; ++x)
+    ypos = 0;
+    for( int y = ystart; y <= yend; ++y, ++ypos )
+    {
+        vec4 tmp = vec4( 0.0, 0.0, 0.0, 0.0 );
+        xpos = 0;
+        for( int x = xstart; x <= xend; ++x, ++xpos )
         {
-#ifdef ARRAY_BASED
-            float contributionX = ratio[posX];
-            posX++;
+            vec2 offset = vec2( x * xsrcconvert, y * ysrcconvert );
+#ifndef MASKED
+            tmp += texture2D( sampler, offset ) * xratio[ xpos ];
 #else
-            float contributionX = calculateContribution(fsx1, fsx2, x);
-#endif
-            vec2 offset = vec2(x * xsrcconvert, y * ysrcconvert);
-            vec4 texel = texture2D(sampler, offset);
-#ifdef MASKED
-            texel.a = 1.0 - texture2D(mask, offset).r;
+            vec4 texel;
+            texel = texture2D( sampler, offset );
+            texel.a = 1.0 - texture2D( mask, offset ).r;
+            tmp += texel * xratio[ xpos ];
 #endif
-            sumX += texel * contributionX;
         }
-
-        float contributionY = calculateContribution(fsy1, fsy2, y);
-
-        sumAll += sumX * contributionY;
+        sum += tmp * yratio[ ypos ];
     }
 
-    gl_FragColor = sumAll;
+    gl_FragColor = sum;
 }
 
 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
commit ee12e630856f6a9645a2563b9389e2f6fa052bdc
Author: Tomaž Vajngerl <tomaz.vajngerl at collabora.co.uk>
Date:   Thu Nov 3 23:20:16 2016 +0100

    Revert "vcl: reduce computatonal complexity of area scale shader"
    
    This reverts commit 92528f87646818918e556255801636a062ed9f0f.

diff --git a/vcl/opengl/areaScaleFragmentShader.glsl b/vcl/opengl/areaScaleFragmentShader.glsl
index cf20b89..714cb7d 100644
--- a/vcl/opengl/areaScaleFragmentShader.glsl
+++ b/vcl/opengl/areaScaleFragmentShader.glsl
@@ -32,14 +32,11 @@ varying vec2 mask_coord;
 uniform sampler2D mask;
 #endif
 
-vec4 getTexel(int x, int y)
+float calculateContribution(float fLow, float fHigh, int value)
 {
-    vec2 offset = vec2(x * xsrcconvert, y * ysrcconvert);
-    vec4 texel = texture2D(sampler, offset);
-#ifdef MASKED
-    texel.a = 1.0 - texture2D(mask, offset).r;
-#endif
-    return texel;
+    float start = max(0.0, fLow - value);
+    float end   = max(0.0, (value + 1) - fHigh);
+    return (1.0 - start - end) / (fHigh - fLow);
 }
 
 void main(void)
@@ -62,68 +59,48 @@ void main(void)
     int ystart = int(floor(fsy1));
     int yend   = int(floor(fsy2));
 
-    float xlength = fsx2 - fsx1;
-    float ylength = fsy2 - fsy1;
-
-    float xStartContribution  = (1.0 - max(0.0, fsx1 - xstart))     / xlength;
-    float xMiddleContribution =  1.0 / xlength;
-    float xEndContribution    = (1.0 - max(0.0, (xend + 1) - fsx2)) / xlength;
-
-    float yStartContribution  = (1.0 - max(0.0, fsy1 - ystart))     / ylength;
-    float yMiddleContribution =  1.0 / ylength;
-    float yEndContribution    = (1.0 - max(0.0, (yend + 1) - fsy2)) / ylength;
-
-    vec4 sumAll = vec4(0.0, 0.0, 0.0, 0.0);
-
-    vec2 offset;
-    vec4 texel;
-    vec4 sumX;
-
-    // First Y pass
-    sumX = vec4(0.0, 0.0, 0.0, 0.0);
-
-    sumX += getTexel(xstart, ystart) * xStartContribution;
+#ifdef ARRAY_BASED
+    int posX = 0;
+    float ratio[16];
 
-    for (int x = xstart + 1; x < xend; ++x)
+    for (int x = xstart; x <= xend; ++x)
     {
-       sumX += getTexel(x, ystart) * xMiddleContribution;
+        float contributionX = calculateContribution(fsx1, fsx2, x);
+        ratio[posX] = contributionX;
+        posX++;
     }
+#endif
 
-    sumX += getTexel(xend, ystart) * xEndContribution;
-
-    sumAll += sumX * yStartContribution;
+    vec4 sumAll = vec4(0.0, 0.0, 0.0, 0.0);
 
-    // Middle Y Passes
-    for (int y = ystart + 1; y < yend; ++y)
+    for (int y = ystart; y <= yend; ++y)
     {
-        sumX = vec4(0.0, 0.0, 0.0, 0.0);
+        vec4 sumX = vec4(0.0, 0.0, 0.0, 0.0);
 
-        sumX += getTexel(xstart, y) * xStartContribution;
-
-        for (int x = xstart + 1; x < xend; ++x)
+#ifdef ARRAY_BASED
+        posX = 0;
+#endif
+        for (int x = xstart; x <= xend; ++x)
         {
-            sumX += getTexel(x, y) * xMiddleContribution;
+#ifdef ARRAY_BASED
+            float contributionX = ratio[posX];
+            posX++;
+#else
+            float contributionX = calculateContribution(fsx1, fsx2, x);
+#endif
+            vec2 offset = vec2(x * xsrcconvert, y * ysrcconvert);
+            vec4 texel = texture2D(sampler, offset);
+#ifdef MASKED
+            texel.a = 1.0 - texture2D(mask, offset).r;
+#endif
+            sumX += texel * contributionX;
         }
 
-        sumX += getTexel(xend, y) * xEndContribution;
-
-        sumAll += sumX * yMiddleContribution;
-    }
-
-    // Last Y pass
-    sumX = vec4(0.0, 0.0, 0.0, 0.0);
-
-    sumX += getTexel(xstart, yend) * xStartContribution;
+        float contributionY = calculateContribution(fsy1, fsy2, y);
 
-    for (int x = xstart + 1; x < xend; ++x)
-    {
-        sumX += getTexel(x, yend) * xMiddleContribution;
+        sumAll += sumX * contributionY;
     }
 
-    sumX += getTexel(xend, yend) * xEndContribution;
-
-    sumAll += sumX * yEndContribution;
-
     gl_FragColor = sumAll;
 }