From: Nanley Chery Date: Thu, 28 Jul 2022 00:01:05 +0000 (-0700) Subject: glsl: Add compute shaders to encode DXT5/BC3 X-Git-Tag: upstream/23.3.3~13379 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=96cb3ba4245f9daf6fb94d875caa3e411546a9b9;p=platform%2Fupstream%2Fmesa.git glsl: Add compute shaders to encode DXT5/BC3 These compute shaders are from the MIT-licensed GPU compressor, Betsy. I have included copyright headers, inlined the __sharedOnlyBarrier macro definition from the "UavCrossPlatform_piece_all.glsl" header when applicable, and made the following changes to support GLES: * Conditionally disable the const keyword in the BC3 shaders * Make the params uniform in the BC4 shader uint2 * Avoid implicit data type conversions in the BC3 shaders * Use constructors for array initialization in the BC1 shader * Add precision qualifiers to the BC3 shaders * Output to an rgba16ui image for the BC1 and BC4 shaders * Set the version of the BC3 shaders to 310 es Ref: https://github.com/darksylinc/betsy/tree/cc723dcae9 Reviewed-by: Tapani Pälli Part-of: --- diff --git a/src/compiler/glsl/CrossPlatformSettings_piece_all.glsl b/src/compiler/glsl/CrossPlatformSettings_piece_all.glsl new file mode 100644 index 0000000..7ef940a --- /dev/null +++ b/src/compiler/glsl/CrossPlatformSettings_piece_all.glsl @@ -0,0 +1,98 @@ +/* + * Copyright 2020-2022 Matias N. Goldberg + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + + +#define min3( a, b, c ) min( a, min( b, c ) ) +#define max3( a, b, c ) max( a, max( b, c ) ) + +#define float2 vec2 +#define float3 vec3 +#define float4 vec4 + +#define int2 ivec2 +#define int3 ivec3 +#define int4 ivec4 + +#define uint2 uvec2 +#define uint3 uvec3 +#define uint4 uvec4 + +#define float2x2 mat2 +#define float3x3 mat3 +#define float4x4 mat4 +#define ogre_float4x3 mat3x4 + +#define ushort uint +#define ushort3 uint3 +#define ushort4 uint4 + +//Short used for read operations. It's an int in GLSL & HLSL. An ushort in Metal +#define rshort int +#define rshort2 int2 +#define rint int +//Short used for write operations. It's an int in GLSL. 
An ushort in HLSL & Metal +#define wshort2 int2 +#define wshort3 int3 + +#define toFloat3x3( x ) mat3( x ) +#define buildFloat3x3( row0, row1, row2 ) mat3( row0, row1, row2 ) + +#define mul( x, y ) ((x) * (y)) +#define saturate(x) clamp( (x), 0.0, 1.0 ) +#define lerp mix +#define rsqrt inversesqrt +#define INLINE +#define NO_INTERPOLATION_PREFIX flat +#define NO_INTERPOLATION_SUFFIX + +#define PARAMS_ARG_DECL +#define PARAMS_ARG + +#define reversebits bitfieldReverse + +#define OGRE_Sample( tex, sampler, uv ) texture( tex, uv ) +#define OGRE_SampleLevel( tex, sampler, uv, lod ) textureLod( tex, uv, lod ) +#define OGRE_SampleArray2D( tex, sampler, uv, arrayIdx ) texture( tex, vec3( uv, arrayIdx ) ) +#define OGRE_SampleArray2DLevel( tex, sampler, uv, arrayIdx, lod ) textureLod( tex, vec3( uv, arrayIdx ), lod ) +#define OGRE_SampleArrayCubeLevel( tex, sampler, uv, arrayIdx, lod ) textureLod( tex, vec4( uv, arrayIdx ), lod ) +#define OGRE_SampleGrad( tex, sampler, uv, ddx, ddy ) textureGrad( tex, uv, ddx, ddy ) +#define OGRE_SampleArray2DGrad( tex, sampler, uv, arrayIdx, ddx, ddy ) textureGrad( tex, vec3( uv, arrayIdx ), ddx, ddy ) +#define OGRE_ddx( val ) dFdx( val ) +#define OGRE_ddy( val ) dFdy( val ) +#define OGRE_Load2D( tex, iuv, lod ) texelFetch( tex, iuv, lod ) +#define OGRE_LoadArray2D( tex, iuv, arrayIdx, lod ) texelFetch( tex, ivec3( iuv, arrayIdx ), lod ) +#define OGRE_Load2DMS( tex, iuv, subsample ) texelFetch( tex, iuv, subsample ) + +#define OGRE_Load3D( tex, iuv, lod ) texelFetch( tex, ivec3( iuv ), lod ) + +#define OGRE_GatherRed( tex, sampler, uv ) textureGather( tex, uv, 0 ) +#define OGRE_GatherGreen( tex, sampler, uv ) textureGather( tex, uv, 1 ) +#define OGRE_GatherBlue( tex, sampler, uv ) textureGather( tex, uv, 2 ) + +#define bufferFetch1( buffer, idx ) texelFetch( buffer, idx ).x + +#define OGRE_SAMPLER_ARG_DECL( samplerName ) +#define OGRE_SAMPLER_ARG( samplerName ) + +#define OGRE_Texture3D_float4 sampler3D +#define OGRE_OUT_REF( declType, 
variableName ) out declType variableName +#define OGRE_INOUT_REF( declType, variableName ) inout declType variableName diff --git a/src/compiler/glsl/bc1.glsl b/src/compiler/glsl/bc1.glsl new file mode 100644 index 0000000..8e04d6a --- /dev/null +++ b/src/compiler/glsl/bc1.glsl @@ -0,0 +1,546 @@ +/* + * Copyright 2020-2022 Matias N. Goldberg + * Copyright 2022 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#version 310 es + +#if defined(GL_ES) && GL_ES == 1 + // Desktop GLSL allows the const keyword for either compile-time or + // run-time constants. GLSL ES only allows the keyword for compile-time + // constants. Since we use const on run-time constants, define it to + // nothing. 
+ #define const +#endif + +// #include "/media/matias/Datos/SyntaxHighlightingMisc.h" + +#include "CrossPlatformSettings_piece_all.glsl" + +#define FLT_MAX 340282346638528859811704183484516925440.0f + +layout( location = 0 ) uniform uint p_numRefinements; + +uniform sampler2D srcTex; + +layout( rgba16ui ) uniform restrict writeonly mediump uimage2D dstTexture; + +layout( std430, binding = 1 ) readonly restrict buffer globalBuffer +{ + float2 c_oMatch5[256]; + float2 c_oMatch6[256]; +}; + +layout( local_size_x = 8, // + local_size_y = 8, // + local_size_z = 1 ) in; + +float3 rgb565to888( float rgb565 ) +{ + float3 retVal; + retVal.x = floor( rgb565 / 2048.0f ); + retVal.y = floor( mod( rgb565, 2048.0f ) / 32.0f ); + retVal.z = floor( mod( rgb565, 32.0f ) ); + + // This is the correct 565 to 888 conversion: + // rgb = floor( rgb * ( 255.0f / float3( 31.0f, 63.0f, 31.0f ) ) + 0.5f ) + // + // However stb_dxt follows a different one: + // rb = floor( rb * ( 256 / 32 + 8 / 32 ) ); + // g = floor( g * ( 256 / 64 + 4 / 64 ) ); + // + // I'm not sure exactly why but it's possible this is how the S3TC specifies it should be decoded + // It's quite possible this is the reason: + // http://www.ludicon.com/castano/blog/2009/03/gpu-dxt-decompression/ + // + // Or maybe it's just because it's cheap to do with integer shifts. 
+ // Anyway, we follow stb_dxt's conversion just in case + // (gives almost the same result, with 1 or -1 of difference for a very few values) + // + // Perhaps when we make 888 -> 565 -> 888 it doesn't matter + // because they end up mapping to the original number + + return floor( retVal * float3( 8.25f, 4.0625f, 8.25f ) ); +} + +float rgb888to565( float3 rgbValue ) +{ + rgbValue.rb = floor( rgbValue.rb * 31.0f / 255.0f + 0.5f ); + rgbValue.g = floor( rgbValue.g * 63.0f / 255.0f + 0.5f ); + + return rgbValue.r * 2048.0f + rgbValue.g * 32.0f + rgbValue.b; +} + +// linear interpolation at 1/3 point between a and b, using desired rounding type +float3 lerp13( float3 a, float3 b ) +{ +#ifdef STB_DXT_USE_ROUNDING_BIAS + // with rounding bias + return a + floor( ( b - a ) * ( 1.0f / 3.0f ) + 0.5f ); +#else + // without rounding bias + return floor( ( 2.0f * a + b ) / 3.0f ); +#endif +} + +/// Unpacks a block of 4 colours from two 16-bit endpoints +void EvalColors( out float3 colours[4], float c0, float c1 ) +{ + colours[0] = rgb565to888( c0 ); + colours[1] = rgb565to888( c1 ); + colours[2] = lerp13( colours[0], colours[1] ); + colours[3] = lerp13( colours[1], colours[0] ); +} + +/** The color optimization function. 
(Clever code, part 1) +@param outMinEndp16 [out] + Minimum endpoint, in RGB565 +@param outMaxEndp16 [out] + Maximum endpoint, in RGB565 +*/ +void OptimizeColorsBlock( const uint srcPixelsBlock[16], out float outMinEndp16, out float outMaxEndp16 ) +{ + // determine color distribution + float3 avgColour; + float3 minColour; + float3 maxColour; + + avgColour = minColour = maxColour = unpackUnorm4x8( srcPixelsBlock[0] ).xyz; + for( int i = 1; i < 16; ++i ) + { + const float3 currColourUnorm = unpackUnorm4x8( srcPixelsBlock[i] ).xyz; + avgColour += currColourUnorm; + minColour = min( minColour, currColourUnorm ); + maxColour = max( maxColour, currColourUnorm ); + } + + avgColour = round( avgColour * 255.0f / 16.0f ); + maxColour *= 255.0f; + minColour *= 255.0f; + + // determine covariance matrix + float cov[6]; + for( int i = 0; i < 6; ++i ) + cov[i] = 0.0f; + + for( int i = 0; i < 16; ++i ) + { + const float3 currColour = unpackUnorm4x8( srcPixelsBlock[i] ).xyz * 255.0f; + float3 rgbDiff = currColour - avgColour; + + cov[0] += rgbDiff.r * rgbDiff.r; + cov[1] += rgbDiff.r * rgbDiff.g; + cov[2] += rgbDiff.r * rgbDiff.b; + cov[3] += rgbDiff.g * rgbDiff.g; + cov[4] += rgbDiff.g * rgbDiff.b; + cov[5] += rgbDiff.b * rgbDiff.b; + } + + // convert covariance matrix to float, find principal axis via power iter + for( int i = 0; i < 6; ++i ) + cov[i] /= 255.0f; + + float3 vF = maxColour - minColour; + + const int nIterPower = 4; + for( int iter = 0; iter < nIterPower; ++iter ) + { + const float r = vF.r * cov[0] + vF.g * cov[1] + vF.b * cov[2]; + const float g = vF.r * cov[1] + vF.g * cov[3] + vF.b * cov[4]; + const float b = vF.r * cov[2] + vF.g * cov[4] + vF.b * cov[5]; + + vF.r = r; + vF.g = g; + vF.b = b; + } + + float magn = max3( abs( vF.r ), abs( vF.g ), abs( vF.b ) ); + float3 v; + + if( magn < 4.0f ) + { // too small, default to luminance + v.r = 299.0f; // JPEG YCbCr luma coefs, scaled by 1000. 
+ v.g = 587.0f; + v.b = 114.0f; + } + else + { + v = trunc( vF * ( 512.0f / magn ) ); + } + + // Pick colors at extreme points + float3 minEndpoint, maxEndpoint; + float minDot = FLT_MAX; + float maxDot = -FLT_MAX; + for( int i = 0; i < 16; ++i ) + { + const float3 currColour = unpackUnorm4x8( srcPixelsBlock[i] ).xyz * 255.0f; + const float dotValue = dot( currColour, v ); + + if( dotValue < minDot ) + { + minDot = dotValue; + minEndpoint = currColour; + } + + if( dotValue > maxDot ) + { + maxDot = dotValue; + maxEndpoint = currColour; + } + } + + outMinEndp16 = rgb888to565( minEndpoint ); + outMaxEndp16 = rgb888to565( maxEndpoint ); +} + +// The color matching function +uint MatchColorsBlock( const uint srcPixelsBlock[16], float3 colour[4] ) +{ + uint mask = 0u; + float3 dir = colour[0] - colour[1]; + float stops[4]; + + for( int i = 0; i < 4; ++i ) + stops[i] = dot( colour[i], dir ); + + // think of the colors as arranged on a line; project point onto that line, then choose + // next color out of available ones. we compute the crossover points for "best color in top + // half"/"best in bottom half" and then the same inside that subinterval. + // + // relying on this 1d approximation isn't always optimal in terms of euclidean distance, + // but it's very close and a lot faster. + // http://cbloomrants.blogspot.com/2008/12/12-08-08-dxtc-summary.html + + float c0Point = trunc( ( stops[1] + stops[3] ) * 0.5f ); + float halfPoint = trunc( ( stops[3] + stops[2] ) * 0.5f ); + float c3Point = trunc( ( stops[2] + stops[0] ) * 0.5f ); + +#ifndef BC1_DITHER + // the version without dithering is straightforward + for( uint i = 16u; i-- > 0u; ) + { + const float3 currColour = unpackUnorm4x8( srcPixelsBlock[i] ).xyz * 255.0f; + + const float dotValue = dot( currColour, dir ); + mask <<= 2u; + + if( dotValue < halfPoint ) + mask |= ( ( dotValue < c0Point ) ? 1u : 3u ); + else + mask |= ( ( dotValue < c3Point ) ? 
2u : 0u ); + } +#else + // with floyd-steinberg dithering + float4 ep1 = float4( 0, 0, 0, 0 ); + float4 ep2 = float4( 0, 0, 0, 0 ); + + c0Point *= 16.0f; + halfPoint *= 16.0f; + c3Point *= 16.0f; + + for( uint y = 0u; y < 4u; ++y ) + { + float ditherDot; + uint lmask, step; + + float3 currColour; + float dotValue; + + currColour = unpackUnorm4x8( srcPixelsBlock[y * 4u + 0u] ).xyz * 255.0f; // row y, pixel 0 + dotValue = dot( currColour, dir ); + + ditherDot = ( dotValue * 16.0f ) + ( 3.0f * ep2[1] + 5.0f * ep2[0] ); + if( ditherDot < halfPoint ) + step = ( ditherDot < c0Point ) ? 1u : 3u; + else + step = ( ditherDot < c3Point ) ? 2u : 0u; + ep1[0] = dotValue - stops[step]; + lmask = step; + + currColour = unpackUnorm4x8( srcPixelsBlock[y * 4u + 1u] ).xyz * 255.0f; // row y, pixel 1 + dotValue = dot( currColour, dir ); + + ditherDot = ( dotValue * 16.0f ) + ( 7.0f * ep1[0] + 3.0f * ep2[2] + 5.0f * ep2[1] + ep2[0] ); + if( ditherDot < halfPoint ) + step = ( ditherDot < c0Point ) ? 1u : 3u; + else + step = ( ditherDot < c3Point ) ? 2u : 0u; + ep1[1] = dotValue - stops[step]; + lmask |= step << 2u; + + currColour = unpackUnorm4x8( srcPixelsBlock[y * 4u + 2u] ).xyz * 255.0f; // row y, pixel 2 + dotValue = dot( currColour, dir ); + + ditherDot = ( dotValue * 16.0f ) + ( 7.0f * ep1[1] + 3.0f * ep2[3] + 5.0f * ep2[2] + ep2[1] ); + if( ditherDot < halfPoint ) + step = ( ditherDot < c0Point ) ? 1u : 3u; + else + step = ( ditherDot < c3Point ) ? 2u : 0u; + ep1[2] = dotValue - stops[step]; + lmask |= step << 4u; + + currColour = unpackUnorm4x8( srcPixelsBlock[y * 4u + 3u] ).xyz * 255.0f; // row y, pixel 3 (was mistakenly re-reading pixel 2) + dotValue = dot( currColour, dir ); + + ditherDot = ( dotValue * 16.0f ) + ( 7.0f * ep1[2] + 5.0f * ep2[3] + ep2[2] ); + if( ditherDot < halfPoint ) + step = ( ditherDot < c0Point ) ? 1u : 3u; + else + step = ( ditherDot < c3Point ) ? 
2u : 0u; + ep1[3] = dotValue - stops[step]; + lmask |= step << 6u; + + mask |= lmask << ( y * 8u ); + { + float4 tmp = ep1; + ep1 = ep2; + ep2 = tmp; + } // swap + } +#endif + + return mask; +} + +// The refinement function. (Clever code, part 2) +// Tries to optimize colors to suit block contents better. +// (By solving a least squares system via normal equations+Cramer's rule) +bool RefineBlock( const uint srcPixelsBlock[16], uint mask, inout float inOutMinEndp16, + inout float inOutMaxEndp16 ) +{ + float newMin16, newMax16; + const float oldMin = inOutMinEndp16; + const float oldMax = inOutMaxEndp16; + + if( ( mask ^ ( mask << 2u ) ) < 4u ) // all pixels have the same index? + { + // yes, linear system would be singular; solve using optimal + // single-color match on average color + float3 rgbVal = float3( 8.0f / 255.0f, 8.0f / 255.0f, 8.0f / 255.0f ); + for( int i = 0; i < 16; ++i ) + rgbVal += unpackUnorm4x8( srcPixelsBlock[i] ).xyz; + + rgbVal = floor( rgbVal * ( 255.0f / 16.0f ) ); + + newMax16 = c_oMatch5[uint( rgbVal.r )][0] * 2048.0f + // + c_oMatch6[uint( rgbVal.g )][0] * 32.0f + // + c_oMatch5[uint( rgbVal.b )][0]; + newMin16 = c_oMatch5[uint( rgbVal.r )][1] * 2048.0f + // + c_oMatch6[uint( rgbVal.g )][1] * 32.0f + // + c_oMatch5[uint( rgbVal.b )][1]; + } + else + { + const float w1Tab[4] = float[4]( 3.0f, 0.0f, 2.0f, 1.0f ); + const float prods[4] = float[4]( 589824.0f, 2304.0f, 262402.0f, 66562.0f ); + // ^some magic to save a lot of multiplies in the accumulating loop... 
+ // (precomputed products of weights for least squares system, accumulated inside one 32-bit + // register) + + float akku = 0.0f; + uint cm = mask; + float3 at1 = float3( 0, 0, 0 ); + float3 at2 = float3( 0, 0, 0 ); + for( int i = 0; i < 16; ++i, cm >>= 2u ) + { + const float3 currColour = unpackUnorm4x8( srcPixelsBlock[i] ).xyz * 255.0f; + + const uint step = cm & 3u; + const float w1 = w1Tab[step]; + akku += prods[step]; + at1 += currColour * w1; + at2 += currColour; + } + + at2 = 3.0f * at2 - at1; + + // extract solutions and decide solvability (akku packs xx, yy, xy as 8-bit fields at bits 16, 8 and 0) + const float xx = floor( akku / 65536.0f ); + const float yy = floor( mod( akku, 65536.0f ) / 256.0f ); + const float xy = mod( akku, 256.0f ); + + float2 f_rb_g; + f_rb_g.x = 3.0f * 31.0f / 255.0f / ( xx * yy - xy * xy ); + f_rb_g.y = f_rb_g.x * 63.0f / 31.0f; + + // solve. + const float3 newMaxVal = clamp( floor( ( at1 * yy - at2 * xy ) * f_rb_g.xyx + 0.5f ), + float3( 0.0f, 0.0f, 0.0f ), float3( 31, 63, 31 ) ); + newMax16 = newMaxVal.x * 2048.0f + newMaxVal.y * 32.0f + newMaxVal.z; + + const float3 newMinVal = clamp( floor( ( at2 * xx - at1 * xy ) * f_rb_g.xyx + 0.5f ), + float3( 0.0f, 0.0f, 0.0f ), float3( 31, 63, 31 ) ); + newMin16 = newMinVal.x * 2048.0f + newMinVal.y * 32.0f + newMinVal.z; + } + + inOutMinEndp16 = newMin16; + inOutMaxEndp16 = newMax16; + + return oldMin != newMin16 || oldMax != newMax16; +} + +#ifdef BC1_DITHER +/// Quantizes 'srcValue' which is originally in 888 (full range), +/// converting it to 565 and then back to 888 (quantized) +float3 quant( float3 srcValue ) +{ + srcValue = clamp( srcValue, 0.0f, 255.0f ); + // Convert 888 -> 565 + srcValue = floor( srcValue * float3( 31.0f / 255.0f, 63.0f / 255.0f, 31.0f / 255.0f ) + 0.5f ); + // Convert 565 -> 888 back + srcValue = floor( srcValue * float3( 8.25f, 4.0625f, 8.25f ) ); + + return srcValue; +} + +void DitherBlock( const uint srcPixBlck[16], out uint dthPixBlck[16] ) +{ + float3 ep1[4] = float3[4]( float3( 0, 0, 0 ), float3( 0, 0, 0 ), 
float3( 0, 0, 0 ), float3( 0, 0, 0 ) ); + float3 ep2[4] = float3[4]( float3( 0, 0, 0 ), float3( 0, 0, 0 ), float3( 0, 0, 0 ), float3( 0, 0, 0 ) ); + + for( uint y = 0u; y < 16u; y += 4u ) + { + float3 srcPixel, dithPixel; + + srcPixel = unpackUnorm4x8( srcPixBlck[y + 0u] ).xyz * 255.0f; + dithPixel = quant( srcPixel + trunc( ( 3.0f * ep2[1] + 5.0f * ep2[0] ) * ( 1.0f / 16.0f ) ) ); + ep1[0] = srcPixel - dithPixel; + dthPixBlck[y + 0u] = packUnorm4x8( float4( dithPixel * ( 1.0f / 255.0f ), 1.0f ) ); + + srcPixel = unpackUnorm4x8( srcPixBlck[y + 1u] ).xyz * 255.0f; + dithPixel = quant( + srcPixel + trunc( ( 7.0f * ep1[0] + 3.0f * ep2[2] + 5.0f * ep2[1] + ep2[0] ) * ( 1.0f / 16.0f ) ) ); + ep1[1] = srcPixel - dithPixel; + dthPixBlck[y + 1u] = packUnorm4x8( float4( dithPixel * ( 1.0f / 255.0f ), 1.0f ) ); + + srcPixel = unpackUnorm4x8( srcPixBlck[y + 2u] ).xyz * 255.0f; + dithPixel = quant( + srcPixel + trunc( ( 7.0f * ep1[1] + 3.0f * ep2[3] + 5.0f * ep2[2] + ep2[1] ) * ( 1.0f / 16.0f ) ) ); + ep1[2] = srcPixel - dithPixel; + dthPixBlck[y + 2u] = packUnorm4x8( float4( dithPixel * ( 1.0f / 255.0f ), 1.0f ) ); + + srcPixel = unpackUnorm4x8( srcPixBlck[y + 3u] ).xyz * 255.0f; + dithPixel = quant( srcPixel + trunc( ( 7.0f * ep1[2] + 5.0f * ep2[3] + ep2[2] ) * ( 1.0f / 16.0f ) ) ); + ep1[3] = srcPixel - dithPixel; + dthPixBlck[y + 3u] = packUnorm4x8( float4( dithPixel * ( 1.0f / 255.0f ), 1.0f ) ); + + // swap( ep1, ep2 ) + for( uint i = 0u; i < 4u; ++i ) + { + float3 tmp = ep1[i]; + ep1[i] = ep2[i]; + ep2[i] = tmp; + } + } +} +#endif + +void main() +{ + uint srcPixelsBlock[16]; + + bool bAllColoursEqual = true; + + // Load the whole 4x4 block + const uint2 pixelsToLoadBase = gl_GlobalInvocationID.xy << 2u; + for( uint i = 0u; i < 16u; ++i ) + { + const uint2 pixelsToLoad = pixelsToLoadBase + uint2( i & 0x03u, i >> 2u ); + const float3 srcPixels0 = OGRE_Load2D( srcTex, int2( pixelsToLoad ), 0 ).xyz; + srcPixelsBlock[i] = packUnorm4x8( float4( srcPixels0, 1.0f ) ); + 
bAllColoursEqual = bAllColoursEqual && srcPixelsBlock[0] == srcPixelsBlock[i]; + } + + float maxEndp16, minEndp16; + uint mask = 0u; + + if( bAllColoursEqual ) + { + const uint3 rgbVal = uint3( unpackUnorm4x8( srcPixelsBlock[0] ).xyz * 255.0f ); + mask = 0xAAAAAAAAu; + maxEndp16 = + c_oMatch5[rgbVal.r][0] * 2048.0f + c_oMatch6[rgbVal.g][0] * 32.0f + c_oMatch5[rgbVal.b][0]; + minEndp16 = + c_oMatch5[rgbVal.r][1] * 2048.0f + c_oMatch6[rgbVal.g][1] * 32.0f + c_oMatch5[rgbVal.b][1]; + } + else + { +#ifdef BC1_DITHER + uint ditherPixelsBlock[16]; + // first step: compute dithered version for PCA if desired + DitherBlock( srcPixelsBlock, ditherPixelsBlock ); +#else +# define ditherPixelsBlock srcPixelsBlock +#endif + + // second step: pca+map along principal axis + OptimizeColorsBlock( ditherPixelsBlock, minEndp16, maxEndp16 ); + if( minEndp16 != maxEndp16 ) + { + float3 colours[4]; + EvalColors( colours, maxEndp16, minEndp16 ); // Note min/max are inverted + mask = MatchColorsBlock( srcPixelsBlock, colours ); + } + + // third step: refine (multiple times if requested) + bool bStopRefinement = false; + for( uint i = 0u; i < p_numRefinements && !bStopRefinement; ++i ) + { + const uint lastMask = mask; + + if( RefineBlock( ditherPixelsBlock, mask, minEndp16, maxEndp16 ) ) + { + if( minEndp16 != maxEndp16 ) + { + float3 colours[4]; + EvalColors( colours, maxEndp16, minEndp16 ); // Note min/max are inverted + mask = MatchColorsBlock( srcPixelsBlock, colours ); + } + else + { + mask = 0u; + bStopRefinement = true; + } + } + + bStopRefinement = mask == lastMask || bStopRefinement; + } + } + + // write the color block + if( maxEndp16 < minEndp16 ) + { + const float tmpValue = minEndp16; + minEndp16 = maxEndp16; + maxEndp16 = tmpValue; + mask ^= 0x55555555u; + } + + uint4 outputBytes; + outputBytes.x = uint( maxEndp16 ); + outputBytes.y = uint( minEndp16 ); + outputBytes.z = mask & 0xFFFFu; + outputBytes.w = mask >> 16u; + + uint2 dstUV = gl_GlobalInvocationID.xy; + imageStore( 
dstTexture, int2( dstUV ), outputBytes ); +} diff --git a/src/compiler/glsl/bc4.glsl b/src/compiler/glsl/bc4.glsl new file mode 100644 index 0000000..0f43ede --- /dev/null +++ b/src/compiler/glsl/bc4.glsl @@ -0,0 +1,189 @@ +/* + * Copyright 2020-2022 Matias N. Goldberg + * Copyright 2022 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#version 310 es + +#if defined(GL_ES) && GL_ES == 1 + // Desktop GLSL allows the const keyword for either compile-time or + // run-time constants. GLSL ES only allows the keyword for compile-time + // constants. Since we use const on run-time constants, define it to + // nothing. 
+ #define const +#endif + +#define __sharedOnlyBarrier memoryBarrierShared();barrier(); + +// #include "/media/matias/Datos/SyntaxHighlightingMisc.h" + +#include "CrossPlatformSettings_piece_all.glsl" + +shared float2 g_minMaxValues[4u * 4u * 4u]; +shared uint2 g_mask[4u * 4u]; + +layout( location = 0 ) uniform uint2 params; + +#define p_channelIdx params.x +#define p_useSNorm params.y + +uniform sampler2D srcTex; + +layout( rgba16ui ) uniform restrict writeonly mediump uimage2D dstTexture; + +layout( local_size_x = 4, // + local_size_y = 4, // + local_size_z = 4 ) in; + +/// Each block is 16 pixels +/// Each thread works on 4 pixels +/// Therefore each block needs 4 threads, generating 8 masks +/// At the end these 8 masks get merged into 2 and results written to output +/// +/// **Q: Why 4 pixels per thread? Why not 1 pixel per thread? Why not 2? Why not 16?** +/// +/// A: It's a sweet spot. +/// - Very short threads cannot fill expensive GPUs with enough work (dispatch bound) +/// - Lots of threads mean lots of synchronization (e.g. evaluating min/max, merging masks) +/// overhead, and also more LDS usage which reduces occupancy. +/// - Long threads (e.g. 1 thread per block) miss parallelism opportunities +void main() +{ + float minVal, maxVal; + float4 srcPixel; + + const uint blockThreadId = gl_LocalInvocationID.x; + + const uint2 pixelsToLoadBase = gl_GlobalInvocationID.yz << 2u; + + for( uint i = 0u; i < 4u; ++i ) + { + const uint2 pixelsToLoad = pixelsToLoadBase + uint2( i, blockThreadId ); + + const float4 value = OGRE_Load2D( srcTex, int2( pixelsToLoad ), 0 ).xyzw; + srcPixel[i] = p_channelIdx == 0u ? value.x : ( p_channelIdx == 1u ? 
value.y : value.w ); + srcPixel[i] *= 255.0f; + } + + minVal = min3( srcPixel.x, srcPixel.y, srcPixel.z ); + maxVal = max3( srcPixel.x, srcPixel.y, srcPixel.z ); + minVal = min( minVal, srcPixel.w ); + maxVal = max( maxVal, srcPixel.w ); + + const uint minMaxIdxBase = ( gl_LocalInvocationID.z << 4u ) + ( gl_LocalInvocationID.y << 2u ); + const uint maskIdxBase = ( gl_LocalInvocationID.z << 2u ) + gl_LocalInvocationID.y; + + g_minMaxValues[minMaxIdxBase + blockThreadId] = float2( minVal, maxVal ); + g_mask[maskIdxBase] = uint2( 0u, 0u ); + + __sharedOnlyBarrier; + + // Have all 4 threads in the block grab the min/max value by comparing what all 4 threads uploaded + for( uint i = 0u; i < 4u; ++i ) + { + minVal = min( g_minMaxValues[minMaxIdxBase + i].x, minVal ); + maxVal = max( g_minMaxValues[minMaxIdxBase + i].y, maxVal ); + } + + // determine bias and emit color indices + // given the choice of maxVal/minVal, these indices are optimal: + // http://fgiesen.wordpress.com/2009/12/15/dxt5-alpha-block-index-determination/ + float dist = maxVal - minVal; + float dist4 = dist * 4.0f; + float dist2 = dist * 2.0f; + float bias = ( dist < 8.0f ) ? ( dist - 1.0f ) : ( trunc( dist * 0.5f ) + 2.0f ); + bias -= minVal * 7.0f; + + uint mask0 = 0u, mask1 = 0u; + + for( uint i = 0u; i < 4u; ++i ) + { + float a = srcPixel[i] * 7.0f + bias; + + int ind = 0; + + // select index. this is a "linear scale" lerp factor between 0 (val=min) and 7 (val=max). + if( a >= dist4 ) + { + ind = 4; + a -= dist4; + } + + if( a >= dist2 ) + { + ind += 2; + a -= dist2; + } + + if( a >= dist ) + ind += 1; + + // turn linear scale into DXT index (0/1 are extremal pts) + ind = -ind & 7; + ind ^= ( 2 > ind ) ? 
1 : 0; + + // write index + const uint bits = 16u + ( ( blockThreadId << 2u ) + i ) * 3u; + if( bits < 32u ) + { + mask0 |= uint( ind ) << bits; + if( bits + 3u > 32u ) + { + mask1 |= uint( ind ) >> ( 32u - bits ); + } + } + else + { + mask1 |= uint( ind ) << ( bits - 32u ); + } + } + + if( mask0 != 0u ) + atomicOr( g_mask[maskIdxBase].x, mask0 ); + if( mask1 != 0u ) + atomicOr( g_mask[maskIdxBase].y, mask1 ); + + __sharedOnlyBarrier; + + if( blockThreadId == 0u ) + { + // Save data + uint4 outputBytes; + + if( p_useSNorm != 0u ) + { + outputBytes.x = + packSnorm4x8( float4( maxVal * ( 1.0f / 255.0f ) * 2.0f - 1.0f, + minVal * ( 1.0f / 255.0f ) * 2.0f - 1.0f, 0.0f, 0.0f ) ); + } + else + { + outputBytes.x = packUnorm4x8( + float4( maxVal * ( 1.0f / 255.0f ), minVal * ( 1.0f / 255.0f ), 0.0f, 0.0f ) ); + } + outputBytes.y = g_mask[maskIdxBase].x >> 16u; + outputBytes.z = g_mask[maskIdxBase].y & 0xFFFFu; + outputBytes.w = g_mask[maskIdxBase].y >> 16u; + + uint2 dstUV = gl_GlobalInvocationID.yz; + imageStore( dstTexture, int2( dstUV ), outputBytes ); + } +} diff --git a/src/compiler/glsl/etc2_rgba_stitch.glsl b/src/compiler/glsl/etc2_rgba_stitch.glsl new file mode 100644 index 0000000..d236de7 --- /dev/null +++ b/src/compiler/glsl/etc2_rgba_stitch.glsl @@ -0,0 +1,48 @@ +/* + * Copyright 2020-2022 Matias N. Goldberg + * Copyright 2022 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +// RGB and Alpha components of ETC2 RGBA are computed separately. +// This compute shader merely stitches them together to form the final result +// It's also used by RG11 driver to stitch two R11 into one RG11 + +#version 310 es + +// #include "/media/matias/Datos/SyntaxHighlightingMisc.h" + +#include "CrossPlatformSettings_piece_all.glsl" + +layout( local_size_x = 8, // + local_size_y = 8, // + local_size_z = 1 ) in; + +layout( binding = 0 ) uniform highp usampler2D srcRGB; +layout( binding = 1 ) uniform highp usampler2D srcAlpha; +layout( rgba32ui ) uniform restrict writeonly highp uimage2D dstTexture; + +void main() +{ + uint2 etcRgb = OGRE_Load2D( srcRGB, int2( gl_GlobalInvocationID.xy ), 0 ).xy; + uint2 etcAlpha = OGRE_Load2D( srcAlpha, int2( gl_GlobalInvocationID.xy ), 0 ).xy; + + imageStore( dstTexture, int2( gl_GlobalInvocationID.xy ), uint4( etcAlpha.xy, etcRgb.xy ) ); +} diff --git a/src/compiler/glsl/meson.build b/src/compiler/glsl/meson.build index d2966d7..340ae27 100644 --- a/src/compiler/glsl/meson.build +++ b/src/compiler/glsl/meson.build @@ -72,6 +72,34 @@ float64_glsl_h = custom_target( float64_glsl_file = [files('float64.glsl')] +cross_platform_settings_piece_all_h = custom_target( + 'cross_platform_settings_piece_all.h', + input : [files_xxd, 'CrossPlatformSettings_piece_all.glsl'], + output : 'cross_platform_settings_piece_all.h', + command : [prog_python, '@INPUT@', '@OUTPUT@', '-n', 'cross_platform_settings_piece_all_header'], +) + +bc1_glsl_h = 
custom_target( + 'bc1_glsl.h', + input : [files_xxd, 'bc1.glsl'], + output : 'bc1_glsl.h', + command : [prog_python, '@INPUT@', '@OUTPUT@', '-n', 'bc1_source'], +) + +bc4_glsl_h = custom_target( + 'bc4_glsl.h', + input : [files_xxd, 'bc4.glsl'], + output : 'bc4_glsl.h', + command : [prog_python, '@INPUT@', '@OUTPUT@', '-n', 'bc4_source'], +) + +etc2_rgba_stitch_glsl_h = custom_target( + 'etc2_rgba_stitch_glsl.h', + input : [files_xxd, 'etc2_rgba_stitch.glsl'], + output : 'etc2_rgba_stitch_glsl.h', + command : [prog_python, '@INPUT@', '@OUTPUT@', '-n', 'etc2_rgba_stitch_source'], +) + files_libglsl = files( 'ast.h', 'ast_array_index.cpp', @@ -214,7 +242,8 @@ libglsl = static_library( 'glsl', [files_libglsl, glsl_parser, glsl_lexer_cpp, ir_expression_operation_h, ir_expression_operation_strings_h, ir_expression_operation_constant_h, - float64_glsl_h], + float64_glsl_h, cross_platform_settings_piece_all_h, bc1_glsl_h, bc4_glsl_h, + etc2_rgba_stitch_glsl_h], c_args : [c_msvc_compat_args, no_override_init_args], cpp_args : [cpp_msvc_compat_args], gnu_symbol_visibility : 'hidden',