From fa1b481c09b14e01eca1b3db8e0854033f6dee3d Mon Sep 17 00:00:00 2001 From: James Benton Date: Thu, 13 Sep 2012 16:04:42 +0100 Subject: [PATCH] llvmpipe: Unswizzled rendering. Reviewed-by: Jose Fonseca --- src/gallium/auxiliary/gallivm/lp_bld_conv.c | 75 ++ src/gallium/auxiliary/gallivm/lp_bld_conv.h | 10 + src/gallium/auxiliary/gallivm/lp_bld_logic.c | 9 +- src/gallium/auxiliary/gallivm/lp_bld_logic.h | 3 +- src/gallium/auxiliary/gallivm/lp_bld_pack.c | 36 + src/gallium/auxiliary/gallivm/lp_bld_pack.h | 9 + src/gallium/auxiliary/gallivm/lp_bld_quad.c | 50 ++ src/gallium/auxiliary/gallivm/lp_bld_quad.h | 9 + src/gallium/auxiliary/gallivm/lp_bld_sample.c | 22 +- src/gallium/auxiliary/gallivm/lp_bld_swizzle.c | 137 ++- src/gallium/auxiliary/gallivm/lp_bld_swizzle.h | 25 +- src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c | 4 +- src/gallium/drivers/llvmpipe/lp_bld_blend.h | 5 +- src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c | 97 +- src/gallium/drivers/llvmpipe/lp_jit.c | 10 +- src/gallium/drivers/llvmpipe/lp_jit.h | 33 +- src/gallium/drivers/llvmpipe/lp_rast.c | 122 ++- src/gallium/drivers/llvmpipe/lp_rast.h | 2 +- src/gallium/drivers/llvmpipe/lp_rast_priv.h | 81 +- src/gallium/drivers/llvmpipe/lp_scene.c | 2 + src/gallium/drivers/llvmpipe/lp_scene.h | 1 + src/gallium/drivers/llvmpipe/lp_setup.c | 19 +- src/gallium/drivers/llvmpipe/lp_state_fs.c | 1100 ++++++++++++++++++++++- src/gallium/drivers/llvmpipe/lp_state_fs.h | 3 + src/gallium/drivers/llvmpipe/lp_test_blend.c | 2 +- src/gallium/drivers/llvmpipe/lp_texture.c | 67 ++ src/gallium/drivers/llvmpipe/lp_texture.h | 6 + 27 files changed, 1782 insertions(+), 157 deletions(-) diff --git a/src/gallium/auxiliary/gallivm/lp_bld_conv.c b/src/gallium/auxiliary/gallivm/lp_bld_conv.c index af942ad..cc44236 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_conv.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.c @@ -415,6 +415,81 @@ lp_build_unsigned_norm_to_float(struct gallivm_state *gallivm, /** + * Pick a suitable 
num_dsts for lp_build_conv to ensure optimal cases are used. + * + * Returns the number of dsts created from src + */ +int lp_build_conv_auto(struct gallivm_state *gallivm, + struct lp_type src_type, + struct lp_type* dst_type, + const LLVMValueRef *src, + unsigned num_srcs, + LLVMValueRef *dst) +{ + int i; + int num_dsts = num_srcs; + + if (src_type.floating == dst_type->floating && + src_type.width == dst_type->width && + src_type.length == dst_type->length && + src_type.fixed == dst_type->fixed && + src_type.norm == dst_type->norm && + src_type.sign == dst_type->sign) + return num_dsts; + + /* Special case 4x4f -> 1x16ub or 2x8f -> 1x16ub + */ + if (src_type.floating == 1 && + src_type.fixed == 0 && + src_type.sign == 1 && + src_type.norm == 0 && + src_type.width == 32 && + + dst_type->floating == 0 && + dst_type->fixed == 0 && + dst_type->sign == 0 && + dst_type->norm == 1 && + dst_type->width == 8) + { + /* Special case 4x4f --> 1x16ub */ + if (src_type.length == 4 && util_cpu_caps.has_sse2) + { + assert((num_srcs % 4) == 0); + + num_dsts = num_srcs / 4; + dst_type->length = 16; + + lp_build_conv(gallivm, src_type, *dst_type, src, num_srcs, dst, num_dsts); + return num_dsts; + } + + /* Special case 2x8f --> 1x16ub */ + if (src_type.length == 8 && util_cpu_caps.has_avx) + { + assert((num_srcs % 2) == 0); + + num_dsts = num_srcs / 2; + dst_type->length = 16; + + lp_build_conv(gallivm, src_type, *dst_type, src, num_srcs, dst, num_dsts); + return num_dsts; + } + } + + /* lp_build_resize does not support M:N */ + if (src_type.width == dst_type->width) { + lp_build_conv(gallivm, src_type, *dst_type, src, num_srcs, dst, num_dsts); + } else { + for (i = 0; i < num_srcs; ++i) { + lp_build_conv(gallivm, src_type, *dst_type, &src[i], 1, &dst[i], 1); + } + } + + return num_dsts; +} + + +/** * Generic type conversion. 
* * TODO: Take a precision argument, or even better, add a new precision member diff --git a/src/gallium/auxiliary/gallivm/lp_bld_conv.h b/src/gallium/auxiliary/gallivm/lp_bld_conv.h index ef22105..42a1113 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_conv.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.h @@ -70,6 +70,16 @@ lp_build_conv(struct gallivm_state *gallivm, const LLVMValueRef *srcs, unsigned num_srcs, LLVMValueRef *dsts, unsigned num_dsts); + +int +lp_build_conv_auto(struct gallivm_state *gallivm, + struct lp_type src_type, + struct lp_type* dst_type, + const LLVMValueRef *src, + unsigned num_srcs, + LLVMValueRef *dst); + + void lp_build_conv_mask(struct gallivm_state *gallivm, struct lp_type src_type, diff --git a/src/gallium/auxiliary/gallivm/lp_bld_logic.c b/src/gallium/auxiliary/gallivm/lp_bld_logic.c index 8a77a43..f56b61b 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_logic.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_logic.c @@ -560,7 +560,8 @@ LLVMValueRef lp_build_select_aos(struct lp_build_context *bld, unsigned mask, LLVMValueRef a, - LLVMValueRef b) + LLVMValueRef b, + unsigned num_channels) { LLVMBuilderRef builder = bld->gallivm->builder; const struct lp_type type = bld->type; @@ -594,8 +595,8 @@ lp_build_select_aos(struct lp_build_context *bld, LLVMTypeRef elem_type = LLVMInt32TypeInContext(bld->gallivm->context); LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH]; - for(j = 0; j < n; j += 4) - for(i = 0; i < 4; ++i) + for(j = 0; j < n; j += num_channels) + for(i = 0; i < num_channels; ++i) shuffles[j + i] = LLVMConstInt(elem_type, (mask & (1 << i) ? 
0 : n) + j + i, 0); @@ -603,7 +604,7 @@ lp_build_select_aos(struct lp_build_context *bld, return LLVMBuildShuffleVector(builder, a, b, LLVMConstVector(shuffles, n), ""); } else { - LLVMValueRef mask_vec = lp_build_const_mask_aos(bld->gallivm, type, mask, 4); + LLVMValueRef mask_vec = lp_build_const_mask_aos(bld->gallivm, type, mask, num_channels); return lp_build_select(bld, mask_vec, a, b); } } diff --git a/src/gallium/auxiliary/gallivm/lp_bld_logic.h b/src/gallium/auxiliary/gallivm/lp_bld_logic.h index 64c0a1f..f530424 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_logic.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_logic.h @@ -79,7 +79,8 @@ LLVMValueRef lp_build_select_aos(struct lp_build_context *bld, unsigned mask, LLVMValueRef a, - LLVMValueRef b); + LLVMValueRef b, + unsigned num_channels); LLVMValueRef diff --git a/src/gallium/auxiliary/gallivm/lp_bld_pack.c b/src/gallium/auxiliary/gallivm/lp_bld_pack.c index e57d414..b467d56 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_pack.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_pack.c @@ -211,6 +211,42 @@ lp_build_concat(struct gallivm_state *gallivm, return tmp[0]; } + +/** + * Combines vectors to reduce from num_srcs to num_dsts. + * Returns the number of src vectors concatenated in a single dst. + * + * num_srcs must be exactly divisible by num_dsts. + * + * e.g. For num_srcs = 4 and src = [x, y, z, w] + * num_dsts = 1 dst = [xyzw] return = 4 + * num_dsts = 2 dst = [xy, zw] return = 2 + */ +int +lp_build_concat_n(struct gallivm_state *gallivm, + struct lp_type src_type, + LLVMValueRef *src, + unsigned num_srcs, + LLVMValueRef *dst, + unsigned num_dsts) +{ + int size = num_srcs / num_dsts; + int i; + + assert(num_srcs >= num_dsts); + assert((num_srcs % size) == 0); + + if (num_srcs == num_dsts) + return 1; + + for (i = 0; i < num_dsts; ++i) { + dst[i] = lp_build_concat(gallivm, &src[i * size], src_type, size); + } + + return size; +} + + /** * Interleave vector elements. 
* diff --git a/src/gallium/auxiliary/gallivm/lp_bld_pack.h b/src/gallium/auxiliary/gallivm/lp_bld_pack.h index f734c60..7cede35 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_pack.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_pack.h @@ -87,6 +87,15 @@ lp_build_concat(struct gallivm_state *gallivm, struct lp_type src_type, unsigned num_vectors); +int +lp_build_concat_n(struct gallivm_state *gallivm, + struct lp_type src_type, + LLVMValueRef *src, + unsigned num_srcs, + LLVMValueRef *dst, + unsigned num_dsts); + + LLVMValueRef lp_build_packs2(struct gallivm_state *gallivm, struct lp_type src_type, diff --git a/src/gallium/auxiliary/gallivm/lp_bld_quad.c b/src/gallium/auxiliary/gallivm/lp_bld_quad.c index c7c58ed..8a0efed 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_quad.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_quad.c @@ -31,6 +31,7 @@ #include "lp_bld_const.h" #include "lp_bld_swizzle.h" #include "lp_bld_quad.h" +#include "lp_bld_pack.h" static const unsigned char @@ -156,3 +157,52 @@ lp_build_packed_ddx_ddy_twocoord(struct lp_build_context *bld, return LLVMBuildSub(builder, vec2, vec1, "ddxddyddxddy"); } + +/** + * Twiddle from quad format to row format + * + * src0 src1 + * ######### ######### ################# + * # 0 | 1 # # 4 | 5 # # 0 | 1 | 4 | 5 # src0 + * #---+---# #---+---# -> ################# + * # 2 | 3 # # 6 | 7 # # 2 | 3 | 6 | 7 # src1 + * ######### ######### ################# + * + */ +void +lp_bld_quad_twiddle(struct gallivm_state *gallivm, + struct lp_type lp_dst_type, + const LLVMValueRef* src, + unsigned src_count, + LLVMValueRef* dst) +{ + LLVMBuilderRef builder = gallivm->builder; + LLVMTypeRef dst_type_ref; + LLVMTypeRef type2_ref; + struct lp_type type2; + unsigned i; + + assert((src_count % 2) == 0); + + /* Create a type with only 2 elements */ + type2 = lp_dst_type; + type2.width = (lp_dst_type.width * lp_dst_type.length) / 2; + type2.length = 2; + type2.floating = 0; + + type2_ref = lp_build_vec_type(gallivm, type2); + 
dst_type_ref = lp_build_vec_type(gallivm, lp_dst_type); + + for (i = 0; i < src_count; i += 2) { + LLVMValueRef src0, src1; + + src0 = LLVMBuildBitCast(builder, src[i + 0], type2_ref, ""); + src1 = LLVMBuildBitCast(builder, src[i + 1], type2_ref, ""); + + dst[i + 0] = lp_build_interleave2(gallivm, type2, src0, src1, 0); + dst[i + 1] = lp_build_interleave2(gallivm, type2, src0, src1, 1); + + dst[i + 0] = LLVMBuildBitCast(builder, dst[i + 0], dst_type_ref, ""); + dst[i + 1] = LLVMBuildBitCast(builder, dst[i + 1], dst_type_ref, ""); + } +} diff --git a/src/gallium/auxiliary/gallivm/lp_bld_quad.h b/src/gallium/auxiliary/gallivm/lp_bld_quad.h index be6a1ef..e41f80e 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_quad.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_quad.h @@ -88,5 +88,14 @@ LLVMValueRef lp_build_packed_ddx_ddy_onecoord(struct lp_build_context *bld, LLVMValueRef a); +/* + * Twiddle from quad format to row format + */ +void +lp_bld_quad_twiddle(struct gallivm_state *gallivm, + struct lp_type lp_dst_type, + const LLVMValueRef* src, + unsigned src_count, + LLVMValueRef* dst); #endif /* LP_BLD_QUAD_H_ */ diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.c b/src/gallium/auxiliary/gallivm/lp_bld_sample.c index 37490e4..8ea5f5e 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.c @@ -772,7 +772,7 @@ lp_build_get_mip_offsets(struct lp_build_sample_context *bld, offset1 = LLVMBuildLoad(builder, offset1, ""); offsets = LLVMBuildInsertElement(builder, offsets, offset1, indexo, ""); } - offsets = lp_build_swizzle_scalar_aos(&bld->int_coord_bld, offsets, 0); + offsets = lp_build_swizzle_scalar_aos(&bld->int_coord_bld, offsets, 0, 4); } else { unsigned i; @@ -849,7 +849,7 @@ lp_build_get_level_stride_vec(struct lp_build_sample_context *bld, stride1 = LLVMBuildLoad(builder, stride1, ""); stride = LLVMBuildInsertElement(builder, stride, stride1, indexo, ""); } - stride = 
lp_build_swizzle_scalar_aos(&bld->int_coord_bld, stride, 0); + stride = lp_build_swizzle_scalar_aos(&bld->int_coord_bld, stride, 0, 4); } else { LLVMValueRef stride1; @@ -1045,11 +1045,11 @@ lp_build_extract_image_sizes(struct lp_build_sample_context *bld, *out_width = size; } else if (bld->num_lods == num_quads) { - *out_width = lp_build_swizzle_scalar_aos(size_bld, size, 0); + *out_width = lp_build_swizzle_scalar_aos(size_bld, size, 0, 4); if (dims >= 2) { - *out_height = lp_build_swizzle_scalar_aos(size_bld, size, 1); + *out_height = lp_build_swizzle_scalar_aos(size_bld, size, 1, 4); if (dims == 3) { - *out_depth = lp_build_swizzle_scalar_aos(size_bld, size, 2); + *out_depth = lp_build_swizzle_scalar_aos(size_bld, size, 2, 4); } } } @@ -1246,9 +1246,9 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld, signrxyz = LLVMBuildBitCast(builder, rxyz, lp_build_vec_type(gallivm, intctype), ""); signrxyz = LLVMBuildAnd(builder, signrxyz, signmask, ""); - arxs = lp_build_swizzle_scalar_aos(coord_bld, arxyz, 0); - arys = lp_build_swizzle_scalar_aos(coord_bld, arxyz, 1); - arzs = lp_build_swizzle_scalar_aos(coord_bld, arxyz, 2); + arxs = lp_build_swizzle_scalar_aos(coord_bld, arxyz, 0, 4); + arys = lp_build_swizzle_scalar_aos(coord_bld, arxyz, 1, 4); + arzs = lp_build_swizzle_scalar_aos(coord_bld, arxyz, 2, 4); /* * select x if x >= y else select y @@ -1267,15 +1267,15 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld, * snewz = signrz * rx; * tnewz = -ry; */ - signrxs = lp_build_swizzle_scalar_aos(cint_bld, signrxyz, 0); + signrxs = lp_build_swizzle_scalar_aos(cint_bld, signrxyz, 0, 4); snewx = LLVMBuildXor(builder, signrxs, rzneg, ""); tnewx = ryneg; - signrys = lp_build_swizzle_scalar_aos(cint_bld, signrxyz, 1); + signrys = lp_build_swizzle_scalar_aos(cint_bld, signrxyz, 1, 4); snewy = rx; tnewy = LLVMBuildXor(builder, signrys, rz, ""); - signrzs = lp_build_swizzle_scalar_aos(cint_bld, signrxyz, 2); + signrzs = lp_build_swizzle_scalar_aos(cint_bld, 
signrxyz, 2, 4); snewz = LLVMBuildXor(builder, signrzs, rx, ""); tnewz = ryneg; diff --git a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c index 4ae4f37..377884a 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c @@ -159,21 +159,24 @@ lp_build_extract_broadcast(struct gallivm_state *gallivm, /** - * Swizzle one channel into all other three channels. + * Swizzle one channel into other channels. */ LLVMValueRef lp_build_swizzle_scalar_aos(struct lp_build_context *bld, LLVMValueRef a, - unsigned channel) + unsigned channel, + unsigned num_channels) { LLVMBuilderRef builder = bld->gallivm->builder; const struct lp_type type = bld->type; const unsigned n = type.length; unsigned i, j; - if(a == bld->undef || a == bld->zero || a == bld->one) + if(a == bld->undef || a == bld->zero || a == bld->one || num_channels == 1) return a; + assert(num_channels == 2 || num_channels == 4); + /* XXX: SSE3 has PSHUFB which should be better than bitmasks, but forcing * using shuffles here actually causes worst results. More investigation is * needed. */ @@ -184,12 +187,55 @@ lp_build_swizzle_scalar_aos(struct lp_build_context *bld, LLVMTypeRef elem_type = LLVMInt32TypeInContext(bld->gallivm->context); LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH]; - for(j = 0; j < n; j += 4) - for(i = 0; i < 4; ++i) + for(j = 0; j < n; j += num_channels) + for(i = 0; i < num_channels; ++i) shuffles[j + i] = LLVMConstInt(elem_type, j + channel, 0); return LLVMBuildShuffleVector(builder, a, bld->undef, LLVMConstVector(shuffles, n), ""); } + else if (num_channels == 2) { + /* + * Bit mask and shifts + * + * XY XY .... XY <= input + * 0Y 0Y .... 0Y + * YY YY .... YY + * YY YY .... 
YY <= output + */ + struct lp_type type2; + LLVMValueRef tmp = NULL; + int shift; + + a = LLVMBuildAnd(builder, a, + lp_build_const_mask_aos(bld->gallivm, + type, 1 << channel, num_channels), ""); + + type2 = type; + type2.floating = FALSE; + type2.width *= 2; + type2.length /= 2; + + a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, type2), ""); + +#ifdef PIPE_ARCH_LITTLE_ENDIAN + shift = channel == 0 ? 1 : -1; +#else + shift = channel == 0 ? -1 : 1; +#endif + + if (shift > 0) { + tmp = LLVMBuildShl(builder, a, lp_build_const_int_vec(bld->gallivm, type2, shift * type.width), ""); + } else if (shift < 0) { + tmp = LLVMBuildLShr(builder, a, lp_build_const_int_vec(bld->gallivm, type2, -shift * type.width), ""); + } + + assert(tmp); + if (tmp) { + a = LLVMBuildOr(builder, a, tmp, ""); + } + + return LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, type), ""); + } else { /* * Bit mask and recursive shifts @@ -247,6 +293,45 @@ lp_build_swizzle_scalar_aos(struct lp_build_context *bld, } +/** + * Swizzle a vector consisting of an array of XYZW structs. + * + * This fills a vector of dst_len length with the swizzled channels from src. + * + * e.g. 
with swizzles = { 2, 1, 0 } and dst_len = 8 results in + * RGBA RGBA = BGR BGR BG + * + * @param swizzles the swizzle array + * @param num_swizzles the number of elements in swizzles + * @param dst_len the length of the result + */ +LLVMValueRef +lp_build_swizzle_aos_n(struct gallivm_state* gallivm, + LLVMValueRef src, + const unsigned char* swizzles, + unsigned num_swizzles, + unsigned dst_len) +{ + LLVMBuilderRef builder = gallivm->builder; + LLVMValueRef shuffles[LP_MAX_VECTOR_WIDTH]; + unsigned i; + + assert(dst_len < LP_MAX_VECTOR_WIDTH); + + for (i = 0; i < dst_len; ++i) { + int swizzle = swizzles[i % num_swizzles]; + + if (swizzle == LP_BLD_SWIZZLE_DONTCARE) { + shuffles[i] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context)); + } else { + shuffles[i] = lp_build_const_int32(gallivm, swizzle); + } + } + + return LLVMBuildShuffleVector(builder, src, LLVMGetUndef(LLVMTypeOf(src)), LLVMConstVector(shuffles, dst_len), ""); +} + + LLVMValueRef lp_build_swizzle_aos(struct lp_build_context *bld, LLVMValueRef a, @@ -272,7 +357,7 @@ lp_build_swizzle_aos(struct lp_build_context *bld, case PIPE_SWIZZLE_GREEN: case PIPE_SWIZZLE_BLUE: case PIPE_SWIZZLE_ALPHA: - return lp_build_swizzle_scalar_aos(bld, a, swizzles[0]); + return lp_build_swizzle_scalar_aos(bld, a, swizzles[0], 4); case PIPE_SWIZZLE_ZERO: return bld->zero; case PIPE_SWIZZLE_ONE: @@ -367,7 +452,7 @@ lp_build_swizzle_aos(struct lp_build_context *bld, cond |= 1 << chan; } } - res = lp_build_select_aos(bld, cond, bld->one, bld->zero); + res = lp_build_select_aos(bld, cond, bld->one, bld->zero, 4); /* * Build a type where each element is an integer that cover the four @@ -554,6 +639,44 @@ lp_build_transpose_aos(struct gallivm_state *gallivm, /** + * Transpose from AOS <-> SOA for num_srcs + */ +void +lp_build_transpose_aos_n(struct gallivm_state *gallivm, + struct lp_type type, + const LLVMValueRef* src, + unsigned num_srcs, + LLVMValueRef* dst) +{ + switch (num_srcs) { + case 1: + dst[0] = src[0]; + 
break; + + case 2: + { + /* Note: we must use a temporary in case src == dst */ + LLVMValueRef lo, hi; + + lo = lp_build_interleave2_half(gallivm, type, src[0], src[1], 0); + hi = lp_build_interleave2_half(gallivm, type, src[0], src[1], 1); + + dst[0] = lo; + dst[1] = hi; + break; + } + + case 4: + lp_build_transpose_aos(gallivm, type, src, dst); + break; + + default: + assert(0); + }; +} + + +/** * Pack n-th element of aos values, * pad out to destination size. * i.e. x1 y1 _ _ x2 y2 _ _ will become x1 x2 _ _ diff --git a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.h b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.h index c49d916..91ecd34 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.h @@ -67,13 +67,14 @@ lp_build_extract_broadcast(struct gallivm_state *gallivm, /** - * Broadcast one channel of a vector composed of arrays of XYZW structures into - * all four channel. + * Broadcast one channel of a vector composed of arrays of XYZ.. structures into + * all channels XXX... 
*/ LLVMValueRef lp_build_swizzle_scalar_aos(struct lp_build_context *bld, - LLVMValueRef a, - unsigned channel); + LLVMValueRef a, + unsigned channel, + unsigned num_channels); /** @@ -88,6 +89,14 @@ lp_build_swizzle_aos(struct lp_build_context *bld, LLVMValueRef +lp_build_swizzle_aos_n(struct gallivm_state* gallivm, + LLVMValueRef src, + const unsigned char* swizzles, + unsigned num_swizzles, + unsigned dst_len); + + +LLVMValueRef lp_build_swizzle_soa_channel(struct lp_build_context *bld, const LLVMValueRef *unswizzled, unsigned swizzle); @@ -113,6 +122,14 @@ lp_build_transpose_aos(struct gallivm_state *gallivm, LLVMValueRef dst[4]); +void +lp_build_transpose_aos_n(struct gallivm_state *gallivm, + struct lp_type type, + const LLVMValueRef* src, + unsigned num_srcs, + LLVMValueRef* dst); + + LLVMValueRef lp_build_pack_aos_scalars(struct gallivm_state *gallivm, struct lp_type src_type, diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c index 44f684a..dbd9ccb 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c @@ -94,7 +94,7 @@ swizzle_scalar_aos(struct lp_build_tgsi_aos_context *bld, unsigned chan) { chan = bld->swizzles[chan]; - return lp_build_swizzle_scalar_aos(&bld->bld_base.base, a, chan); + return lp_build_swizzle_scalar_aos(&bld->bld_base.base, a, chan, 4); } @@ -623,7 +623,7 @@ lp_emit_instruction_aos( case TGSI_OPCODE_EX2: src0 = lp_build_emit_fetch(&bld->bld_base, inst, 0, LP_CHAN_ALL); - tmp0 = lp_build_swizzle_scalar_aos(&bld->bld_base.base, src0, TGSI_SWIZZLE_X); + tmp0 = lp_build_swizzle_scalar_aos(&bld->bld_base.base, src0, TGSI_SWIZZLE_X, TGSI_NUM_CHANNELS); dst0 = lp_build_exp2(&bld->bld_base.base, tmp0); break; diff --git a/src/gallium/drivers/llvmpipe/lp_bld_blend.h b/src/gallium/drivers/llvmpipe/lp_bld_blend.h index 68e55ac..75e06d1 100644 --- a/src/gallium/drivers/llvmpipe/lp_bld_blend.h +++ 
b/src/gallium/drivers/llvmpipe/lp_bld_blend.h @@ -60,10 +60,13 @@ lp_build_blend_aos(struct gallivm_state *gallivm, struct lp_type type, unsigned rt, LLVMValueRef src, + LLVMValueRef src_alpha, LLVMValueRef dst, LLVMValueRef mask, LLVMValueRef const_, - const unsigned char swizzle[4]); + LLVMValueRef const_alpha, + const unsigned char swizzle[4], + int nr_channels); void diff --git a/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c b/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c index c615c2d..8be0b97 100644 --- a/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c +++ b/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c @@ -66,14 +66,18 @@ struct lp_build_blend_aos_context { struct lp_build_context base; - + LLVMValueRef src; + LLVMValueRef src_alpha; LLVMValueRef dst; LLVMValueRef const_; + LLVMValueRef const_alpha; LLVMValueRef inv_src; + LLVMValueRef inv_src_alpha; LLVMValueRef inv_dst; LLVMValueRef inv_const; + LLVMValueRef inv_const_alpha; LLVMValueRef saturate; LLVMValueRef rgb_src_factor; @@ -88,14 +92,18 @@ lp_build_blend_factor_unswizzled(struct lp_build_blend_aos_context *bld, unsigned factor, boolean alpha) { + LLVMValueRef src_alpha = bld->src_alpha ? bld->src_alpha : bld->src; + LLVMValueRef const_alpha = bld->const_alpha ? 
bld->const_alpha : bld->const_; + switch (factor) { case PIPE_BLENDFACTOR_ZERO: return bld->base.zero; case PIPE_BLENDFACTOR_ONE: return bld->base.one; case PIPE_BLENDFACTOR_SRC_COLOR: - case PIPE_BLENDFACTOR_SRC_ALPHA: return bld->src; + case PIPE_BLENDFACTOR_SRC_ALPHA: + return src_alpha; case PIPE_BLENDFACTOR_DST_COLOR: case PIPE_BLENDFACTOR_DST_ALPHA: return bld->dst; @@ -106,32 +114,39 @@ lp_build_blend_factor_unswizzled(struct lp_build_blend_aos_context *bld, if(!bld->inv_dst) bld->inv_dst = lp_build_comp(&bld->base, bld->dst); if(!bld->saturate) - bld->saturate = lp_build_min(&bld->base, bld->src, bld->inv_dst); + bld->saturate = lp_build_min(&bld->base, src_alpha, bld->inv_dst); return bld->saturate; } case PIPE_BLENDFACTOR_CONST_COLOR: - case PIPE_BLENDFACTOR_CONST_ALPHA: return bld->const_; + case PIPE_BLENDFACTOR_CONST_ALPHA: + return const_alpha; case PIPE_BLENDFACTOR_SRC1_COLOR: case PIPE_BLENDFACTOR_SRC1_ALPHA: /* TODO */ assert(0); return bld->base.zero; case PIPE_BLENDFACTOR_INV_SRC_COLOR: - case PIPE_BLENDFACTOR_INV_SRC_ALPHA: if(!bld->inv_src) bld->inv_src = lp_build_comp(&bld->base, bld->src); return bld->inv_src; + case PIPE_BLENDFACTOR_INV_SRC_ALPHA: + if(!bld->inv_src_alpha) + bld->inv_src_alpha = lp_build_comp(&bld->base, src_alpha); + return bld->inv_src_alpha; case PIPE_BLENDFACTOR_INV_DST_COLOR: case PIPE_BLENDFACTOR_INV_DST_ALPHA: if(!bld->inv_dst) bld->inv_dst = lp_build_comp(&bld->base, bld->dst); return bld->inv_dst; case PIPE_BLENDFACTOR_INV_CONST_COLOR: - case PIPE_BLENDFACTOR_INV_CONST_ALPHA: if(!bld->inv_const) bld->inv_const = lp_build_comp(&bld->base, bld->const_); return bld->inv_const; + case PIPE_BLENDFACTOR_INV_CONST_ALPHA: + if(!bld->inv_const_alpha) + bld->inv_const_alpha = lp_build_comp(&bld->base, const_alpha); + return bld->inv_const_alpha; case PIPE_BLENDFACTOR_INV_SRC1_COLOR: case PIPE_BLENDFACTOR_INV_SRC1_ALPHA: /* TODO */ @@ -190,7 +205,8 @@ lp_build_blend_swizzle(struct lp_build_blend_aos_context *bld, LLVMValueRef 
rgb, LLVMValueRef alpha, enum lp_build_blend_swizzle rgb_swizzle, - unsigned alpha_swizzle) + unsigned alpha_swizzle, + unsigned num_channels) { LLVMValueRef swizzled_rgb; @@ -199,7 +215,7 @@ lp_build_blend_swizzle(struct lp_build_blend_aos_context *bld, swizzled_rgb = rgb; break; case LP_BUILD_BLEND_SWIZZLE_AAAA: - swizzled_rgb = lp_build_swizzle_scalar_aos(&bld->base, rgb, alpha_swizzle); + swizzled_rgb = lp_build_swizzle_scalar_aos(&bld->base, rgb, alpha_swizzle, num_channels); break; default: assert(0); @@ -208,13 +224,13 @@ lp_build_blend_swizzle(struct lp_build_blend_aos_context *bld, if (rgb != alpha) { swizzled_rgb = lp_build_select_aos(&bld->base, 1 << alpha_swizzle, - alpha, swizzled_rgb); + alpha, swizzled_rgb, + num_channels); } return swizzled_rgb; } - /** * @sa http://www.opengl.org/sdk/docs/man/xhtml/glBlendFuncSeparate.xml */ @@ -222,17 +238,22 @@ static LLVMValueRef lp_build_blend_factor(struct lp_build_blend_aos_context *bld, unsigned rgb_factor, unsigned alpha_factor, - unsigned alpha_swizzle) + unsigned alpha_swizzle, + unsigned num_channels) { LLVMValueRef rgb_factor_, alpha_factor_; enum lp_build_blend_swizzle rgb_swizzle; + if (alpha_swizzle == 0) { + return lp_build_blend_factor_unswizzled(bld, alpha_factor, TRUE); + } + rgb_factor_ = lp_build_blend_factor_unswizzled(bld, rgb_factor, FALSE); if (alpha_swizzle != UTIL_FORMAT_SWIZZLE_NONE) { rgb_swizzle = lp_build_blend_factor_swizzle(rgb_factor); alpha_factor_ = lp_build_blend_factor_unswizzled(bld, alpha_factor, TRUE); - return lp_build_blend_swizzle(bld, rgb_factor_, alpha_factor_, rgb_swizzle, alpha_swizzle); + return lp_build_blend_swizzle(bld, rgb_factor_, alpha_factor_, rgb_swizzle, alpha_swizzle, num_channels); } else { return rgb_factor_; } @@ -261,18 +282,21 @@ lp_build_blend_aos(struct gallivm_state *gallivm, struct lp_type type, unsigned rt, LLVMValueRef src, + LLVMValueRef src_alpha, LLVMValueRef dst, LLVMValueRef mask, LLVMValueRef const_, - const unsigned char swizzle[4]) + 
LLVMValueRef const_alpha, + const unsigned char swizzle[4], + int nr_channels) { const struct pipe_rt_blend_state * state = &blend->rt[rt]; const struct util_format_description * desc; struct lp_build_blend_aos_context bld; LLVMValueRef src_factor, dst_factor; LLVMValueRef result; - unsigned alpha_swizzle = swizzle[3]; - boolean fullcolormask; + unsigned alpha_swizzle = UTIL_FORMAT_SWIZZLE_NONE; + unsigned i; desc = util_format_description(cbuf_format[rt]); @@ -282,20 +306,32 @@ lp_build_blend_aos(struct gallivm_state *gallivm, bld.src = src; bld.dst = dst; bld.const_ = const_; - - if (swizzle[3] > UTIL_FORMAT_SWIZZLE_W || swizzle[3] == swizzle[0]) - alpha_swizzle = UTIL_FORMAT_SWIZZLE_NONE; + bld.src_alpha = src_alpha; + bld.const_alpha = const_alpha; + + /* Find the alpha channel if not provided seperately */ + if (!src_alpha) { + for (i = 0; i < 4; ++i) { + if (swizzle[i] == 3) { + alpha_swizzle = i; + } + } + } if (!state->blend_enable) { result = src; } else { - boolean rgb_alpha_same = state->rgb_src_factor == state->rgb_dst_factor && state->alpha_src_factor == state->alpha_dst_factor; - assert(rgb_alpha_same || alpha_swizzle != UTIL_FORMAT_SWIZZLE_NONE); + boolean rgb_alpha_same = (state->rgb_src_factor == state->rgb_dst_factor && state->alpha_src_factor == state->alpha_dst_factor) || nr_channels == 1; src_factor = lp_build_blend_factor(&bld, state->rgb_src_factor, - state->alpha_src_factor, alpha_swizzle); + state->alpha_src_factor, + alpha_swizzle, + nr_channels); + dst_factor = lp_build_blend_factor(&bld, state->rgb_dst_factor, - state->alpha_dst_factor, alpha_swizzle); + state->alpha_dst_factor, + alpha_swizzle, + nr_channels); result = lp_build_blend(&bld.base, state->rgb_func, @@ -308,7 +344,7 @@ lp_build_blend_aos(struct gallivm_state *gallivm, rgb_alpha_same, false); - if(state->rgb_func != state->alpha_func && alpha_swizzle != UTIL_FORMAT_SWIZZLE_NONE) { + if(state->rgb_func != state->alpha_func && nr_channels > 1 && alpha_swizzle != 
UTIL_FORMAT_SWIZZLE_NONE) { LLVMValueRef alpha; alpha = lp_build_blend(&bld.base, @@ -326,22 +362,27 @@ lp_build_blend_aos(struct gallivm_state *gallivm, result, alpha, LP_BUILD_BLEND_SWIZZLE_RGBA, - alpha_swizzle); + alpha_swizzle, + nr_channels); } } /* Check if color mask is necessary */ - fullcolormask = util_format_colormask_full(util_format_description(cbuf_format[rt]), state->colormask); - - if (!fullcolormask) { + if (!util_format_colormask_full(desc, state->colormask)) { LLVMValueRef color_mask; - color_mask = lp_build_const_mask_aos_swizzled(gallivm, bld.base.type, state->colormask, desc->nr_channels, swizzle); + color_mask = lp_build_const_mask_aos_swizzled(gallivm, bld.base.type, state->colormask, nr_channels, swizzle); lp_build_name(color_mask, "color_mask"); /* Combine with input mask if necessary */ if (mask) { + /* We can be blending floating values but masks are always integer... */ + unsigned floating = bld.base.type.floating; + bld.base.type.floating = 0; + mask = lp_build_and(&bld.base, color_mask, mask); + + bld.base.type.floating = floating; } else { mask = color_mask; } diff --git a/src/gallium/drivers/llvmpipe/lp_jit.c b/src/gallium/drivers/llvmpipe/lp_jit.c index 20c53cb..2667aeb 100644 --- a/src/gallium/drivers/llvmpipe/lp_jit.c +++ b/src/gallium/drivers/llvmpipe/lp_jit.c @@ -128,7 +128,8 @@ lp_jit_create_types(struct lp_fragment_shader_variant *lp) elem_types[LP_JIT_CTX_ALPHA_REF] = LLVMFloatTypeInContext(lc); elem_types[LP_JIT_CTX_STENCIL_REF_FRONT] = elem_types[LP_JIT_CTX_STENCIL_REF_BACK] = LLVMInt32TypeInContext(lc); - elem_types[LP_JIT_CTX_BLEND_COLOR] = LLVMPointerType(LLVMInt8TypeInContext(lc), 0); + elem_types[LP_JIT_CTX_U8_BLEND_COLOR] = LLVMPointerType(LLVMInt8TypeInContext(lc), 0); + elem_types[LP_JIT_CTX_F_BLEND_COLOR] = LLVMPointerType(LLVMFloatTypeInContext(lc), 0); elem_types[LP_JIT_CTX_TEXTURES] = LLVMArrayType(texture_type, PIPE_MAX_SAMPLERS); @@ -153,9 +154,12 @@ lp_jit_create_types(struct lp_fragment_shader_variant *lp) 
LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, stencil_ref_back, gallivm->target, context_type, LP_JIT_CTX_STENCIL_REF_BACK); - LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, blend_color, + LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, u8_blend_color, gallivm->target, context_type, - LP_JIT_CTX_BLEND_COLOR); + LP_JIT_CTX_U8_BLEND_COLOR); + LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, f_blend_color, + gallivm->target, context_type, + LP_JIT_CTX_F_BLEND_COLOR); LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, textures, gallivm->target, context_type, LP_JIT_CTX_TEXTURES); diff --git a/src/gallium/drivers/llvmpipe/lp_jit.h b/src/gallium/drivers/llvmpipe/lp_jit.h index 94a2bb5..b469907 100644 --- a/src/gallium/drivers/llvmpipe/lp_jit.h +++ b/src/gallium/drivers/llvmpipe/lp_jit.h @@ -103,8 +103,8 @@ struct lp_jit_context uint32_t stencil_ref_front, stencil_ref_back; - /* FIXME: store (also?) in floats */ - uint8_t *blend_color; + uint8_t *u8_blend_color; + float *f_blend_color; struct lp_jit_texture textures[PIPE_MAX_SAMPLERS]; }; @@ -119,7 +119,8 @@ enum { LP_JIT_CTX_ALPHA_REF, LP_JIT_CTX_STENCIL_REF_FRONT, LP_JIT_CTX_STENCIL_REF_BACK, - LP_JIT_CTX_BLEND_COLOR, + LP_JIT_CTX_U8_BLEND_COLOR, + LP_JIT_CTX_F_BLEND_COLOR, LP_JIT_CTX_TEXTURES, LP_JIT_CTX_COUNT }; @@ -137,14 +138,33 @@ enum { #define lp_jit_context_stencil_ref_back_value(_gallivm, _ptr) \ lp_build_struct_get(_gallivm, _ptr, LP_JIT_CTX_STENCIL_REF_BACK, "stencil_ref_back") -#define lp_jit_context_blend_color(_gallivm, _ptr) \ - lp_build_struct_get(_gallivm, _ptr, LP_JIT_CTX_BLEND_COLOR, "blend_color") +#define lp_jit_context_u8_blend_color(_gallivm, _ptr) \ + lp_build_struct_get(_gallivm, _ptr, LP_JIT_CTX_U8_BLEND_COLOR, "u8_blend_color") + +#define lp_jit_context_f_blend_color(_gallivm, _ptr) \ + lp_build_struct_get(_gallivm, _ptr, LP_JIT_CTX_F_BLEND_COLOR, "f_blend_color") #define lp_jit_context_textures(_gallivm, _ptr) \ lp_build_struct_get_ptr(_gallivm, _ptr, LP_JIT_CTX_TEXTURES, "textures") +/** + * typedef 
for fragment shader function + * + * @param context jit context + * @param x block start x + * @param y block start y + * @param facing is front facing + * @param a0 shader input a0 + * @param dadx shader input dadx + * @param dady shader input dady + * @param color color buffer + * @param depth depth buffer + * @param mask mask of visible pixels in block + * @param counter pointer to the task's counter (callers pass &task->vis_counter) + * @param stride array of color buffer row strides in bytes, one per color buffer + */ typedef void (*lp_jit_frag_func)(const struct lp_jit_context *context, uint32_t x, @@ -156,7 +176,8 @@ typedef void uint8_t **color, void *depth, uint32_t mask, - uint32_t *counter); + uint32_t *counter, + unsigned *stride); void diff --git a/src/gallium/drivers/llvmpipe/lp_rast.c b/src/gallium/drivers/llvmpipe/lp_rast.c index d743d76..3d83077 100644 --- a/src/gallium/drivers/llvmpipe/lp_rast.c +++ b/src/gallium/drivers/llvmpipe/lp_rast.c @@ -151,47 +151,70 @@ lp_rast_clear_color(struct lp_rasterizer_task *task, const union lp_rast_cmd_arg arg) { const struct lp_scene *scene = task->scene; - const uint8_t *clear_color = arg.clear_color; + uint8_t clear_color[4]; unsigned i; + boolean gray; - LP_DBG(DEBUG_RAST, "%s 0x%x,0x%x,0x%x,0x%x\n", __FUNCTION__, + for (i = 0; i < 4; ++i) { + clear_color[i] = float_to_ubyte(arg.clear_color[i]); + } + + LP_DBG(DEBUG_RAST, "%s 0x%x,0x%x,0x%x,0x%x\n", __FUNCTION__, clear_color[0], clear_color[1], clear_color[2], clear_color[3]); - if (clear_color[0] == clear_color[1] && - clear_color[1] == clear_color[2] && - clear_color[2] == clear_color[3]) { - /* clear to grayscale value {x, x, x, x} */ - for (i = 0; i < scene->fb.nr_cbufs; i++) { - uint8_t *ptr = - lp_rast_get_color_tile_pointer(task, i, LP_TEX_USAGE_WRITE_ALL); - memset(ptr, clear_color[0], TILE_SIZE * TILE_SIZE * 4); - } - } - else { - /* Non-gray color. - * Note: if the swizzled tile layout changes (see TILE_PIXEL) this code - * will need to change. It'll be pretty obvious when clearing no longer - * works.
- */ - const unsigned chunk = TILE_SIZE / 4; - for (i = 0; i < scene->fb.nr_cbufs; i++) { - uint8_t *c = - lp_rast_get_color_tile_pointer(task, i, LP_TEX_USAGE_WRITE_ALL); + gray = + clear_color[0] == clear_color[1] && + clear_color[1] == clear_color[2] && + clear_color[2] == clear_color[3]; + + for (i = 0; i < scene->fb.nr_cbufs; i++) { + if (scene->cbufs[i].unswizzled) { + const struct lp_scene *scene = task->scene; + union util_color uc; + + util_pack_color(arg.clear_color, + scene->fb.cbufs[i]->format, &uc); + + util_fill_rect(scene->cbufs[i].map, + scene->fb.cbufs[i]->format, + scene->cbufs[i].stride, + task->x, + task->y, + TILE_SIZE, + TILE_SIZE, + &uc); + } else { + const unsigned chunk = TILE_SIZE / 4; + uint8_t *ptr; unsigned j; - for (j = 0; j < 4 * TILE_SIZE; j++) { - memset(c, clear_color[0], chunk); - c += chunk; - memset(c, clear_color[1], chunk); - c += chunk; - memset(c, clear_color[2], chunk); - c += chunk; - memset(c, clear_color[3], chunk); - c += chunk; + ptr = lp_rast_get_color_tile_pointer(task, i, LP_TEX_USAGE_WRITE_ALL); + + if (gray) { + /* clear to grayscale value {x, x, x, x} */ + + memset(ptr, clear_color[0], TILE_SIZE * TILE_SIZE * 4); + } else { + /* Non-gray color. + * Note: if the swizzled tile layout changes (see TILE_PIXEL) this code + * will need to change. It'll be pretty obvious when clearing no longer + * works. 
+ */ + + for (j = 0; j < 4 * TILE_SIZE; j++) { + memset(ptr, clear_color[0], chunk); + ptr += chunk; + memset(ptr, clear_color[1], chunk); + ptr += chunk; + memset(ptr, clear_color[2], chunk); + ptr += chunk; + memset(ptr, clear_color[3], chunk); + ptr += chunk; + } } } } @@ -311,7 +334,7 @@ lp_rast_store_linear_color( struct lp_rasterizer_task *task ) const unsigned level = cbuf->u.tex.level; struct llvmpipe_resource *lpt = llvmpipe_resource(cbuf->texture); - if (!task->color_tiles[buf]) + if (scene->cbufs[buf].unswizzled || !task->color_tiles[buf]) continue; llvmpipe_unswizzle_cbuf_tile(lpt, @@ -358,13 +381,20 @@ lp_rast_shade_tile(struct lp_rasterizer_task *task, for (y = 0; y < TILE_SIZE; y += 4){ for (x = 0; x < TILE_SIZE; x += 4) { uint8_t *color[PIPE_MAX_COLOR_BUFS]; + unsigned stride[PIPE_MAX_COLOR_BUFS]; uint32_t *depth; unsigned i; /* color buffer */ - for (i = 0; i < scene->fb.nr_cbufs; i++) - color[i] = lp_rast_get_color_block_pointer(task, i, - tile_x + x, tile_y + y); + for (i = 0; i < scene->fb.nr_cbufs; i++){ + stride[i] = scene->cbufs[i].stride; + + if (scene->cbufs[i].unswizzled) { + color[i] = lp_rast_get_unswizzled_color_block_pointer(task, i, tile_x + x, tile_y + y); + } else { + color[i] = lp_rast_get_color_block_pointer(task, i, tile_x + x, tile_y + y); + } + } /* depth buffer */ depth = lp_rast_get_depth_block_pointer(task, tile_x + x, tile_y + y); @@ -380,7 +410,8 @@ lp_rast_shade_tile(struct lp_rasterizer_task *task, color, depth, 0xffff, - &task->vis_counter); + &task->vis_counter, + stride); END_JIT_CALL(); } } @@ -408,7 +439,9 @@ lp_rast_shade_tile_opaque(struct lp_rasterizer_task *task, /* this will prevent converting the layout from tiled to linear */ for (i = 0; i < scene->fb.nr_cbufs; i++) { - (void)lp_rast_get_color_tile_pointer(task, i, LP_TEX_USAGE_WRITE_ALL); + if (!scene->cbufs[i].unswizzled) { + (void)lp_rast_get_color_tile_pointer(task, i, LP_TEX_USAGE_WRITE_ALL); + } } lp_rast_shade_tile(task, arg); @@ -431,6 +464,7 @@ 
lp_rast_shade_quads_mask(struct lp_rasterizer_task *task, struct lp_fragment_shader_variant *variant = state->variant; const struct lp_scene *scene = task->scene; uint8_t *color[PIPE_MAX_COLOR_BUFS]; + unsigned stride[PIPE_MAX_COLOR_BUFS]; void *depth; unsigned i; @@ -447,15 +481,20 @@ lp_rast_shade_quads_mask(struct lp_rasterizer_task *task, /* color buffer */ for (i = 0; i < scene->fb.nr_cbufs; i++) { - color[i] = lp_rast_get_color_block_pointer(task, i, x, y); - assert(lp_check_alignment(color[i], 16)); + stride[i] = scene->cbufs[i].stride; + + if (scene->cbufs[i].unswizzled) { + color[i] = lp_rast_get_unswizzled_color_block_pointer(task, i, x, y); + } else { + color[i] = lp_rast_get_color_block_pointer(task, i, x, y); + } } /* depth buffer */ depth = lp_rast_get_depth_block_pointer(task, x, y); - assert(lp_check_alignment(state->jit_context.blend_color, 16)); + assert(lp_check_alignment(state->jit_context.u8_blend_color, 16)); /* run shader on 4x4 block */ BEGIN_JIT_CALL(state, task); @@ -468,7 +507,8 @@ lp_rast_shade_quads_mask(struct lp_rasterizer_task *task, color, depth, mask, - &task->vis_counter); + &task->vis_counter, + stride); END_JIT_CALL(); } diff --git a/src/gallium/drivers/llvmpipe/lp_rast.h b/src/gallium/drivers/llvmpipe/lp_rast.h index 49da41f0..315601e 100644 --- a/src/gallium/drivers/llvmpipe/lp_rast.h +++ b/src/gallium/drivers/llvmpipe/lp_rast.h @@ -152,7 +152,7 @@ union lp_rast_cmd_arg { unsigned plane_mask; } triangle; const struct lp_rast_state *set_state; - uint8_t clear_color[4]; + float clear_color[4]; struct { uint32_t value; uint32_t mask; diff --git a/src/gallium/drivers/llvmpipe/lp_rast_priv.h b/src/gallium/drivers/llvmpipe/lp_rast_priv.h index b5d0074..eeb1a94 100644 --- a/src/gallium/drivers/llvmpipe/lp_rast_priv.h +++ b/src/gallium/drivers/llvmpipe/lp_rast_priv.h @@ -189,6 +189,7 @@ lp_rast_get_color_tile_pointer(struct lp_rasterizer_task *task, assert(task->x % TILE_SIZE == 0); assert(task->y % TILE_SIZE == 0); assert(buf < 
scene->fb.nr_cbufs); + assert(scene->cbufs[buf].unswizzled == 0); if (!task->color_tiles[buf]) { struct pipe_surface *cbuf = scene->fb.cbufs[buf]; @@ -211,6 +212,35 @@ lp_rast_get_color_tile_pointer(struct lp_rasterizer_task *task, /** + * Get pointer to the unswizzled color tile + */ +static INLINE uint8_t * +lp_rast_get_unswizzled_color_tile_pointer(struct lp_rasterizer_task *task, + unsigned buf, enum lp_texture_usage usage) +{ + const struct lp_scene *scene = task->scene; + unsigned format_bytes; + + assert(task->x < scene->tiles_x * TILE_SIZE); + assert(task->y < scene->tiles_y * TILE_SIZE); + assert(task->x % TILE_SIZE == 0); + assert(task->y % TILE_SIZE == 0); + assert(buf < scene->fb.nr_cbufs); + assert(scene->cbufs[buf].unswizzled); + + if (!task->color_tiles[buf]) { + struct pipe_surface *cbuf = scene->fb.cbufs[buf]; + assert(cbuf); + + format_bytes = util_format_description(cbuf->format)->block.bits / 8; + task->color_tiles[buf] = scene->cbufs[buf].map + scene->cbufs[buf].stride * task->y + format_bytes * task->x; + } + + return task->color_tiles[buf]; +} + + +/** * Get the pointer to a 4x4 color block (within a 64x64 tile). * We'll map the color buffer on demand here. * Note that this may be called even when there's no color buffers - return @@ -228,6 +258,8 @@ lp_rast_get_color_block_pointer(struct lp_rasterizer_task *task, assert(y < task->scene->tiles_y * TILE_SIZE); assert((x % TILE_VECTOR_WIDTH) == 0); assert((y % TILE_VECTOR_HEIGHT) == 0); + assert(buf < task->scene->fb.nr_cbufs); + assert(task->scene->cbufs[buf].unswizzled == 0); color = lp_rast_get_color_tile_pointer(task, buf, LP_TEX_USAGE_READ_WRITE); assert(color); @@ -243,6 +275,40 @@ lp_rast_get_color_block_pointer(struct lp_rasterizer_task *task, } +/** + * Get the pointer to an unswizzled 4x4 color block (within an unswizzled 64x64 tile). 
+ * \param x, y location of 4x4 block in window coords + */ +static INLINE uint8_t * +lp_rast_get_unswizzled_color_block_pointer(struct lp_rasterizer_task *task, + unsigned buf, unsigned x, unsigned y) +{ + unsigned px, py, pixel_offset, format_bytes; + uint8_t *color; + + assert(x < task->scene->tiles_x * TILE_SIZE); + assert(y < task->scene->tiles_y * TILE_SIZE); + assert((x % TILE_VECTOR_WIDTH) == 0); + assert((y % TILE_VECTOR_HEIGHT) == 0); + assert(buf < task->scene->fb.nr_cbufs); + assert(task->scene->cbufs[buf].unswizzled); + + format_bytes = util_format_description(task->scene->fb.cbufs[buf]->format)->block.bits / 8; + + color = lp_rast_get_unswizzled_color_tile_pointer(task, buf, LP_TEX_USAGE_READ_WRITE); + assert(color); + + px = x % TILE_SIZE; + py = y % TILE_SIZE; + pixel_offset = px * format_bytes + py * task->scene->cbufs[buf].stride; + + color = color + pixel_offset; + + assert(lp_check_alignment(color, llvmpipe_get_format_alignment(task->scene->fb.cbufs[buf]->format))); + return color; +} + + /** * Shade all pixels in a 4x4 block. 
The fragment code omits the @@ -258,12 +324,20 @@ lp_rast_shade_quads_all( struct lp_rasterizer_task *task, const struct lp_rast_state *state = task->state; struct lp_fragment_shader_variant *variant = state->variant; uint8_t *color[PIPE_MAX_COLOR_BUFS]; + unsigned stride[PIPE_MAX_COLOR_BUFS]; void *depth; unsigned i; /* color buffer */ - for (i = 0; i < scene->fb.nr_cbufs; i++) - color[i] = lp_rast_get_color_block_pointer(task, i, x, y); + for (i = 0; i < scene->fb.nr_cbufs; i++) { + stride[i] = scene->cbufs[i].stride; + + if (scene->cbufs[i].unswizzled) { + color[i] = lp_rast_get_unswizzled_color_block_pointer(task, i, x, y); + } else { + color[i] = lp_rast_get_color_block_pointer(task, i, x, y); + } + } depth = lp_rast_get_depth_block_pointer(task, x, y); @@ -278,7 +352,8 @@ lp_rast_shade_quads_all( struct lp_rasterizer_task *task, color, depth, 0xffff, - &task->vis_counter ); + &task->vis_counter, + stride ); END_JIT_CALL(); } diff --git a/src/gallium/drivers/llvmpipe/lp_scene.c b/src/gallium/drivers/llvmpipe/lp_scene.c index ed99824..515717b 100644 --- a/src/gallium/drivers/llvmpipe/lp_scene.c +++ b/src/gallium/drivers/llvmpipe/lp_scene.c @@ -150,6 +150,8 @@ lp_scene_begin_rasterization(struct lp_scene *scene) cbuf->u.tex.first_layer, LP_TEX_USAGE_READ_WRITE, LP_TEX_LAYOUT_LINEAR); + + scene->cbufs[i].unswizzled = llvmpipe_is_format_unswizzled(cbuf->format); } if (fb->zsbuf) { diff --git a/src/gallium/drivers/llvmpipe/lp_scene.h b/src/gallium/drivers/llvmpipe/lp_scene.h index 622c522..012fa67 100644 --- a/src/gallium/drivers/llvmpipe/lp_scene.h +++ b/src/gallium/drivers/llvmpipe/lp_scene.h @@ -137,6 +137,7 @@ struct lp_scene { uint8_t *map; unsigned stride; unsigned blocksize; + unsigned unswizzled; } zsbuf, cbufs[PIPE_MAX_COLOR_BUFS]; /** the framebuffer to render the scene into */ diff --git a/src/gallium/drivers/llvmpipe/lp_setup.c b/src/gallium/drivers/llvmpipe/lp_setup.c index 60144c3..a06acb2 100644 --- a/src/gallium/drivers/llvmpipe/lp_setup.c +++ 
b/src/gallium/drivers/llvmpipe/lp_setup.c @@ -390,7 +390,7 @@ lp_setup_try_clear( struct lp_setup_context *setup, if (flags & PIPE_CLEAR_COLOR) { for (i = 0; i < 4; i++) - color_arg.clear_color[i] = float_to_ubyte(color[i]); + color_arg.clear_color[i] = color[i]; } if (flags & PIPE_CLEAR_DEPTHSTENCIL) { @@ -805,14 +805,26 @@ try_update_scene_state( struct lp_setup_context *setup ) if(setup->dirty & LP_SETUP_NEW_BLEND_COLOR) { uint8_t *stored; + float* fstored; unsigned i, j; + unsigned size; + + /* Alloc u8_blend_color (16 x i8) and f_blend_color (4 or 8 x f32) */ + size = 4 * 16 * sizeof(uint8_t); + size += (LP_MAX_VECTOR_LENGTH / 4) * sizeof(float); + stored = lp_scene_alloc_aligned(scene, size, LP_MAX_VECTOR_LENGTH); - stored = lp_scene_alloc_aligned(scene, 4 * 16, 16); if (!stored) { assert(!new_scene); return FALSE; } + /* Store floating point colour */ + fstored = (float*)(stored + 4*16); + for (i = 0; i < (LP_MAX_VECTOR_LENGTH / 4); ++i) { + fstored[i] = setup->blend_color.current.color[i % 4]; + } + /* smear each blend color component across 16 ubyte elements */ for (i = 0; i < 4; ++i) { uint8_t c = float_to_ubyte(setup->blend_color.current.color[i]); @@ -821,7 +833,8 @@ try_update_scene_state( struct lp_setup_context *setup ) } setup->blend_color.stored = stored; - setup->fs.current.jit_context.blend_color = setup->blend_color.stored; + setup->fs.current.jit_context.u8_blend_color = stored; + setup->fs.current.jit_context.f_blend_color = fstored; setup->dirty |= LP_SETUP_NEW_FS; } diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c index da20ded..bf0217d 100644 --- a/src/gallium/drivers/llvmpipe/lp_state_fs.c +++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c @@ -82,6 +82,10 @@ #include "gallivm/lp_bld_swizzle.h" #include "gallivm/lp_bld_flow.h" #include "gallivm/lp_bld_debug.h" +#include "gallivm/lp_bld_arit.h" +#include "gallivm/lp_bld_pack.h" +#include "gallivm/lp_bld_format.h" +#include 
"gallivm/lp_bld_quad.h" #include "lp_bld_alpha.h" #include "lp_bld_blend.h" @@ -719,7 +723,7 @@ generate_blend(struct gallivm_state *gallivm, vec_type = lp_build_vec_type(gallivm, type); - const_ptr = lp_jit_context_blend_color(gallivm, context_ptr); + const_ptr = lp_jit_context_u8_blend_color(gallivm, context_ptr); const_ptr = LLVMBuildBitCast(builder, const_ptr, LLVMPointerType(vec_type, 0), ""); @@ -752,6 +756,1010 @@ generate_blend(struct gallivm_state *gallivm, /** + * This function will reorder pixels from the fragment shader SoA to memory layout AoS + * + * Fragment Shader outputs pixels in small 2x2 blocks + * e.g. (0, 0), (1, 0), (0, 1), (1, 1) ; (2, 0) ... + * + * However in memory pixels are stored in rows + * e.g. (0, 0), (1, 0), (2, 0), (3, 0) ; (0, 1) ... + * + * @param type fragment shader type (4x or 8x float) + * @param num_fs number of fs_src + * @param dst_channels number of output channels + * @param fs_src output from fragment shader + * @param dst pointer to store result + * @param pad_inline is channel padding inline or at end of row + * @return the number of dsts + */ +static int +generate_fs_twiddle(struct gallivm_state *gallivm, + struct lp_type type, + unsigned num_fs, + unsigned dst_channels, + LLVMValueRef fs_src[][4], + LLVMValueRef* dst, + bool pad_inline) +{ + LLVMValueRef src[16]; + + bool swizzle_pad; + bool twiddle; + bool split; + + unsigned pixels = num_fs == 4 ? 1 : 2; + unsigned reorder_group; + unsigned src_channels; + unsigned src_count; + unsigned i; + + src_channels = dst_channels < 3 ? 
dst_channels : 4; + src_count = num_fs * src_channels; + + assert(pixels == 2 || num_fs == 4); + assert(num_fs * src_channels <= Elements(src)); + + /* + * Transpose from SoA -> AoS + */ + for (i = 0; i < num_fs; ++i) { + lp_build_transpose_aos_n(gallivm, type, &fs_src[i][0], src_channels, &src[i * src_channels]); + } + + /* + * Pick transformation options + */ + swizzle_pad = false; + twiddle = false; + split = false; + reorder_group = 0; + + if (dst_channels == 1) { + twiddle = true; + + if (pixels == 2) { + split = true; + } + } else if (dst_channels == 2) { + if (pixels == 1) { + reorder_group = 1; + } + } else if (dst_channels > 2) { + if (pixels == 1) { + reorder_group = 2; + } else { + twiddle = true; + } + + if (!pad_inline && dst_channels == 3 && pixels > 1) { + swizzle_pad = true; + } + } + + /* + * Split the src in half + */ + if (split) { + for (i = num_fs; i > 0; --i) { + src[(i - 1)*2 + 1] = lp_build_extract_range(gallivm, src[i - 1], 4, 4); + src[(i - 1)*2 + 0] = lp_build_extract_range(gallivm, src[i - 1], 0, 4); + } + + src_count *= 2; + type.length = 4; + } + + /* + * Ensure pixels are in memory order + */ + if (reorder_group) { + /* Twiddle pixels by reordering the array, e.g.: + * + * src_count = 8 -> 0 2 1 3 4 6 5 7 + * src_count = 16 -> 0 1 4 5 2 3 6 7 8 9 12 13 10 11 14 15 + */ + const unsigned reorder_sw[] = { 0, 2, 1, 3 }; + + for (i = 0; i < src_count; ++i) { + unsigned group = i / reorder_group; + unsigned block = (group / 4) * 4 * reorder_group; + unsigned j = block + (reorder_sw[group % 4] * reorder_group) + (i % reorder_group); + dst[i] = src[j]; + } + } else if (twiddle) { + /* Twiddle pixels across elements of array */ + lp_bld_quad_twiddle(gallivm, type, src, src_count, dst); + } else { + /* Do nothing */ + memcpy(dst, src, sizeof(LLVMValueRef) * src_count); + } + + /* + * Moves any padding between pixels to the end + * e.g. 
RGBXRGBX -> RGBRGBXX + */ + if (swizzle_pad) { + unsigned char swizzles[16]; + unsigned elems = pixels * dst_channels; + + for (i = 0; i < type.length; ++i) { + if (i < elems) + swizzles[i] = i % dst_channels + (i / dst_channels) * 4; + else + swizzles[i] = LP_BLD_SWIZZLE_DONTCARE; + } + + for (i = 0; i < src_count; ++i) { + dst[i] = lp_build_swizzle_aos_n(gallivm, dst[i], swizzles, type.length, type.length); + } + } + + return src_count; +} + + +/** + * Load an unswizzled block of pixels from memory + */ +static void +load_unswizzled_block(struct gallivm_state *gallivm, + LLVMValueRef base_ptr, + LLVMValueRef stride, + unsigned block_width, + unsigned block_height, + LLVMValueRef* dst, + struct lp_type dst_type, + unsigned dst_count) +{ + LLVMBuilderRef builder = gallivm->builder; + unsigned row_size = dst_count / block_height; + unsigned i; + + /* Ensure block exactly fits into dst */ + assert((block_width * block_height) % dst_count == 0); + + for (i = 0; i < dst_count; ++i) { + unsigned x = i % row_size; + unsigned y = i / row_size; + + LLVMValueRef bx = lp_build_const_int32(gallivm, x * (dst_type.width / 8) * dst_type.length); + LLVMValueRef by = LLVMBuildMul(builder, lp_build_const_int32(gallivm, y), stride, ""); + + LLVMValueRef gep[2]; + LLVMValueRef dst_ptr; + + gep[0] = lp_build_const_int32(gallivm, 0); + gep[1] = LLVMBuildAdd(builder, bx, by, ""); + + dst_ptr = LLVMBuildGEP(builder, base_ptr, gep, 2, ""); + dst_ptr = LLVMBuildBitCast(builder, dst_ptr, LLVMPointerType(lp_build_vec_type(gallivm, dst_type), 0), ""); + + dst[i] = LLVMBuildLoad(builder, dst_ptr, ""); + + if ((dst_type.length % 3) == 0) { + lp_set_load_alignment(dst[i], dst_type.width / 8); + } + } +} + + +/** + * Store an unswizzled block of pixels to memory + */ +static void +store_unswizzled_block(struct gallivm_state *gallivm, + LLVMValueRef base_ptr, + LLVMValueRef stride, + unsigned block_width, + unsigned block_height, + LLVMValueRef* src, + struct lp_type src_type, + unsigned 
src_count) +{ + LLVMBuilderRef builder = gallivm->builder; + unsigned row_size = src_count / block_height; + unsigned i; + + /* Ensure src exactly fits into block */ + assert((block_width * block_height) % src_count == 0); + + for (i = 0; i < src_count; ++i) { + unsigned x = i % row_size; + unsigned y = i / row_size; + + LLVMValueRef bx = lp_build_const_int32(gallivm, x * (src_type.width / 8) * src_type.length); + LLVMValueRef by = LLVMBuildMul(builder, lp_build_const_int32(gallivm, y), stride, ""); + + LLVMValueRef gep[2]; + LLVMValueRef src_ptr; + + gep[0] = lp_build_const_int32(gallivm, 0); + gep[1] = LLVMBuildAdd(builder, bx, by, ""); + + src_ptr = LLVMBuildGEP(builder, base_ptr, gep, 2, ""); + src_ptr = LLVMBuildBitCast(builder, src_ptr, LLVMPointerType(lp_build_vec_type(gallivm, src_type), 0), ""); + + src_ptr = LLVMBuildStore(builder, src[i], src_ptr); + + if ((src_type.length % 3) == 0) { + lp_set_store_alignment(src_ptr, src_type.width / 8); + } + } +} + + +/** + * Checks if a format description is an arithmetic format + * + * A format which has irregular channel sizes such as R3_G3_B2 or R5_G6_B5. + */ +static INLINE boolean +is_arithmetic_format(const struct util_format_description *format_desc) +{ + boolean arith = false; + unsigned i; + + for (i = 0; i < format_desc->nr_channels; ++i) { + arith |= format_desc->channel[i].size != format_desc->channel[0].size; + arith |= (format_desc->channel[i].size % 8) != 0; + } + + return arith; +} + + +/** + * Retrieves the type representing the memory layout for a format + * + * e.g. 
RGBA16F = 4x half-float and R3G3B2 = 1x byte + */ +static INLINE void +lp_mem_type_from_format_desc(const struct util_format_description *format_desc, + struct lp_type* type) +{ + int i; + + memset(type, 0, sizeof(struct lp_type)); + type->floating = format_desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT; + type->fixed = format_desc->channel[0].type == UTIL_FORMAT_TYPE_FIXED; + type->sign = format_desc->channel[0].type != UTIL_FORMAT_TYPE_UNSIGNED; + type->norm = format_desc->channel[0].normalized; + + if (is_arithmetic_format(format_desc)) { + type->width = 0; + type->length = 1; + + for (i = 0; i < format_desc->nr_channels; ++i) { + type->width += format_desc->channel[i].size; + } + } else { + type->width = format_desc->channel[0].size; + type->length = format_desc->nr_channels; + } +} + + +/** + * Retrieves the type for a format which is usable in the blending code. + * + * e.g. RGBA16F = 4x float, R3G3B2 = 3x byte + */ +static INLINE void +lp_blend_type_from_format_desc(const struct util_format_description *format_desc, + struct lp_type* type) +{ + int i; + + memset(type, 0, sizeof(struct lp_type)); + type->floating = format_desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT; + type->fixed = format_desc->channel[0].type == UTIL_FORMAT_TYPE_FIXED; + type->sign = format_desc->channel[0].type != UTIL_FORMAT_TYPE_UNSIGNED; + type->norm = format_desc->channel[0].normalized; + type->width = format_desc->channel[0].size; + type->length = format_desc->nr_channels; + + for (i = 1; i < format_desc->nr_channels; ++i) { + if (format_desc->channel[i].size > type->width) + type->width = format_desc->channel[i].size; + } + + if (type->floating) { + type->width = 32; + } else { + if (type->width <= 8) { + type->width = 8; + } else if (type->width <= 16) { + type->width = 16; + } else { + type->width = 32; + } + } + + if (is_arithmetic_format(format_desc) && type->length == 3) { + type->length = 4; + } +} + + +/** + * Scale a normalised value from src_bits to dst_bits + */ +static 
INLINE LLVMValueRef +scale_bits(struct gallivm_state *gallivm, + int src_bits, + int dst_bits, + LLVMValueRef src, + struct lp_type src_type) +{ + LLVMBuilderRef builder = gallivm->builder; + LLVMValueRef result = src; + + if (dst_bits < src_bits) { + /* Scale down by LShr (keeps the most significant dst_bits) */ + result = LLVMBuildLShr(builder, + src, + lp_build_const_int_vec(gallivm, src_type, src_bits - dst_bits), + ""); + } else if (dst_bits > src_bits) { + /* Scale up bits by replicating src into the newly opened low bits */ + int db = dst_bits - src_bits; + + /* Shift left by difference in bits */ + result = LLVMBuildShl(builder, + src, + lp_build_const_int_vec(gallivm, src_type, db), + ""); + + if (db <= src_bits) { + /* Enough bits in src to fill the remainder (db == src_bits copies src whole, e.g. 4 -> 8 bits) */ + LLVMValueRef lower = LLVMBuildLShr(builder, + src, + lp_build_const_int_vec(gallivm, src_type, src_bits - db), + ""); + + result = LLVMBuildOr(builder, result, lower, ""); + } else if (db > src_bits) { + /* Need to repeatedly copy src bits to fill remainder in dst */ + unsigned n; + + for (n = src_bits; n < dst_bits; n *= 2) { + LLVMValueRef shuv = lp_build_const_int_vec(gallivm, src_type, n); + + result = LLVMBuildOr(builder, + result, + LLVMBuildLShr(builder, result, shuv, ""), + ""); + } + } + } + + return result; +} + + +/** + * Convert from memory format to blending format + * + * e.g.
GL_R3G3B2 is 1 byte in memory but 3 bytes for blending + */ +static void +convert_to_blend_type(struct gallivm_state *gallivm, + const struct util_format_description *src_fmt, + struct lp_type src_type, + struct lp_type dst_type, + LLVMValueRef* src, + unsigned num_srcs, + LLVMValueRef* dst) +{ + LLVMBuilderRef builder = gallivm->builder; + struct lp_type blend_type; + struct lp_type mem_type; + unsigned i, j, k; + unsigned pixels = 16 / num_srcs; + bool is_arith; + + memcpy(dst, src, sizeof(LLVMValueRef) * num_srcs); + + lp_mem_type_from_format_desc(src_fmt, &mem_type); + lp_blend_type_from_format_desc(src_fmt, &blend_type); + + /* Is the format arithmetic (blend size differs from memory size, half-floats excluded) */ + is_arith = blend_type.length * blend_type.width != mem_type.width * mem_type.length; + is_arith &= !(mem_type.width == 16 && mem_type.floating); + + /* Pad if necessary */ + if (!is_arith && src_type.length < dst_type.length) { + for (i = 0; i < num_srcs; ++i) { + dst[i] = lp_build_pad_vector(gallivm, src[i], dst_type.length); + } + + src_type.length = dst_type.length; + } + + /* Special case for half-floats */ + if (mem_type.width == 16 && mem_type.floating) { + assert(blend_type.width == 32 && blend_type.floating); + lp_build_conv_auto(gallivm, src_type, &dst_type, dst, num_srcs, dst); + is_arith = false; + } + + if (!is_arith) { + return; + } + + src_type.width = blend_type.width * blend_type.length; + blend_type.length *= pixels; + src_type.length *= pixels / (src_type.length / mem_type.length); + + for (i = 0; i < num_srcs; ++i) { + LLVMValueRef chans[4]; + LLVMValueRef res; + unsigned sa = 0; + + dst[i] = LLVMBuildZExt(builder, src[i], lp_build_vec_type(gallivm, src_type), ""); + + for (j = 0; j < src_fmt->nr_channels; ++j) { + unsigned mask = 0; + + for (k = 0; k < src_fmt->channel[j].size; ++k) { + mask |= 1u << k; + } + + /* Extract bits from source */ + chans[j] = LLVMBuildLShr(builder, + dst[i], + lp_build_const_int_vec(gallivm, src_type, sa), + ""); + + chans[j] = LLVMBuildAnd(builder, +
chans[j], + lp_build_const_int_vec(gallivm, src_type, mask), + ""); + + /* Scale bits */ + chans[j] = scale_bits(gallivm, src_fmt->channel[j].size, blend_type.width, chans[j], src_type); + + /* Insert bits into correct position */ + chans[j] = LLVMBuildShl(builder, + chans[j], + lp_build_const_int_vec(gallivm, src_type, j * blend_type.width), + ""); + + sa += src_fmt->channel[j].size; + + if (j == 0) { + res = chans[j]; + } else { + res = LLVMBuildOr(builder, res, chans[j], ""); + } + } + + dst[i] = LLVMBuildBitCast(builder, res, lp_build_vec_type(gallivm, blend_type), ""); + } +} + + +/** + * Convert from blending format to memory format + * + * e.g. GL_R3G3B2 is 3 bytes for blending but 1 byte in memory + */ +static void +convert_from_blend_type(struct gallivm_state *gallivm, + const struct util_format_description *src_fmt, + struct lp_type src_type, + struct lp_type dst_type, + LLVMValueRef* src, + unsigned num_srcs, + LLVMValueRef* dst) +{ + unsigned i, j, k; + struct lp_type mem_type; + struct lp_type blend_type; + LLVMBuilderRef builder = gallivm->builder; + unsigned pixels = 16 / num_srcs; + bool is_arith; + + memcpy(dst, src, sizeof(LLVMValueRef) * num_srcs); + + lp_mem_type_from_format_desc(src_fmt, &mem_type); + lp_blend_type_from_format_desc(src_fmt, &blend_type); + + is_arith = (blend_type.length * blend_type.width != mem_type.width * mem_type.length); + + /* Special case for half-floats */ + if (mem_type.width == 16 && mem_type.floating) { + int length = dst_type.length; + assert(blend_type.width == 32 && blend_type.floating); + + dst_type.length = src_type.length; + + lp_build_conv_auto(gallivm, src_type, &dst_type, dst, num_srcs, dst); + + dst_type.length = length; + is_arith = false; + } + + /* Remove any padding */ + if (!is_arith && (src_type.length % mem_type.length)) { + src_type.length -= (src_type.length % mem_type.length); + + for (i = 0; i < num_srcs; ++i) { + dst[i] = lp_build_extract_range(gallivm, dst[i], 0, src_type.length); + } + } + +
/* No bit arithmetic to do */ + if (!is_arith) { + return; + } + + src_type.length = pixels; + src_type.width = blend_type.length * blend_type.width; + dst_type.length = pixels; + + for (i = 0; i < num_srcs; ++i) { + LLVMValueRef chans[4]; + LLVMValueRef res; + unsigned sa = 0; + + dst[i] = LLVMBuildBitCast(builder, src[i], lp_build_vec_type(gallivm, src_type), ""); + + for (j = 0; j < src_fmt->nr_channels; ++j) { + unsigned mask = 0; + + assert(blend_type.width > src_fmt->channel[j].size); + + for (k = 0; k < blend_type.width; ++k) { + mask |= 1u << k; + } + + /* Extract bits */ + chans[j] = LLVMBuildLShr(builder, + dst[i], + lp_build_const_int_vec(gallivm, src_type, j * blend_type.width), + ""); + + chans[j] = LLVMBuildAnd(builder, + chans[j], + lp_build_const_int_vec(gallivm, src_type, mask), + ""); + + /* Scale down bits */ + chans[j] = scale_bits(gallivm, blend_type.width, src_fmt->channel[j].size, chans[j], src_type); + + /* Insert bits */ + chans[j] = LLVMBuildShl(builder, + chans[j], + lp_build_const_int_vec(gallivm, src_type, sa), + ""); + + sa += src_fmt->channel[j].size; + + if (j == 0) { + res = chans[j]; + } else { + res = LLVMBuildOr(builder, res, chans[j], ""); + } + } + + assert (dst_type.width != 24); + + dst[i] = LLVMBuildTrunc(builder, res, lp_build_vec_type(gallivm, dst_type), ""); + } +} + + +/** + * Generates the blend function for unswizzled colour buffers + * Also generates the read & write from colour buffer + */ +static void +generate_unswizzled_blend(struct gallivm_state *gallivm, + unsigned rt, + struct lp_fragment_shader_variant *variant, + enum pipe_format out_format, + unsigned int num_fs, + struct lp_type fs_type, + LLVMValueRef* fs_mask, + LLVMValueRef fs_out_color[TGSI_NUM_CHANNELS][4], + LLVMValueRef context_ptr, + LLVMValueRef color_ptr, + LLVMValueRef stride, + unsigned partial_mask, + boolean do_branch) +{ + const unsigned alpha_channel = 3; + const unsigned block_width = 4; + const unsigned block_height = 4; + const unsigned
block_size = block_width * block_height; + const unsigned lp_integer_vector_width = 128; + + LLVMBuilderRef builder = gallivm->builder; + LLVMValueRef fs_src[4][TGSI_NUM_CHANNELS]; + LLVMValueRef src_alpha[block_size]; + LLVMValueRef src_mask[block_size]; + LLVMValueRef src[block_size]; + LLVMValueRef dst[block_size]; + LLVMValueRef blend_color; + LLVMValueRef blend_alpha; + LLVMValueRef i32_zero; + LLVMValueRef check_mask; + + struct lp_build_mask_context mask_ctx; + struct lp_type mask_type; + struct lp_type blend_type; + struct lp_type alpha_type; + struct lp_type row_type; + struct lp_type dst_type; + + unsigned char swizzle[TGSI_NUM_CHANNELS]; + unsigned vector_width; + unsigned dst_channels; + unsigned src_channels; + unsigned dst_count; + unsigned src_count; + unsigned i, j; + + const struct util_format_description* out_format_desc = util_format_description(out_format); + + bool pad_inline = is_arithmetic_format(out_format_desc); + bool has_alpha = false; + + src_channels = TGSI_NUM_CHANNELS; + mask_type = lp_int32_vec4_type(); + mask_type.length = fs_type.length; + + /* Do not bother executing code when mask is empty.. */ + if (do_branch) { + check_mask = LLVMConstNull(lp_build_int_vec_type(gallivm, mask_type)); + + for (i = 0; i < num_fs; ++i) { + check_mask = LLVMBuildOr(builder, check_mask, fs_mask[i], ""); + } + + lp_build_mask_begin(&mask_ctx, gallivm, mask_type, check_mask); + lp_build_mask_check(&mask_ctx); + } + + partial_mask |= !variant->opaque; + i32_zero = lp_build_const_int32(gallivm, 0); + + /* Get type from output format */ + lp_blend_type_from_format_desc(out_format_desc, &row_type); + lp_mem_type_from_format_desc(out_format_desc, &dst_type); + + row_type.length = fs_type.length; + vector_width = dst_type.floating ? 
lp_native_vector_width : lp_integer_vector_width; + + /* Compute correct swizzle and count channels */ + memset(swizzle, 0xFF, TGSI_NUM_CHANNELS); + dst_channels = 0; + + for (i = 0; i < TGSI_NUM_CHANNELS; ++i) { + /* Ensure channel is used */ + if (out_format_desc->swizzle[i] >= TGSI_NUM_CHANNELS) { + continue; + } + + /* Ensure not already written to (happens in case with GL_ALPHA) */ + if (swizzle[out_format_desc->swizzle[i]] < TGSI_NUM_CHANNELS) { + continue; + } + + /* Ensure we havn't already found all channels */ + if (dst_channels >= out_format_desc->nr_channels) { + continue; + } + + swizzle[out_format_desc->swizzle[i]] = i; + ++dst_channels; + + if (i == alpha_channel) { + has_alpha = true; + } + } + + /* If 3 channels then pad to include alpha for 4 element transpose */ + if (dst_channels == 3 && !has_alpha) { + swizzle[3] = 3; + + if (out_format_desc->nr_channels == 4) { + dst_channels = 4; + } + } + + /* + * Load shader output + */ + for (i = 0; i < num_fs; ++i) { + /* Always load alpha for use in blending */ + LLVMValueRef alpha = LLVMBuildLoad(builder, fs_out_color[alpha_channel][i], ""); + + /* Load each channel */ + for (j = 0; j < dst_channels; ++j) { + fs_src[i][j] = LLVMBuildLoad(builder, fs_out_color[swizzle[j]][i], ""); + } + + /* If 3 channels then pad to include alpha for 4 element transpose */ + if (dst_channels == 3 && !has_alpha) { + fs_src[i][3] = alpha; + swizzle[3] = 3; + } + + /* We split the row_mask and row_alpha as we want 128bit interleave */ + if (fs_type.length == 8) { + src_mask[i*2 + 0] = lp_build_extract_range(gallivm, fs_mask[i], 0, src_channels); + src_mask[i*2 + 1] = lp_build_extract_range(gallivm, fs_mask[i], src_channels, src_channels); + + src_alpha[i*2 + 0] = lp_build_extract_range(gallivm, alpha, 0, src_channels); + src_alpha[i*2 + 1] = lp_build_extract_range(gallivm, alpha, src_channels, src_channels); + } else { + src_mask[i] = fs_mask[i]; + src_alpha[i] = alpha; + } + } + + + /* + * Pixel twiddle from fragment 
shader order to memory order + */ + src_count = generate_fs_twiddle(gallivm, fs_type, num_fs, dst_channels, fs_src, src, pad_inline); + src_channels = dst_channels < 3 ? dst_channels : 4; + if (src_count != num_fs * src_channels) { + unsigned ds = src_count / (num_fs * src_channels); + row_type.length /= ds; + fs_type.length = row_type.length; + } + + blend_type = row_type; + alpha_type = fs_type; + alpha_type.length = 4; + mask_type.length = 4; + + /* Convert src to row_type */ + src_count = lp_build_conv_auto(gallivm, fs_type, &row_type, src, src_count, src); + + /* If the rows are not an SSE vector, combine them to become SSE size! */ + if ((row_type.width * row_type.length) % 128) { + unsigned bits = row_type.width * row_type.length; + unsigned combined; + + dst_count = src_count / (vector_width / bits); + combined = lp_build_concat_n(gallivm, row_type, src, src_count, src, dst_count); + + row_type.length *= combined; + src_count /= combined; + + bits = row_type.width * row_type.length; + assert(bits == 128 || bits == 256); + } + + + /* + * Blend Colour conversion + */ + blend_color = lp_jit_context_f_blend_color(gallivm, context_ptr); + blend_color = LLVMBuildPointerCast(builder, blend_color, LLVMPointerType(lp_build_vec_type(gallivm, fs_type), 0), ""); + blend_color = LLVMBuildLoad(builder, LLVMBuildGEP(builder, blend_color, &i32_zero, 1, ""), ""); + + /* Convert */ + lp_build_conv(gallivm, fs_type, blend_type, &blend_color, 1, &blend_color, 1); + + /* Extract alpha */ + blend_alpha = lp_build_extract_broadcast(gallivm, blend_type, row_type, blend_color, lp_build_const_int32(gallivm, 3)); + + /* Swizzle to appropriate channels, e.g. from RGBA to BGRA BGRA */ + pad_inline &= (dst_channels * (block_size / src_count) * row_type.width) != vector_width; + if (pad_inline) { + /* Use all 4 channels e.g. 
from RGBA RGBA to RGxx RGxx */ + blend_color = lp_build_swizzle_aos_n(gallivm, blend_color, swizzle, TGSI_NUM_CHANNELS, row_type.length); + } else { + /* Only use dst_channels e.g. RGBA RGBA to RG RG xxxx */ + blend_color = lp_build_swizzle_aos_n(gallivm, blend_color, swizzle, dst_channels, row_type.length); + } + + /* + * Mask conversion + */ + lp_bld_quad_twiddle(gallivm, mask_type, &src_mask[0], 4, &src_mask[0]); + + if (src_count < block_height) { + lp_build_concat_n(gallivm, mask_type, src_mask, 4, src_mask, src_count); + } else if (src_count > block_height) { + for (i = src_count; i > 0; --i) { + unsigned pixels = block_size / src_count; + unsigned idx = i - 1; + + src_mask[idx] = lp_build_extract_range(gallivm, src_mask[(idx * pixels) / 4], (idx * pixels) % 4, pixels); + } + } + + assert(mask_type.width == 32); + + for (i = 0; i < src_count; ++i) { + unsigned pixels = block_size / src_count; + unsigned pixel_width = row_type.width * dst_channels; + + if (pixel_width == 24) { + mask_type.width = 8; + mask_type.length = vector_width / mask_type.width; + } else { + mask_type.length = pixels; + mask_type.width = row_type.width * dst_channels; + + src_mask[i] = LLVMBuildIntCast(builder, src_mask[i], lp_build_int_vec_type(gallivm, mask_type), ""); + + mask_type.length *= dst_channels; + mask_type.width /= dst_channels; + } + + src_mask[i] = LLVMBuildBitCast(builder, src_mask[i], lp_build_int_vec_type(gallivm, mask_type), ""); + src_mask[i] = lp_build_pad_vector(gallivm, src_mask[i], row_type.length); + } + + /* + * Alpha conversion + */ + if (!has_alpha) { + unsigned length = row_type.length; + row_type.length = alpha_type.length; + + /* Twiddle the alpha to match pixels */ + lp_bld_quad_twiddle(gallivm, alpha_type, src_alpha, 4, src_alpha); + + for (i = 0; i < 4; ++i) { + lp_build_conv(gallivm, alpha_type, row_type, &src_alpha[i], 1, &src_alpha[i], 1); + } + + alpha_type = row_type; + row_type.length = length; + + /* If only one channel we can only need the 
single alpha value per pixel */ + if (src_count == 1) { + assert(dst_channels == 1); + + lp_build_concat_n(gallivm, alpha_type, src_alpha, 4, src_alpha, src_count); + } else { + /* If there are more srcs than rows then we need to split alpha up */ + if (src_count > block_height) { + for (i = src_count; i > 0; --i) { + unsigned pixels = block_size / src_count; + unsigned idx = i - 1; + + src_alpha[idx] = lp_build_extract_range(gallivm, src_alpha[(idx * pixels) / 4], (idx * pixels) % 4, pixels); + } + } + + /* If there is a src for each pixel broadcast the alpha across whole row */ + if (src_count == block_size) { + for (i = 0; i < src_count; ++i) { + src_alpha[i] = lp_build_broadcast(gallivm, lp_build_vec_type(gallivm, row_type), src_alpha[i]); + } + } else { + unsigned pixels = block_size / src_count; + unsigned channels = pad_inline ? TGSI_NUM_CHANNELS : dst_channels; + unsigned alpha_span = 1; + + /* Check if we need 2 src_alphas for our shuffles */ + if (pixels > alpha_type.length) { + alpha_span = 2; + } + + /* Broadcast alpha across all channels, e.g. 
a1a2 to a1a1a1a1a2a2a2a2 */ + for (i = 0; i < src_count; ++i) { + LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH]; + unsigned idx1 = i, idx2 = i; + + if (alpha_span > 1){ + idx1 *= alpha_span; + idx2 = idx1 + 1; + } + + for (j = 0; j < row_type.length; ++j) { + if (j < pixels * channels) { + shuffles[j] = lp_build_const_int32(gallivm, j / channels); + } else { + shuffles[j] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context)); + } + } + + src_alpha[i] = LLVMBuildShuffleVector(builder, + src_alpha[idx1], + src_alpha[idx2], + LLVMConstVector(shuffles, row_type.length), + ""); + } + } + } + } + + + /* + * Load dst from memory + */ + if (src_count < block_height) { + dst_count = block_height; + } else { + dst_count = src_count; + } + + dst_type.length *= 16 / dst_count; + + load_unswizzled_block(gallivm, color_ptr, stride, block_width, block_height, dst, dst_type, dst_count); + + + /* + * Convert from dst/output format to src/blending format. + * + * This is necessary as we can only read 1 row from memory at a time, + * so the minimum dst_count will ever be at this point is 4. + * + * With, for example, R8 format you can have all 16 pixels in a 128 bit vector, + * this will take the 4 dsts and combine them into 1 src so we can perform blending + * on all 16 pixels in that single vector at once. + */ + if (dst_count > src_count) { + lp_build_concat_n(gallivm, dst_type, dst, 4, dst, src_count); + } + + /* + * Blending + */ + convert_to_blend_type(gallivm, out_format_desc, dst_type, row_type, dst, src_count, dst); + + for (i = 0; i < src_count; ++i) { + dst[i] = lp_build_blend_aos(gallivm, + &variant->key.blend, + variant->key.cbuf_format, + row_type, + rt, + src[i], + has_alpha ? NULL : src_alpha[i], + dst[i], + partial_mask ? src_mask[i] : NULL, + blend_color, + has_alpha ? NULL : blend_alpha, + swizzle, + pad_inline ? 
4 : dst_channels); + } + + convert_from_blend_type(gallivm, out_format_desc, row_type, dst_type, dst, src_count, dst); + + /* Split the blend rows back to memory rows */ + if (dst_count > src_count) { + row_type.length = dst_type.length * (dst_count / src_count); + + if (src_count == 1) { + dst[1] = lp_build_extract_range(gallivm, dst[0], row_type.length / 2, row_type.length / 2); + dst[0] = lp_build_extract_range(gallivm, dst[0], 0, row_type.length / 2); + + row_type.length /= 2; + src_count *= 2; + } + + dst[3] = lp_build_extract_range(gallivm, dst[1], row_type.length / 2, row_type.length / 2); + dst[2] = lp_build_extract_range(gallivm, dst[1], 0, row_type.length / 2); + dst[1] = lp_build_extract_range(gallivm, dst[0], row_type.length / 2, row_type.length / 2); + dst[0] = lp_build_extract_range(gallivm, dst[0], 0, row_type.length / 2); + + row_type.length /= 2; + src_count *= 2; + } + + + /* + * Store blend result to memory + */ + store_unswizzled_block(gallivm, color_ptr, stride, block_width, block_height, dst, dst_type, dst_count); + + if (do_branch) { + lp_build_mask_end(&mask_ctx); + } +} + + +/** * Generate the runtime callable function for the whole fragment pipeline. * Note that the function which we generate operates on a block of 16 * pixels at at time. The block contains 2x2 quads. 
Each quad contains @@ -771,7 +1779,7 @@ generate_fragment(struct llvmpipe_context *lp, struct lp_type blend_type; LLVMTypeRef fs_elem_type; LLVMTypeRef blend_vec_type; - LLVMTypeRef arg_types[11]; + LLVMTypeRef arg_types[12]; LLVMTypeRef func_type; LLVMTypeRef int32_type = LLVMInt32TypeInContext(gallivm->context); LLVMTypeRef int8_type = LLVMInt8TypeInContext(gallivm->context); @@ -782,6 +1790,7 @@ generate_fragment(struct llvmpipe_context *lp, LLVMValueRef dadx_ptr; LLVMValueRef dady_ptr; LLVMValueRef color_ptr_ptr; + LLVMValueRef stride_ptr; LLVMValueRef depth_ptr; LLVMValueRef mask_input; LLVMValueRef counter = NULL; @@ -867,6 +1876,7 @@ generate_fragment(struct llvmpipe_context *lp, arg_types[8] = LLVMPointerType(int8_type, 0); /* depth */ arg_types[9] = int32_type; /* mask_input */ arg_types[10] = LLVMPointerType(int32_type, 0); /* counter */ + arg_types[11] = LLVMPointerType(int32_type, 0); /* stride */ func_type = LLVMFunctionType(LLVMVoidTypeInContext(gallivm->context), arg_types, Elements(arg_types), 0); @@ -893,6 +1903,7 @@ generate_fragment(struct llvmpipe_context *lp, color_ptr_ptr = LLVMGetParam(function, 7); depth_ptr = LLVMGetParam(function, 8); mask_input = LLVMGetParam(function, 9); + stride_ptr = LLVMGetParam(function, 11); lp_build_name(context_ptr, "context"); lp_build_name(x, "x"); @@ -903,6 +1914,7 @@ generate_fragment(struct llvmpipe_context *lp, lp_build_name(color_ptr_ptr, "color_ptr_ptr"); lp_build_name(depth_ptr, "depth"); lp_build_name(mask_input, "mask_input"); + lp_build_name(stride_ptr, "stride_ptr"); if (key->occlusion_count) { counter = LLVMGetParam(function, 10); @@ -1048,54 +2060,56 @@ generate_fragment(struct llvmpipe_context *lp, LLVMValueRef color_ptr; LLVMValueRef index = lp_build_const_int32(gallivm, cbuf); LLVMValueRef blend_in_color[TGSI_NUM_CHANNELS]; - unsigned rt; - - /* - * Convert the fs's output color and mask to fit to the blending type. 
- */ - for(chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) { - LLVMValueRef fs_color_vals[LP_MAX_VECTOR_LENGTH]; - - for (i = 0; i < num_fs; i++) { - fs_color_vals[i] = - LLVMBuildLoad(builder, fs_out_color[cbuf][chan][i], "fs_color_vals"); - } - - lp_build_conv(gallivm, fs_type, blend_type, - fs_color_vals, - num_fs, - &blend_in_color[chan], 1); + unsigned rt = key->blend.independent_blend_enable ? cbuf : 0; - lp_build_name(blend_in_color[chan], "color%d.%c", cbuf, "rgba"[chan]); - } - - if (partial_mask || !variant->opaque) { - lp_build_conv_mask(variant->gallivm, fs_type, blend_type, - fs_mask, num_fs, - &blend_mask, 1); - } else { - blend_mask = lp_build_const_int_vec(variant->gallivm, blend_type, ~0); - } + boolean do_branch = ((key->depth.enabled + || key->stencil[0].enabled + || key->alpha.enabled) + && !shader->info.base.uses_kill); - color_ptr = LLVMBuildLoad(builder, + color_ptr = LLVMBuildLoad(builder, LLVMBuildGEP(builder, color_ptr_ptr, &index, 1, ""), ""); + lp_build_name(color_ptr, "color_ptr%d", cbuf); - /* which blend/colormask state to use */ - rt = key->blend.independent_blend_enable ? cbuf : 0; + if (variant->unswizzled_cbufs & (1 << cbuf)) { + LLVMValueRef stride = LLVMBuildLoad(builder, + LLVMBuildGEP(builder, stride_ptr, &index, 1, ""), + ""); - /* - * Blending. - */ - { - /* Could the 4x4 have been killed? + generate_unswizzled_blend(gallivm, rt, variant, key->cbuf_format[cbuf], + num_fs, fs_type, fs_mask, fs_out_color[cbuf], + context_ptr, color_ptr, stride, partial_mask, do_branch); + } else { + /* + * Convert the fs's output color and mask to fit to the blending type. 
*/ - boolean do_branch = ((key->depth.enabled || key->stencil[0].enabled) && - !key->alpha.enabled && - !shader->info.base.uses_kill); + for(chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) { + LLVMValueRef fs_color_vals[LP_MAX_VECTOR_LENGTH]; + + for (i = 0; i < num_fs; i++) { + fs_color_vals[i] = + LLVMBuildLoad(builder, fs_out_color[cbuf][chan][i], "fs_color_vals"); + } + + lp_build_conv(gallivm, fs_type, blend_type, + fs_color_vals, + num_fs, + &blend_in_color[chan], 1); + + lp_build_name(blend_in_color[chan], "color%d.%c", cbuf, "rgba"[chan]); + } + + if (partial_mask || !variant->opaque) { + lp_build_conv_mask(gallivm, fs_type, blend_type, + fs_mask, num_fs, + &blend_mask, 1); + } else { + blend_mask = lp_build_const_int_vec(gallivm, blend_type, ~0); + } - generate_blend(variant->gallivm, + generate_blend(gallivm, &key->blend, rt, builder, @@ -1221,6 +2235,7 @@ generate_variant(struct llvmpipe_context *lp, struct lp_fragment_shader_variant *variant; const struct util_format_description *cbuf0_format_desc; boolean fullcolormask; + unsigned i; variant = CALLOC_STRUCT(lp_fragment_shader_variant); if(!variant) @@ -1258,6 +2273,9 @@ generate_variant(struct llvmpipe_context *lp, !shader->info.base.uses_kill ? 
TRUE : FALSE; + for (i = 0; i < key->nr_cbufs; ++i) { + variant->unswizzled_cbufs |= llvmpipe_is_format_unswizzled(key->cbuf_format[i]) << i; + } if ((LP_DEBUG & DEBUG_FS) || (gallivm_debug & GALLIVM_DEBUG_IR)) { lp_debug_fs_variant(variant); diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.h b/src/gallium/drivers/llvmpipe/lp_state_fs.h index 306f5f9..173d2f4 100644 --- a/src/gallium/drivers/llvmpipe/lp_state_fs.h +++ b/src/gallium/drivers/llvmpipe/lp_state_fs.h @@ -94,6 +94,9 @@ struct lp_fragment_shader_variant lp_jit_frag_func jit_function[2]; + /* Bitmask to say what cbufs are unswizzled: bit i is set when the format of cbuf i is unswizzled */ + unsigned unswizzled_cbufs; + /* Total number of LLVM instructions generated */ unsigned nr_instrs; diff --git a/src/gallium/drivers/llvmpipe/lp_test_blend.c b/src/gallium/drivers/llvmpipe/lp_test_blend.c index 37b37fd..9ceb4a6 100644 --- a/src/gallium/drivers/llvmpipe/lp_test_blend.c +++ b/src/gallium/drivers/llvmpipe/lp_test_blend.c @@ -195,7 +195,7 @@ add_blend_test(struct gallivm_state *gallivm, dst = LLVMBuildLoad(builder, dst_ptr, "dst"); con = LLVMBuildLoad(builder, const_ptr, "const"); - res = lp_build_blend_aos(gallivm, blend, &format, type, rt, src, dst, NULL, con, swizzle); + res = lp_build_blend_aos(gallivm, blend, &format, type, rt, src, NULL, dst, NULL, con, NULL, swizzle, 4); lp_build_name(res, "res"); diff --git a/src/gallium/drivers/llvmpipe/lp_texture.c b/src/gallium/drivers/llvmpipe/lp_texture.c index f61e378..3d0acdf 100644 --- a/src/gallium/drivers/llvmpipe/lp_texture.c +++ b/src/gallium/drivers/llvmpipe/lp_texture.c @@ -756,6 +756,73 @@ llvmpipe_is_resource_referenced( struct pipe_context *pipe, return lp_setup_is_resource_referenced(llvmpipe->setup, presource); } +boolean +llvmpipe_is_format_unswizzled( enum pipe_format format ) +{ + const struct util_format_description *desc = util_format_description(format); + unsigned chan; + + if (format == PIPE_FORMAT_B8G8R8X8_UNORM || format == PIPE_FORMAT_B8G8R8A8_UNORM) { /* these two formats are explicitly reported as not unswizzled */ + return FALSE; + } + + if
(desc->layout != UTIL_FORMAT_LAYOUT_PLAIN || + desc->colorspace != UTIL_FORMAT_COLORSPACE_RGB || + desc->block.width != 1 || + desc->block.height != 1) { /* only plain-layout, RGB-colourspace formats with 1x1 blocks qualify */ + return FALSE; + } + + for (chan = 0; chan < desc->nr_channels; ++chan) { /* every channel must match channel 0 in type, normalized and pure_integer */ + if (desc->channel[chan].type == UTIL_FORMAT_TYPE_VOID && (chan + 1) == desc->nr_channels) + continue; /* a VOID channel in the last position is tolerated */ + + if (desc->channel[chan].type != desc->channel[0].type) + return FALSE; + + if (desc->channel[chan].normalized != desc->channel[0].normalized) + return FALSE; + + if (desc->channel[chan].pure_integer != desc->channel[0].pure_integer) + return FALSE; + } + + /* All code assumes alpha is the last channel, so reject 4-channel formats whose swizzle[3] is below 3 */ + if (desc->nr_channels == 4 && desc->swizzle[3] < 3) { + return FALSE; + } + + return TRUE; +} + + +/** + * Returns the largest possible alignment, in bytes, for a format in llvmpipe: the summed channel size when util_is_power_of_two() accepts it, otherwise that size divided by the channel count; 1 if the result is odd or zero. + */ +unsigned +llvmpipe_get_format_alignment( enum pipe_format format ) +{ + const struct util_format_description *desc = util_format_description(format); + unsigned size = 0; + unsigned bytes; + unsigned i; + + for (i = 0; i < desc->nr_channels; ++i) { + size += desc->channel[i].size; + } + + bytes = size / 8; + + if (!util_is_power_of_two(bytes)) { + bytes /= desc->nr_channels; /* fall back to the per-channel size */ + } + + if (bytes % 2 || bytes < 1) { + return 1; + } else { + return bytes; + } +} /** diff --git a/src/gallium/drivers/llvmpipe/lp_texture.h b/src/gallium/drivers/llvmpipe/lp_texture.h index 73eaddc..67b2540 100644 --- a/src/gallium/drivers/llvmpipe/lp_texture.h +++ b/src/gallium/drivers/llvmpipe/lp_texture.h @@ -256,4 +256,10 @@ llvmpipe_is_resource_referenced( struct pipe_context *pipe, struct pipe_resource *presource, unsigned level, int layer); +boolean +llvmpipe_is_format_unswizzled(enum pipe_format format); + +unsigned +llvmpipe_get_format_alignment(enum pipe_format format); + #endif /* LP_TEXTURE_H */ -- 2.7.4