From 78ee74de4a37df75c0745e15254753f1f9ab8aa8 Mon Sep 17 00:00:00 2001
From: Ian Romanick
Date: Mon, 13 Mar 2023 19:47:07 -0700
Subject: [PATCH] intel/compiler: Micro optimize regions_overlap
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

On my Ice Lake laptop (using a locked CPU speed and other measures to
prevent thermal throttling, etc.) using a release build, improves
performance of compiling shaders from batman_arkham_city_goty.foz by
-1.09% ± 0.084% (n = 5, pooled s = 0.354471)

Reduces the size of a release build by 26k.

    text    data     bss      dec     hex filename
23163641  400720  231360 23795721 16b1809 before/lib64/dri/iris_dri.so
23137264  400720  231360 23769344 16ab100 after/lib64/dri/iris_dri.so

Reviewed-by: Lionel Landwerlin
Part-of:
---
 src/intel/compiler/brw_ir_fs.h | 48 ++++++++++++++++++++++++++++++------------
 1 file changed, 34 insertions(+), 14 deletions(-)

diff --git a/src/intel/compiler/brw_ir_fs.h b/src/intel/compiler/brw_ir_fs.h
index 2a696cc..06a2346 100644
--- a/src/intel/compiler/brw_ir_fs.h
+++ b/src/intel/compiler/brw_ir_fs.h
@@ -212,30 +212,50 @@ reg_padding(const fs_reg &r)
    return (MAX2(1, stride) - 1) * type_sz(r.type);
 }
 
-/**
- * Return whether the register region starting at \p r and spanning \p dr
- * bytes could potentially overlap the register region starting at \p s and
- * spanning \p ds bytes.
- */
+/* Do not call this directly. Call regions_overlap() instead. */
 static inline bool
-regions_overlap(const fs_reg &r, unsigned dr, const fs_reg &s, unsigned ds)
+regions_overlap_MRF(const fs_reg &r, unsigned dr, const fs_reg &s, unsigned ds)
 {
-   if (r.file == MRF && (r.nr & BRW_MRF_COMPR4)) {
+   if (r.nr & BRW_MRF_COMPR4) {
       fs_reg t = r;
       t.nr &= ~BRW_MRF_COMPR4;
       /* COMPR4 regions are translated by the hardware during decompression
        * into two separate half-regions 4 MRFs apart from each other.
+       *
+       * Note: swapping s and t in this parameter list eliminates one possible
+       * level of recursion (since the s in the called versions of
+       * regions_overlap_MRF can't be COMPR4), and that makes the compiled
+       * code a lot smaller.
        */
-      return regions_overlap(t, dr / 2, s, ds) ||
-             regions_overlap(byte_offset(t, 4 * REG_SIZE), dr / 2, s, ds);
+      return regions_overlap_MRF(s, ds, t, dr / 2) ||
+             regions_overlap_MRF(s, ds, byte_offset(t, 4 * REG_SIZE), dr / 2);
+   } else if (s.nr & BRW_MRF_COMPR4) {
+      return regions_overlap_MRF(s, ds, r, dr);
+   }
 
-   } else if (s.file == MRF && (s.nr & BRW_MRF_COMPR4)) {
-      return regions_overlap(s, ds, r, dr);
+   return !((r.nr * REG_SIZE + r.offset + dr) <= (s.nr * REG_SIZE + s.offset) ||
+            (s.nr * REG_SIZE + s.offset + ds) <= (r.nr * REG_SIZE + r.offset));
+}
 
-   } else {
-      return reg_space(r) == reg_space(s) &&
-             !(reg_offset(r) + dr <= reg_offset(s) ||
+/**
+ * Return whether the register region starting at \p r and spanning \p dr
+ * bytes could potentially overlap the register region starting at \p s and
+ * spanning \p ds bytes.
+ */
+static inline bool
+regions_overlap(const fs_reg &r, unsigned dr, const fs_reg &s, unsigned ds)
+{
+   if (r.file != s.file)
+      return false;
+
+   if (r.file == VGRF) {
+      return r.nr == s.nr &&
+             !(r.offset + dr <= s.offset || s.offset + ds <= r.offset);
+   } else if (r.file != MRF) {
+      return !(reg_offset(r) + dr <= reg_offset(s) ||
                reg_offset(s) + ds <= reg_offset(r));
+   } else {
+      return regions_overlap_MRF(r, dr, s, ds);
    }
 }
 
-- 
2.7.4