From 37ee462e036b9b3bd90bc2b50fc4b05ac9a63560 Mon Sep 17 00:00:00 2001
From: Ian Romanick <ian.d.romanick@intel.com>
Date: Wed, 27 Feb 2019 19:52:12 -0800
Subject: [PATCH] nir/algebraic: Fix up extract_[iu]8 after loop unrolling
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

Skylake, Broadwell, and Haswell had similar results. (Skylake shown)
total instructions in shared programs: 15256840 -> 15256837 (<.01%)
instructions in affected programs: 4713 -> 4710 (-0.06%)
helped: 3
HURT: 0
helped stats (abs) min: 1 max: 1 x̄: 1.00 x̃: 1
helped stats (rel) min: 0.06% max: 0.08% x̄: 0.06% x̃: 0.06%

total cycles in shared programs: 372286583 -> 372286583 (0.00%)
cycles in affected programs: 198516 -> 198516 (0.00%)
helped: 1
HURT: 1
helped stats (abs) min: 10 max: 10 x̄: 10.00 x̃: 10
helped stats (rel) min: <.01% max: <.01% x̄: <.01% x̃: <.01%
HURT stats (abs)   min: 10 max: 10 x̄: 10.00 x̃: 10
HURT stats (rel)   min: 0.01% max: 0.01% x̄: 0.01% x̃: 0.01%

No changes on any other Intel platform.

v2: Use a loop to generate patterns.  Suggested by Jason.

Reviewed-by: Matt Turner <mattst88@gmail.com> [v1]
Reviewed-by: Dylan Baker <dylan@pnwbakers.com>
Acked-by: Jason Ekstrand <jason@jlekstrand.net>
---
 src/compiler/nir/nir_opt_algebraic.py | 22 ++++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/src/compiler/nir/nir_opt_algebraic.py b/src/compiler/nir/nir_opt_algebraic.py
index 5b2e7ee..ac6e5b9 100644
--- a/src/compiler/nir/nir_opt_algebraic.py
+++ b/src/compiler/nir/nir_opt_algebraic.py
@@ -614,8 +614,26 @@ optimizations = [
    (('ishr', 'a@32', 24), ('extract_i8', a, 3), '!options->lower_extract_byte'),
    (('iand', 0xff, ('ushr', a, 16)), ('extract_u8', a, 2), '!options->lower_extract_byte'),
    (('iand', 0xff, ('ushr', a,  8)), ('extract_u8', a, 1), '!options->lower_extract_byte'),
-   (('iand', 0xff, a), ('extract_u8', a, 0), '!options->lower_extract_byte'),
+   (('iand', 0xff, a), ('extract_u8', a, 0), '!options->lower_extract_byte')
+]
+
+# The ('extract_u8', a, 0) pattern, above, can trigger while the shift count
+# still depends on a loop induction variable.  Once the loop is unrolled,
+# constant folding leaves byte 0 of (a >> 8*i); rewrite it as byte i of a.
+for op in ('ushr', 'ishr'):
+   optimizations.extend([(('extract_u8', (op, 'a@16',  8),     0), ('extract_u8', a, 1))])
+   optimizations.extend([(('extract_u8', (op, 'a@32',  8 * i), 0), ('extract_u8', a, i)) for i in range(1, 4)])
+   optimizations.extend([(('extract_u8', (op, 'a@64',  8 * i), 0), ('extract_u8', a, i)) for i in range(1, 8)])
+
+optimizations.extend([(('extract_u8', ('extract_u16', a, 1), 0), ('extract_u8', a, 2))])
 
+# The ('extract_[iu]8', a, 3) patterns, above, can trigger while the shift
+# count depends on a loop induction variable.  After unrolling and constant
+# folding, byte 3 of (a << (24 - 8*i)) is byte i of a; keep op's signedness.
+for op in ('extract_u8', 'extract_i8'):
+   optimizations.extend([((op, ('ishl', 'a@32', 24 - 8 * i), 3), (op, a, i)) for i in range(2, -1, -1)])
+
+optimizations.extend([
     # Word extraction
    (('ushr', ('ishl', 'a@32', 16), 16), ('extract_u16', a, 0), '!options->lower_extract_word'),
    (('ushr', 'a@32', 16), ('extract_u16', a, 1), '!options->lower_extract_word'),
@@ -798,7 +816,7 @@ optimizations = [
      'options->lower_unpack_snorm_4x8'),
 
    (('isign', a), ('imin', ('imax', a, -1), 1), 'options->lower_isign'),
-]
+])
 
 # bit_size dependent lowerings
 for bit_size in [8, 16, 32, 64]:
-- 
2.7.4