From 0079a375a58b288caacc2721f5a34b8f1233e7d1 Mon Sep 17 00:00:00 2001 From: Ben Skeggs Date: Fri, 9 May 2014 15:55:47 +1000 Subject: [PATCH] nvc0: allow for easier modification of compiler library routines Signed-off-by: Ben Skeggs Reviewed-by: Ilia Mirkin --- src/gallium/drivers/nouveau/codegen/lib/Makefile | 10 + src/gallium/drivers/nouveau/codegen/lib/gf100.asm | 107 ++++ .../drivers/nouveau/codegen/lib/gf100.asm.h | 63 +++ .../codegen/{target_lib_nve4.asm => lib/gk104.asm} | 163 +++--- .../drivers/nouveau/codegen/lib/gk104.asm.h | 598 +++++++++++++++++++++ src/gallium/drivers/nouveau/codegen/lib/gk110.asm | 98 ++++ .../drivers/nouveau/codegen/lib/gk110.asm.h | 81 +++ .../nouveau/codegen/nv50_ir_target_nvc0.cpp | 24 +- .../drivers/nouveau/codegen/target_lib_nvc0.asm | 96 ---- .../drivers/nouveau/codegen/target_lib_nvc0.asm.h | 112 ---- .../drivers/nouveau/codegen/target_lib_nve4.asm.h | 592 -------------------- .../drivers/nouveau/codegen/target_lib_nvf0.asm | 86 --- .../drivers/nouveau/codegen/target_lib_nvf0.asm.h | 84 --- 13 files changed, 1057 insertions(+), 1057 deletions(-) create mode 100644 src/gallium/drivers/nouveau/codegen/lib/Makefile create mode 100644 src/gallium/drivers/nouveau/codegen/lib/gf100.asm create mode 100644 src/gallium/drivers/nouveau/codegen/lib/gf100.asm.h rename src/gallium/drivers/nouveau/codegen/{target_lib_nve4.asm => lib/gk104.asm} (88%) create mode 100644 src/gallium/drivers/nouveau/codegen/lib/gk104.asm.h create mode 100644 src/gallium/drivers/nouveau/codegen/lib/gk110.asm create mode 100644 src/gallium/drivers/nouveau/codegen/lib/gk110.asm.h delete mode 100644 src/gallium/drivers/nouveau/codegen/target_lib_nvc0.asm delete mode 100644 src/gallium/drivers/nouveau/codegen/target_lib_nvc0.asm.h delete mode 100644 src/gallium/drivers/nouveau/codegen/target_lib_nve4.asm.h delete mode 100644 src/gallium/drivers/nouveau/codegen/target_lib_nvf0.asm delete mode 100644 src/gallium/drivers/nouveau/codegen/target_lib_nvf0.asm.h diff --git a/src/gallium/drivers/nouveau/codegen/lib/Makefile b/src/gallium/drivers/nouveau/codegen/lib/Makefile new file mode 100644 index 0000000..28a41a3 --- /dev/null +++ b/src/gallium/drivers/nouveau/codegen/lib/Makefile @@ -0,0 +1,10 @@ +ENVYAS ?= envyas + +all: gf100.asm.h gk104.asm.h gk110.asm.h + +gf100.asm.h: %.asm.h: %.asm + $(ENVYAS) -a -W -mnvc0 -Vnvc0 $< -o $@ +gk104.asm.h: %.asm.h: %.asm + $(ENVYAS) -a -W -mnvc0 -Vnve4 $< -o $@ +gk110.asm.h: %.asm.h: %.asm + $(ENVYAS) -a -W -mgk110 $< -o $@ diff --git a/src/gallium/drivers/nouveau/codegen/lib/gf100.asm b/src/gallium/drivers/nouveau/codegen/lib/gf100.asm new file mode 100644 index 0000000..cf393b1 --- /dev/null +++ b/src/gallium/drivers/nouveau/codegen/lib/gf100.asm @@ -0,0 +1,107 @@ +.section #gf100_builtin_code +// DIV U32 +// +// UNR recurrence (q = a / b): +// look for z such that 2^32 - b <= b * z < 2^32 +// then q - 1 <= (a * z) / 2^32 <= q +// +// INPUT: $r0: dividend, $r1: divisor +// OUTPUT: $r0: result, $r1: modulus +// CLOBBER: $r2 - $r3, $p0 - $p1 +// SIZE: 22 / 14 * 8 bytes +// +gf100_div_u32: + bfind u32 $r2 $r1 + xor b32 $r2 $r2 0x1f + mov b32 $r3 0x1 + shl b32 $r2 $r3 clamp $r2 + cvt u32 $r1 neg u32 $r1 + mul $r3 u32 $r1 u32 $r2 + add $r2 (mul high u32 $r2 u32 $r3) $r2 + mul $r3 u32 $r1 u32 $r2 + add $r2 (mul high u32 $r2 u32 $r3) $r2 + mul $r3 u32 $r1 u32 $r2 + add $r2 (mul high u32 $r2 u32 $r3) $r2 + mul $r3 u32 $r1 u32 $r2 + add $r2 (mul high u32 $r2 u32 $r3) $r2 + mul $r3 u32 $r1 u32 $r2 + add $r2 (mul high u32 $r2 u32 $r3) $r2 + mov b32 $r3 $r0 + mul high $r0 u32 $r0 u32 $r2 + cvt u32 $r2 neg u32 $r1 + add $r1 (mul u32 $r1 u32 $r0) $r3 + set $p0 0x1 ge u32 $r1 $r2 + $p0 sub b32 $r1 $r1 $r2 + $p0 add b32 $r0 $r0 0x1 + $p0 set $p0 0x1 ge u32 $r1 $r2 + $p0 sub b32 $r1 $r1 $r2 + $p0 add b32 $r0 $r0 0x1 + ret + +// DIV S32, like DIV U32 after taking ABS(inputs) +// +// INPUT: $r0: dividend, $r1: divisor +// OUTPUT: $r0: result, $r1: modulus +// CLOBBER: $r2 - $r3, $p0 - $p3 +// +gf100_div_s32: + set $p2 0x1 lt s32 $r0 0x0 + set $p3 0x1 lt s32 $r1 0x0 xor $p2 + cvt s32 $r0 abs s32 $r0 + cvt s32 $r1 abs s32 $r1 + bfind u32 $r2 $r1 + xor b32 $r2 $r2 0x1f + mov b32 $r3 0x1 + shl b32 $r2 $r3 clamp $r2 + cvt u32 $r1 neg u32 $r1 + mul $r3 u32 $r1 u32 $r2 + add $r2 (mul high u32 $r2 u32 $r3) $r2 + mul $r3 u32 $r1 u32 $r2 + add $r2 (mul high u32 $r2 u32 $r3) $r2 + mul $r3 u32 $r1 u32 $r2 + add $r2 (mul high u32 $r2 u32 $r3) $r2 + mul $r3 u32 $r1 u32 $r2 + add $r2 (mul high u32 $r2 u32 $r3) $r2 + mul $r3 u32 $r1 u32 $r2 + add $r2 (mul high u32 $r2 u32 $r3) $r2 + mov b32 $r3 $r0 + mul high $r0 u32 $r0 u32 $r2 + cvt u32 $r2 neg u32 $r1 + add $r1 (mul u32 $r1 u32 $r0) $r3 + set $p0 0x1 ge u32 $r1 $r2 + $p0 sub b32 $r1 $r1 $r2 + $p0 add b32 $r0 $r0 0x1 + $p0 set $p0 0x1 ge u32 $r1 $r2 + $p0 sub b32 $r1 $r1 $r2 + $p0 add b32 $r0 $r0 0x1 + $p3 cvt s32 $r0 neg s32 $r0 + $p2 cvt s32 $r1 neg s32 $r1 + ret + +// RCP F64: Newton Raphson reciprocal(x): r_{i+1} = r_i * (2.0 - x * r_i) +// +// INPUT: $r0d (x) +// OUTPUT: $r0d (rcp(x)) +// CLOBBER: $r2 - $r7 +// SIZE: 9 * 8 bytes +// +gf100_rcp_f64: + nop + ret + +// RSQ F64: Newton Raphson rsqrt(x): r_{i+1} = r_i * (1.5 - 0.5 * x * r_i * r_i) +// +// INPUT: $r0d (x) +// OUTPUT: $r0d (rsqrt(x)) +// CLOBBER: $r2 - $r7 +// SIZE: 14 * 8 bytes +// +gf100_rsq_f64: + nop + ret + +.section #gf100_builtin_offsets +.b64 #gf100_div_u32 +.b64 #gf100_div_s32 +.b64 #gf100_rcp_f64 +.b64 #gf100_rsq_f64 diff --git a/src/gallium/drivers/nouveau/codegen/lib/gf100.asm.h b/src/gallium/drivers/nouveau/codegen/lib/gf100.asm.h new file mode 100644 index 0000000..00fe5ea --- /dev/null +++ b/src/gallium/drivers/nouveau/codegen/lib/gf100.asm.h @@ -0,0 +1,63 @@ +uint64_t gf100_builtin_code[] = { +/* 0x0000: gf100_div_u32 */ + 0x7800000004009c03, + 0x0010dd187c209cdd, + 0x6000000008309c03, + 0x0810dc2a05605c18, + 0x200400000c209c43, + 0x500000000810dc03, + 0x200400000c209c43, + 0x500000000810dc03, + 0x200400000c209c43, + 0x500000000810dc03, + 0x200400000c209c43, + 0x500000000810dc03, + 0x200400000c209c43, + 0x280000000000dde4, + 0x5000000008001c43, + 0x0010430d05609c18, + 0x1b0e00000811dc03, + 0x4800000008104103, + 0x0800000004000002, + 0x1b0e00000811c003, + 0x4800000008104103, + 0x90001dff040000ac, +/* 0x00b0: gf100_div_s32 */ + 0x188e0000fc05dc23, + 0x18c40000fc17dc23, + 0x07305e1803301e18, + 0x7800000004009c03, + 0x0010dd187c209cdd, + 0x6000000008309c03, + 0x0810dc2a05605c18, + 0x200400000c209c43, + 0x500000000810dc03, + 0x200400000c209c43, + 0x500000000810dc03, + 0x200400000c209c43, + 0x500000000810dc03, + 0x200400000c209c43, + 0x500000000810dc03, + 0x200400000c209c43, + 0x280000000000dde4, + 0x5000000008001c43, + 0x0010430d05609c18, + 0x1b0e00000811dc03, + 0x4800000008104103, + 0x0800000004000002, + 0x1b0e00000811c003, + 0x4800000008104103, + 0x01700e18040000ac, + 0x90001dff05704a18, +/* 0x0180: gf100_rcp_f64 */ + 0x90001dff00001c08, +/* 0x0188: gf100_rsq_f64 */ + 0x90001dff00001c08, +}; + +uint64_t gf100_builtin_offsets[] = { + 0x0000000000000000, + 0x00000000000000b0, + 0x0000000000000180, + 0x0000000000000188, +}; diff --git a/src/gallium/drivers/nouveau/codegen/target_lib_nve4.asm b/src/gallium/drivers/nouveau/codegen/lib/gk104.asm similarity index 88% rename from src/gallium/drivers/nouveau/codegen/target_lib_nve4.asm rename to src/gallium/drivers/nouveau/codegen/lib/gk104.asm index 5adc9ff..cd65b54 100644 --- a/src/gallium/drivers/nouveau/codegen/target_lib_nve4.asm +++ b/src/gallium/drivers/nouveau/codegen/lib/gk104.asm @@ -1,4 +1,4 @@ -// +.section #gk104_builtin_code // DIV U32 // // UNR recurrence (q = a / b): @@ -10,81 +10,83 @@ // CLOBBER: $r2 - $r3, $p0 - $p1 // SIZE: 22 / 14 * 8 bytes // -sched 0x28 0x4 0x28 0x4 0x28 0x28 0x28 -bfind u32 $r2 $r1 -long xor b32 $r2 $r2 0x1f -long mov b32 $r3 0x1 -shl b32 $r2 $r3 clamp $r2 -long cvt u32 $r1 neg u32 $r1 -long mul $r3 u32 $r1 u32 $r2 -add $r2 (mul high u32 $r2 u32 $r3) $r2 -sched 0x28 0x28 0x28 0x28 0x28 0x28 0x28 -mul $r3 u32 $r1 u32 $r2 -add $r2 (mul high u32 $r2 u32 $r3) $r2 -mul $r3 u32 $r1 u32 $r2 -add $r2 (mul high u32 $r2 u32 $r3) $r2 -mul $r3 u32 $r1 u32 $r2 -add $r2 (mul high u32 $r2 u32 $r3) $r2 -mul $r3 u32 $r1 u32 $r2 -sched 0x4 0x28 0x4 0x28 0x28 0x2c 0x4 -add $r2 (mul high u32 $r2 u32 $r3) $r2 -mov b32 $r3 $r0 -mul high $r0 u32 $r0 u32 $r2 -long cvt u32 $r2 neg u32 $r1 -long add $r1 (mul u32 $r1 u32 $r0) $r3 -set $p0 0x1 ge u32 $r1 $r2 -$p0 sub b32 $r1 $r1 $r2 -sched 0x28 0x2c 0x4 0x20 0x2e 0x28 0x20 -$p0 add b32 $r0 $r0 0x1 -$p0 set $p0 0x1 ge u32 $r1 $r2 -$p0 sub b32 $r1 $r1 $r2 -$p0 add b32 $r0 $r0 0x1 -long ret -// +gk104_div_u32: + sched 0x28 0x4 0x28 0x4 0x28 0x28 0x28 + bfind u32 $r2 $r1 + long xor b32 $r2 $r2 0x1f + long mov b32 $r3 0x1 + shl b32 $r2 $r3 clamp $r2 + long cvt u32 $r1 neg u32 $r1 + long mul $r3 u32 $r1 u32 $r2 + add $r2 (mul high u32 $r2 u32 $r3) $r2 + sched 0x28 0x28 0x28 0x28 0x28 0x28 0x28 + mul $r3 u32 $r1 u32 $r2 + add $r2 (mul high u32 $r2 u32 $r3) $r2 + mul $r3 u32 $r1 u32 $r2 + add $r2 (mul high u32 $r2 u32 $r3) $r2 + mul $r3 u32 $r1 u32 $r2 + add $r2 (mul high u32 $r2 u32 $r3) $r2 + mul $r3 u32 $r1 u32 $r2 + sched 0x4 0x28 0x4 0x28 0x28 0x2c 0x4 + add $r2 (mul high u32 $r2 u32 $r3) $r2 + mov b32 $r3 $r0 + mul high $r0 u32 $r0 u32 $r2 + long cvt u32 $r2 neg u32 $r1 + long add $r1 (mul u32 $r1 u32 $r0) $r3 + set $p0 0x1 ge u32 $r1 $r2 + $p0 sub b32 $r1 $r1 $r2 + sched 0x28 0x2c 0x4 0x20 0x2e 0x28 0x20 + $p0 add b32 $r0 $r0 0x1 + $p0 set $p0 0x1 ge u32 $r1 $r2 + $p0 sub b32 $r1 $r1 $r2 + $p0 add b32 $r0 $r0 0x1 + long ret + // DIV S32, like DIV U32 after taking ABS(inputs) // // INPUT: $r0: dividend, $r1: divisor // OUTPUT: $r0: result, $r1: modulus // CLOBBER: $r2 - $r3, $p0 - $p3 // -set $p2 0x1 lt s32 $r0 0x0 -set $p3 0x1 lt s32 $r1 0x0 xor $p2 -sched 0x20 0x28 0x28 0x4 0x28 0x04 0x28 -long cvt s32 $r0 abs s32 $r0 -long cvt s32 $r1 abs s32 $r1 -bfind u32 $r2 $r1 -long xor b32 $r2 $r2 0x1f -long mov b32 $r3 0x1 -shl b32 $r2 $r3 clamp $r2 -cvt u32 $r1 neg u32 $r1 -sched 0x28 0x28 0x28 0x28 0x28 0x28 0x28 -mul $r3 u32 $r1 u32 $r2 -add $r2 (mul high u32 $r2 u32 $r3) $r2 -mul $r3 u32 $r1 u32 $r2 -add $r2 (mul high u32 $r2 u32 $r3) $r2 -mul $r3 u32 $r1 u32 $r2 -add $r2 (mul high u32 $r2 u32 $r3) $r2 -mul $r3 u32 $r1 u32 $r2 -sched 0x28 0x28 0x4 0x28 0x04 0x28 0x28 -add $r2 (mul high u32 $r2 u32 $r3) $r2 -mul $r3 u32 $r1 u32 $r2 -add $r2 (mul high u32 $r2 u32 $r3) $r2 -mov b32 $r3 $r0 -mul high $r0 u32 $r0 u32 $r2 -long cvt u32 $r2 neg u32 $r1 -long add $r1 (mul u32 $r1 u32 $r0) $r3 -sched 0x2c 0x04 0x28 0x2c 0x04 0x28 0x20 -set $p0 0x1 ge u32 $r1 $r2 -$p0 sub b32 $r1 $r1 $r2 -$p0 add b32 $r0 $r0 0x1 -$p0 set $p0 0x1 ge u32 $r1 $r2 -$p0 sub b32 $r1 $r1 $r2 -long $p0 add b32 $r0 $r0 0x1 -long $p3 cvt s32 $r0 neg s32 $r0 -sched 0x04 0x2e 0x04 0x28 0x04 0x20 0x2c -$p2 cvt s32 $r1 neg s32 $r1 -long ret -// +gk104_div_s32: + set $p2 0x1 lt s32 $r0 0x0 + set $p3 0x1 lt s32 $r1 0x0 xor $p2 + sched 0x20 0x28 0x28 0x4 0x28 0x04 0x28 + long cvt s32 $r0 abs s32 $r0 + long cvt s32 $r1 abs s32 $r1 + bfind u32 $r2 $r1 + long xor b32 $r2 $r2 0x1f + long mov b32 $r3 0x1 + shl b32 $r2 $r3 clamp $r2 + cvt u32 $r1 neg u32 $r1 + sched 0x28 0x28 0x28 0x28 0x28 0x28 0x28 + mul $r3 u32 $r1 u32 $r2 + add $r2 (mul high u32 $r2 u32 $r3) $r2 + mul $r3 u32 $r1 u32 $r2 + add $r2 (mul high u32 $r2 u32 $r3) $r2 + mul $r3 u32 $r1 u32 $r2 + add $r2 (mul high u32 $r2 u32 $r3) $r2 + mul $r3 u32 $r1 u32 $r2 + sched 0x28 0x28 0x4 0x28 0x04 0x28 0x28 + add $r2 (mul high u32 $r2 u32 $r3) $r2 + mul $r3 u32 $r1 u32 $r2 + add $r2 (mul high u32 $r2 u32 $r3) $r2 + mov b32 $r3 $r0 + mul high $r0 u32 $r0 u32 $r2 + long cvt u32 $r2 neg u32 $r1 + long add $r1 (mul u32 $r1 u32 $r0) $r3 + sched 0x2c 0x04 0x28 0x2c 0x04 0x28 0x20 + set $p0 0x1 ge u32 $r1 $r2 + $p0 sub b32 $r1 $r1 $r2 + $p0 add b32 $r0 $r0 0x1 + $p0 set $p0 0x1 ge u32 $r1 $r2 + $p0 sub b32 $r1 $r1 $r2 + long $p0 add b32 $r0 $r0 0x1 + long $p3 cvt s32 $r0 neg s32 $r0 + sched 0x04 0x2e 0x04 0x28 0x04 0x20 0x2c + $p2 cvt s32 $r1 neg s32 $r1 + long ret + // SULDP [for each format] // $r4d: address // $r2: surface info (format) @@ -542,7 +544,8 @@ $p1 suldgb b32 $r3 cv zero u8 g[$r4d] $r2 $p0 long mov b32 $r3 0x3f800000 long nop long ret -// + + // RCP F64: Newton Raphson reciprocal(x): r_{i+1} = r_i * (2.0 - x * r_i) // // INPUT: $r0d (x) @@ -550,8 +553,10 @@ long ret // CLOBBER: $r2 - $r7 // SIZE: 9 * 8 bytes // -long nop -long ret +gk104_rcp_f64: + long nop + long ret + // RSQ F64: Newton Raphson rsqrt(x): r_{i+1} = r_i * (1.5 - 0.5 * x * r_i * r_i) // // INPUT: $r0d (x) @@ -559,8 +564,10 @@ long ret // CLOBBER: $r2 - $r7 // SIZE: 14 * 8 bytes // -long nop -long ret +gk104_rsq_f64: + long nop + long ret + // // Trap handler. // Requires at least 4 GPRs and 32 bytes of l[] memory to temporarily save GPRs. @@ -696,3 +703,9 @@ bpt pause 0x0 mov $flags $r2 mask 0xffff ld b128 $r0q cs l[0x00] rtt + +.section #gk104_builtin_offsets +.b64 #gk104_div_u32 +.b64 #gk104_div_s32 +.b64 #gk104_rcp_f64 +.b64 #gk104_rsq_f64 diff --git a/src/gallium/drivers/nouveau/codegen/lib/gk104.asm.h b/src/gallium/drivers/nouveau/codegen/lib/gk104.asm.h new file mode 100644 index 0000000..3799876 --- /dev/null +++ b/src/gallium/drivers/nouveau/codegen/lib/gk104.asm.h @@ -0,0 +1,598 @@ +uint64_t gk104_builtin_code[] = { +/* 0x0000: gk104_div_u32 */ + 0x2282828042804287, + 0x7800000004009c03, + 0x380000007c209c82, + 0x180000000400dde2, + 0x6000000008309c03, + 0x1c00000005205d04, + 0x500000000810dc03, + 0x200400000c209c43, + 0x2282828282828287, + 0x500000000810dc03, + 0x200400000c209c43, + 0x500000000810dc03, + 0x200400000c209c43, + 0x500000000810dc03, + 0x200400000c209c43, + 0x500000000810dc03, + 0x2042c28280428047, + 0x200400000c209c43, + 0x280000000000dde4, + 0x5000000008001c43, + 0x1c00000005209d04, + 0x2006000000105c03, + 0x1b0e00000811dc03, + 0x4800000008104103, + 0x220282e20042c287, + 0x0800000004000002, + 0x1b0e00000811c003, + 0x4800000008104103, + 0x0800000004000002, + 0x9000000000001de7, +/* 0x00f0: gk104_div_s32 */ + 0x188e0000fc05dc23, + 0x18c40000fc17dc23, + 0x2280428042828207, + 0x1c00000001201ec4, + 0x1c00000005205ec4, + 0x7800000004009c03, + 0x380000007c209c82, + 0x180000000400dde2, + 0x6000000008309c03, + 0x1c00000005205d04, + 0x2282828282828287, + 0x500000000810dc03, + 0x200400000c209c43, + 0x500000000810dc03, + 0x200400000c209c43, + 0x500000000810dc03, + 0x200400000c209c43, + 0x500000000810dc03, + 0x2282804280428287, + 0x200400000c209c43, + 0x500000000810dc03, + 0x200400000c209c43, + 0x280000000000dde4, + 0x5000000008001c43, + 0x1c00000005209d04, + 0x2006000000105c03, + 0x22028042c28042c7, + 0x1b0e00000811dc03, + 0x4800000008104103, + 0x0800000004000002, + 0x1b0e00000811c003, + 0x4800000008104103, + 0x0800000004000002, + 0x1c00000001200f84, + 0x22c200428042e047, + 0x1c00000005204b84, + 0x9000000000001de7, + 0xd4004000084004c5, + 0x0c5400000013dc04, + 0xd4004000084009c5, + 0xd4004000084007c5, + 0x9000000000001de7, + 0x2000000000000007, + 0xd4004000084004c5, + 0x0c5400000013dc04, + 0xd4004000084009c5, + 0xd4004000084007c5, + 0x1900000004a0dc04, + 0x1800000004a09c04, + 0x30de0001d030dc02, + 0x2000000000000007, + 0x1900000000a05c04, + 0x30de0001d0209c02, + 0x1800000000a01c04, + 0x30de0001d0105c02, + 0x30de0001d0001c02, + 0x9000000000001de7, + 0xd4004000084004a5, + 0x2000000000000007, + 0x0c5400000013dc04, + 0xd4004000084009a5, + 0xd4004000084007a5, + 0x1900000004a0de04, + 0x1800000004a09e04, + 0x30e000061c30dc02, + 0x1900000000a05e04, + 0x2000000000000007, + 0x30e000061c209c02, + 0x1800000000a01e04, + 0x30e000061c105c02, + 0x30e000061c001c02, + 0x9000000000001de7, + 0xd4004000084004a5, + 0x0c5400000013dc04, + 0x2000000000000007, + 0xd4004000084009a5, + 0xd4004000084007a5, + 0x1d00000004a0de84, + 0x1c00000004a09e84, + 0x1d00000000a05e84, + 0x1c00000000a01e84, + 0x9000000000001de7, + 0x2000000000000007, + 0xd4004000084004a5, + 0x0c5400000013dc04, + 0xd4004000084009a5, + 0xd4004000084007a5, + 0x1d00000004a0dc04, + 0x1c00000004a09c04, + 0x1d00000000a05c04, + 0x2000000000000007, + 0x1c00000000a01c04, + 0x9000000000001de7, + 0xd4004000084004a5, + 0x0c5400000013dc04, + 0xd4004000084009a5, + 0xd4004000084007a5, + 0x1100000004a0dc04, + 0x2000000000000007, + 0x1000000004a09c04, + 0x1100000000a05c04, + 0x1000000000a01c04, + 0x9000000000001de7, + 0xd4004000084004a5, + 0x0c5400000013dc04, + 0xd4004000084009a5, + 0x2000000000000007, + 0xd4004000084007a5, + 0x1800000000009de2, + 0x18fe00000000dde2, + 0x9000000000001de7, + 0xd4004000084004a5, + 0x0c5400000013dc04, + 0xd4004000084009a5, + 0x2000000000000007, + 0xd4004000084007a5, + 0x1800000000009de2, + 0x180000000400dde2, + 0x9000000000001de7, + 0xd400400008400485, + 0x0c5400000013dc04, + 0xd400400008400985, + 0x2000000000000007, + 0xd400400008400785, + 0x7000c02828005c03, + 0x18fe00000000dde2, + 0x7000c02850009c03, + 0x3800000ffc001c02, + 0x1800000008a09c04, + 0x1800000004a05c04, + 0x2000000000000007, + 0x30ea00801c209c02, + 0x1800000000a01c04, + 0x30ea00801c105c02, + 0x30ea00801c001c02, + 0x9000000000001de7, + 0xd400400008400485, + 0x0c5400000013dc04, + 0x2000000000000007, + 0xd400400008400985, + 0xd400400008400785, + 0x7000c02828005c03, + 0x180000000400dde2, + 0x7000c02850009c03, + 0x3800000ffc001c02, + 0x9000000000001de7, + 0x2000000000000007, + 0xd400400008400485, + 0x0c5400000013dc04, + 0xd400400008400985, + 0xd400400008400785, + 0x198000000020dc04, + 0x1900000000209c04, + 0x30ee02020430dc02, + 0x2000000000000007, + 0x1880000000205c04, + 0x30ee020204209c02, + 0x1800000000201c04, + 0x30ee020204105c02, + 0x30ee020204001c02, + 0x9000000000001de7, + 0xd400400008400485, + 0x2000000000000007, + 0x0c5400000013dc04, + 0xd400400008400985, + 0xd400400008400785, + 0x198000000020de04, + 0x1900000000209e04, + 0x30f004081030dc02, + 0x1880000000205e04, + 0x2000000000000007, + 0x30f0040810209c02, + 0x1800000000201e04, + 0x30f0040810105c02, + 0x30f0040810001c02, + 0x9000000000001de7, + 0xd400400008400485, + 0x0c5400000013dc04, + 0x2000000000000007, + 0xd400400008400985, + 0xd400400008400785, + 0x1d8000000020de84, + 0x1d00000000209e84, + 0x1c80000000205e84, + 0x1c00000000201e84, + 0x9000000000001de7, + 0x2000000000000007, + 0xd400400008400485, + 0x0c5400000013dc04, + 0xd400400008400985, + 0xd400400008400785, + 0x1d8000000020dc04, + 0x1d00000000209c04, + 0x1c80000000205c04, + 0x2000000000000007, + 0x1c00000000201c04, + 0x9000000000001de7, + 0xd400400008400445, + 0x0c5400000013dc04, + 0xd400400008400945, + 0xd400400008400745, + 0x7000c01814005c03, + 0x2000000000000007, + 0x18fe00000000dde2, + 0x7000c0142c009c03, + 0x380000007c001c02, + 0x1800000008209c04, + 0x1800000004205c04, + 0x30f4108420209c02, + 0x1800000000201c04, + 0x2000000000000007, + 0x30f2082084105c02, + 0x30f4108420001c02, + 0x9000000000001de7, + 0xd400400008400445, + 0x0c5400000013dc04, + 0xd400400008400945, + 0xd400400008400745, + 0x2000000000000007, + 0x7000c01414005c03, + 0x7000c01428009c03, + 0x380000007c001c02, + 0x18fe00000000dde2, + 0x1800000008209c04, + 0x1800000004205c04, + 0x1800000000201c04, + 0x2000000000000007, + 0x30f4108420209c02, + 0x30f4108420105c02, + 0x30f4108420001c02, + 0x9000000000001de7, + 0xd400400008400485, + 0x0c5400000013dc04, + 0xd400400008400985, + 0x2000000000000007, + 0xd400400008400785, + 0x1900000000a05c04, + 0x1800000000a01c04, + 0x30de0001d0105c02, + 0x30de0001d0001c02, + 0x1800000000009de2, + 0x18fe00000000dde2, + 0x2000000000000007, + 0x9000000000001de7, + 0xd400400008400485, + 0x0c5400000013dc04, + 0xd400400008400985, + 0xd400400008400785, + 0x18fe00000000dde2, + 0x1900000000a05e04, + 0x2000000000000007, + 0x1800000000009de2, + 0x1800000000a01e04, + 0x30e000061c105c02, + 0x30e000061c001c02, + 0x9000000000001de7, + 0xd400400008400485, + 0x0c5400000013dc04, + 0x2000000000000007, + 0xd400400008400985, + 0xd400400008400785, + 0x180000000400dde2, + 0x1d00000000a05e84, + 0x1800000000009de2, + 0x1c00000000a01e84, + 0x9000000000001de7, + 0x2000000000000007, + 0xd400400008400485, + 0x0c5400000013dc04, + 0xd400400008400985, + 0xd400400008400785, + 0x180000000400dde2, + 0x1d00000000a05c04, + 0x1800000000009de2, + 0x2000000000000007, + 0x1c00000000a01c04, + 0x9000000000001de7, + 0xd400400008400485, + 0x0c5400000013dc04, + 0xd400400008400985, + 0xd400400008400785, + 0x18fe00000000dde2, + 0x2000000000000007, + 0x1100000000a05c04, + 0x1800000000009de2, + 0x1000000000a01c04, + 0x9000000000001de7, + 0xd400400008400485, + 0x0c5400000013dc04, + 0xd400400008400985, + 0x2000000000000007, + 0xd400400008400785, + 0x18fe00000000dde2, + 0x1800000000009de2, + 0x1800000000005de2, + 0x9000000000001de7, + 0xd400400008400485, + 0x0c5400000013dc04, + 0x2000000000000007, + 0xd400400008400985, + 0xd400400008400785, + 0x180000000400dde2, + 0x1800000000009de2, + 0x1800000000005de2, + 0x9000000000001de7, + 0xd400400008400445, + 0x2000000000000007, + 0x0c5400000013dc04, + 0xd400400008400945, + 0xd400400008400745, + 0x18fe00000000dde2, + 0x1880000000205c04, + 0x1800000000009de2, + 0x1800000000201c04, + 0x2000000000000007, + 0x30ee020204105c02, + 0x30ee020204001c02, + 0x9000000000001de7, + 0xd400400008400445, + 0x0c5400000013dc04, + 0xd400400008400945, + 0xd400400008400745, + 0x2000000000000007, + 0x18fe00000000dde2, + 0x1880000000205e04, + 0x1800000000009de2, + 0x1800000000201e04, + 0x30f0040810105c02, + 0x30f0040810001c02, + 0x9000000000001de7, + 0x2000000000000007, + 0xd400400008400445, + 0x0c5400000013dc04, + 0xd400400008400945, + 0xd400400008400745, + 0x180000000400dde2, + 0x1c80000000205c04, + 0x1800000000009de2, + 0x2000000000000007, + 0x1c00000000201c04, + 0x9000000000001de7, + 0xd400400008400445, + 0x0c5400000013dc04, + 0xd400400008400945, + 0xd400400008400745, + 0x180000000400dde2, + 0x2000000000000007, + 0x1c80000000205e84, + 0x1800000000009de2, + 0x1c00000000201e84, + 0x9000000000001de7, + 0xd400400008400445, + 0x0c5400000013dc04, + 0xd400400008400945, + 0x2000000000000007, + 0xd400400008400745, + 0x18fe00000000dde2, + 0x1800000000a01c04, + 0x1800000000009de2, + 0x1800000000005de2, + 0x30de0001d0001c02, + 0x9000000000001de7, + 0x2000000000000007, + 0xd400400008400445, + 0x0c5400000013dc04, + 0xd400400008400945, + 0xd400400008400745, + 0x18fe00000000dde2, + 0x1800000000a01e04, + 0x1800000000009de2, + 0x2000000000000007, + 0x1800000000005de2, + 0x30e000061c001c02, + 0x9000000000001de7, + 0xd400400008400465, + 0x0c5400000013dc04, + 0xd400400008400965, + 0xd400400008400765, + 0x2000000000000007, + 0x180000000400dde2, + 0x1800000000009de2, + 0x1800000000005de2, + 0x9000000000001de7, + 0xd400400008400445, + 0x0c5400000013dc04, + 0xd400400008400945, + 0x2000000000000007, + 0xd400400008400745, + 0x180000000400dde2, + 0x1800000000009de2, + 0x1800000000005de2, + 0x9000000000001de7, + 0xd400400008400445, + 0x0c5400000013dc04, + 0x2000000000000007, + 0xd400400008400945, + 0xd400400008400745, + 0x18fe00000000dde2, + 0x1800000000009de2, + 0x1000000000a01c04, + 0x1800000000005de2, + 0x9000000000001de7, + 0x2000000000000007, + 0xd400400008400405, + 0x0c5400000013dc04, + 0xd400400008400905, + 0xd400400008400705, + 0x18fe00000000dde2, + 0x1800000000201c04, + 0x1800000000009de2, + 0x2000000000000007, + 0x30ee020204001c02, + 0x1800000000005de2, + 0x9000000000001de7, + 0xd400400008400405, + 0x0c5400000013dc04, + 0xd400400008400905, + 0xd400400008400705, + 0x2000000000000007, + 0x18fe00000000dde2, + 0x1800000000201e04, + 0x1800000000009de2, + 0x30f0040810001c02, + 0x1800000000005de2, + 0x9000000000001de7, + 0xd400400008400425, + 0x2000000000000007, + 0x0c5400000013dc04, + 0xd400400008400925, + 0xd400400008400725, + 0x180000000400dde2, + 0x1800000000009de2, + 0x1800000000005de2, + 0x9000000000001de7, + 0x2000000000000007, + 0xd400400008400405, + 0x0c5400000013dc04, + 0xd400400008400905, + 0xd400400008400705, + 0x180000000400dde2, + 0x1800000000009de2, + 0x1800000000005de2, + 0x2000000000000007, + 0x9000000000001de7, + 0xd40040000840c485, + 0x0c5400000013dc04, + 0xd40040000840c985, + 0xd40040000840c785, + 0x18fe00000000dde2, + 0x4000000000001de4, + 0x9000000000001de7, +/* 0x0f08: gk104_rcp_f64 */ + 0x4000000000001de4, + 0x9000000000001de7, +/* 0x0f18: gk104_rsq_f64 */ + 0x4000000000001de4, + 0x9000000000001de7, + 0xc800000003f01cc5, + 0x2c00000100005c04, + 0x2c0000010800dc04, + 0x3000c3fffff09c04, + 0x680100000c1fdc03, + 0x4000000a60001c47, + 0x180000004000dde2, +/* 0x0f60: spill_cfstack */ + 0x78000009c0000007, + 0x0c0000000430dd02, + 0x4003ffffa0001ca7, + 0x2800406400001de4, + 0x2800406410005de4, + 0x180000000400dde2, + 0x547e18000000dd05, + 0x60000008e0000007, + 0x190ec0000431dc03, + 0x40000000000001f4, + 0x94000004c0009c85, + 0x2c00000100009c04, + 0x2c0000010800dc04, + 0x9400000020009ca5, + 0x9400000100011cc5, + 0x9400000140021cc5, + 0x9400000180031cc5, + 0x94000001c0041cc5, + 0x9400000200051cc5, + 0x9400000240061cc5, + 0x9400000280071cc5, + 0x94000002c0081cc5, + 0x9400000300091cc5, + 0x94000003400a1cc5, + 0x94000003800b1cc5, + 0x94000003c00c1cc5, + 0x94000004000d1cc5, + 0x94000004400e1cc5, + 0x94000004800f1cc5, + 0xc000000003f09ea5, + 0x94000000c0009ca5, + 0xc000000023f09ea5, + 0x94000000e0009ca5, + 0x2c00000084009c04, + 0x2c0000008800dc04, + 0x9400000040009ca5, + 0x2c0000008c009c04, + 0x2c0000009400dc04, + 0x9400000060009ca5, + 0x2c00000098009c04, + 0x2c0000009c00dc04, + 0x9400000080009ca5, + 0x2c000000c800dc04, + 0x0c0000001030dd02, + 0x4000000100001ea7, + 0x480100000c001c03, + 0x0800000000105c42, +/* 0x10d8: shared_loop */ + 0xc100000000309c85, + 0x9400000500009c85, + 0x0c00000010001d02, + 0x0800000000105d42, + 0x0c0000001030dd02, + 0x4003ffff40001ca7, +/* 0x1108: shared_done */ + 0x2800406420001de4, + 0x2800406430005de4, + 0xe000000000001c45, + 0xd000000003ffdcc5, + 0x9c000000000fdcc5, + 0x2c0000000c009c04, + 0x7000c0205020dc03, + 0x7000c01820209c03, + 0x5000406450209c03, + 0x500040644030dc03, + 0x480000000c209c03, + 0x4801000008001c03, + 0x0800000000105c42, +/* 0x1170: search_cstack */ + 0x280040646000dde4, + 0x8400000020009f05, + 0x190ec0002821dc03, + 0x40000000800001e7, + 0x0c00000040001c02, + 0x0800000000105c42, + 0x0c0000004030dd02, + 0x00029dff0ffc5cbf, +/* 0x11b0: entry_found */ + 0x8400000000009f85, + 0x2800406400001de4, + 0x2800406410005de4, + 0x9400000010009c85, + 0x4000000000001df4, +/* 0x11d8: end_exit */ + 0x9800000003ffdcc5, + 0xd000000000008007, + 0xa000000000004007, +/* 0x11f0: end_cont */ + 0xd000000000008007, + 0x3400c3fffc201c04, + 0xc000000003f01ec5, + 0xa000000000000007, +}; + +uint64_t gk104_builtin_offsets[] = { + 0x0000000000000000, + 0x00000000000000f0, + 0x0000000000000f08, + 0x0000000000000f18, +}; diff --git a/src/gallium/drivers/nouveau/codegen/lib/gk110.asm b/src/gallium/drivers/nouveau/codegen/lib/gk110.asm new file mode 100644 index 0000000..be17871 --- /dev/null +++ b/src/gallium/drivers/nouveau/codegen/lib/gk110.asm @@ -0,0 +1,98 @@ +.section #gk110_builtin_code +// DIV U32 +// +// UNR recurrence (q = a / b): +// look for z such that 2^32 - b <= b * z < 2^32 +// then q - 1 <= (a * z) / 2^32 <= q +// +// INPUT: $r0: dividend, $r1: divisor +// OUTPUT: $r0: result, $r1: modulus +// CLOBBER: $r2 - $r3, $p0 - $p1 +// SIZE: 22 / 14 * 8 bytes +// +gk110_div_u32: + sched 0x28282804280428 + bfind u32 $r2 $r1 + xor b32 $r2 $r2 0x1f + mov b32 $r3 0x1 + shl b32 $r2 $r3 clamp $r2 + cvt u32 $r1 neg u32 $r1 + mul $r3 u32 $r1 u32 $r2 + add $r2 (mul high u32 $r2 u32 $r3) $r2 + sched 0x28282828282828 + mul $r3 u32 $r1 u32 $r2 + add $r2 (mul high u32 $r2 u32 $r3) $r2 + mul $r3 u32 $r1 u32 $r2 + add $r2 (mul high u32 $r2 u32 $r3) $r2 + mul $r3 u32 $r1 u32 $r2 + add $r2 (mul high u32 $r2 u32 $r3) $r2 + mul $r3 u32 $r1 u32 $r2 + sched 0x042c2828042804 + add $r2 (mul high u32 $r2 u32 $r3) $r2 + mov b32 $r3 $r0 + mul high $r0 u32 $r0 u32 $r2 + cvt u32 $r2 neg u32 $r1 + add $r1 (mul u32 $r1 u32 $r0) $r3 + set $p0 0x1 ge u32 $r1 $r2 + $p0 sub b32 $r1 $r1 $r2 + sched 0x20282e20042c28 + $p0 add b32 $r0 $r0 0x1 + $p0 set $p0 0x1 ge u32 $r1 $r2 + $p0 sub b32 $r1 $r1 $r2 + $p0 add b32 $r0 $r0 0x1 + ret + +// DIV S32, like DIV U32 after taking ABS(inputs) +// +// INPUT: $r0: dividend, $r1: divisor +// OUTPUT: $r0: result, $r1: modulus +// CLOBBER: $r2 - $r3, $p0 - $p3 +// +gk110_div_s32: + set $p2 0x1 lt s32 $r0 0x0 + set $p3 0x1 lt s32 $r1 0x0 xor $p2 + sched 0x28042804282820 + cvt s32 $r0 abs s32 $r0 + cvt s32 $r1 abs s32 $r1 + bfind u32 $r2 $r1 + xor b32 $r2 $r2 0x1f + mov b32 $r3 0x1 + shl b32 $r2 $r3 clamp $r2 + cvt u32 $r1 neg u32 $r1 + sched 0x28282828282828 + mul $r3 u32 $r1 u32 $r2 + add $r2 (mul high u32 $r2 u32 $r3) $r2 + mul $r3 u32 $r1 u32 $r2 + add $r2 (mul high u32 $r2 u32 $r3) $r2 + mul $r3 u32 $r1 u32 $r2 + add $r2 (mul high u32 $r2 u32 $r3) $r2 + mul $r3 u32 $r1 u32 $r2 + sched 0x28280428042828 + add $r2 (mul high u32 $r2 u32 $r3) $r2 + mul $r3 u32 $r1 u32 $r2 + add $r2 (mul high u32 $r2 u32 $r3) $r2 + mov b32 $r3 $r0 + mul high $r0 u32 $r0 u32 $r2 + cvt u32 $r2 neg u32 $r1 + add $r1 (mul u32 $r1 u32 $r0) $r3 + sched 0x2028042c28042c + set $p0 0x1 ge u32 $r1 $r2 + $p0 sub b32 $r1 $r1 $r2 + $p0 add b32 $r0 $r0 0x1 + $p0 set $p0 0x1 ge u32 $r1 $r2 + $p0 sub b32 $r1 $r1 $r2 + $p0 add b32 $r0 $r0 0x1 + $p3 cvt s32 $r0 neg s32 $r0 + sched 0x2c200428042e04 + $p2 cvt s32 $r1 neg s32 $r1 + ret + +gk110_rcp_f64: +gk110_rsq_f64: + ret + +.section #gk110_builtin_offsets +.b64 #gk110_div_u32 +.b64 #gk110_div_s32 +.b64 #gk110_rcp_f64 +.b64 #gk110_rsq_f64 diff --git a/src/gallium/drivers/nouveau/codegen/lib/gk110.asm.h b/src/gallium/drivers/nouveau/codegen/lib/gk110.asm.h new file mode 100644 index 0000000..8d00e2a --- /dev/null +++ b/src/gallium/drivers/nouveau/codegen/lib/gk110.asm.h @@ -0,0 +1,81 @@ +uint64_t gk110_builtin_code[] = { +/* 0x0000: gk110_div_u32 */ + 0x08a0a0a010a010a0, + 0xe1800000009c000a, + 0x220000000f9c0808, + 0x74000000009fc00e, + 0xe2400000011c0c0a, + 0xe6010000009c2806, + 0xe1c00000011c040e, + 0xd2000800019c080a, + 0x08a0a0a0a0a0a0a0, + 0xe1c00000011c040e, + 0xd2000800019c080a, + 0xe1c00000011c040e, + 0xd2000800019c080a, + 0xe1c00000011c040e, + 0xd2000800019c080a, + 0xe1c00000011c040e, + 0x0810b0a0a010a010, + 0xd2000800019c080a, + 0xe4c03c00001c000e, + 0xe1c00400011c0002, + 0xe6010000009c280a, + 0xd0000c00001c0406, + 0xdb601c00011c041e, + 0xe088000001000406, + 0x0880a0b88010b0a0, + 0x4000000000800001, + 0xdb601c000100041e, + 0xe088000001000406, + 0x4000000000800001, + 0x19000000001c003c, +/* 0x00f0: gk110_div_s32 */ + 0xdb181c007f9c005e, + 0xdb1a08007f9c047e, + 0x08a010a010a0a080, + 0xe6100000001ce802, + 0xe6100000009ce806, + 0xe1800000009c000a, + 0x220000000f9c0808, + 0x74000000009fc00e, + 0xe2400000011c0c0a, + 0xe6010000009c2806, + 0x08a0a0a0a0a0a0a0, + 0xe1c00000011c040e, + 0xd2000800019c080a, + 0xe1c00000011c040e, + 0xd2000800019c080a, + 0xe1c00000011c040e, + 0xd2000800019c080a, + 0xe1c00000011c040e, + 0x08a0a010a010a0a0, + 0xd2000800019c080a, + 0xe1c00000011c040e, + 0xd2000800019c080a, + 0xe4c03c00001c000e, + 0xe1c00400011c0002, + 0xe6010000009c280a, + 0xd0000c00001c0406, + 0x0880a010b0a010b0, + 0xdb601c00011c041e, + 0xe088000001000406, + 0x4000000000800001, + 0xdb601c000100041e, + 0xe088000001000406, + 0x4000000000800001, + 0xe6010000000ce802, + 0x08b08010a010b810, + 0xe60100000088e806, + 0x19000000001c003c, +/* 0x0218: gk110_rcp_f64 */ +/* 0x0218: gk110_rsq_f64 */ + 0x19000000001c003c, +}; + +uint64_t gk110_builtin_offsets[] = { + 0x0000000000000000, + 0x00000000000000f0, + 0x0000000000000218, + 0x0000000000000218, +}; diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp index 395d5b5..adf2df8 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp @@ -39,26 +39,26 @@ TargetNVC0::TargetNVC0(unsigned int card) : Target(false, card >= 0xe4) // lazyness -> will just hardcode everything for the time being -#include "target_lib_nvc0.asm.h" -#include "target_lib_nve4.asm.h" -#include "target_lib_nvf0.asm.h" +#include "lib/gf100.asm.h" +#include "lib/gk104.asm.h" +#include "lib/gk110.asm.h" void TargetNVC0::getBuiltinCode(const uint32_t **code, uint32_t *size) const { switch (chipset & ~0xf) { case 0xe0: - *code = (const uint32_t *)&nve4_builtin_code[0]; - *size = sizeof(nve4_builtin_code); + *code = (const uint32_t *)&gk104_builtin_code[0]; + *size = sizeof(gk104_builtin_code); break; case 0xf0: case 0x100: - *code = (const uint32_t *)&nvf0_builtin_code[0]; - *size = sizeof(nvf0_builtin_code); + *code = (const uint32_t *)&gk110_builtin_code[0]; + *size = sizeof(gk110_builtin_code); break; default: - *code = (const uint32_t *)&nvc0_builtin_code[0]; - *size = sizeof(nvc0_builtin_code); + *code = (const uint32_t *)&gf100_builtin_code[0]; + *size = sizeof(gf100_builtin_code); break; } } @@ -70,12 +70,12 @@ TargetNVC0::getBuiltinOffset(int builtin) const switch (chipset & ~0xf) { case 0xe0: - return nve4_builtin_offsets[builtin]; + return gk104_builtin_offsets[builtin]; case 0xf0: case 0x100: - return nvf0_builtin_offsets[builtin]; + return gk110_builtin_offsets[builtin]; default: - return nvc0_builtin_offsets[builtin]; + return gf100_builtin_offsets[builtin]; } } diff --git a/src/gallium/drivers/nouveau/codegen/target_lib_nvc0.asm b/src/gallium/drivers/nouveau/codegen/target_lib_nvc0.asm deleted file mode 100644 index f40becc..0000000 --- a/src/gallium/drivers/nouveau/codegen/target_lib_nvc0.asm +++ /dev/null @@ -1,96 +0,0 @@ -// -// DIV U32 -// -// UNR recurrence (q = a / b): -// look for z such that 2^32 - b <= b * z < 2^32 -// then q - 1 <= (a * z) / 2^32 <= q -// -// INPUT: $r0: dividend, $r1: divisor -// OUTPUT: $r0: result, $r1: modulus -// CLOBBER: $r2 - $r3, $p0 - $p1 -// SIZE: 22 / 14 * 8 bytes -// -bfind u32 $r2 $r1 -xor b32 $r2 $r2 0x1f -mov b32 $r3 0x1 -shl b32 $r2 $r3 clamp $r2 -cvt u32 $r1 neg u32 $r1 -mul $r3 u32 $r1 u32 $r2 -add $r2 (mul high u32 $r2 u32 $r3) $r2 -mul $r3 u32 $r1 u32 $r2 -add $r2 (mul high u32 $r2 u32 $r3) $r2 -mul $r3 u32 $r1 u32 $r2 -add $r2 (mul high u32 $r2 u32 $r3) $r2 -mul $r3 u32 $r1 u32 $r2 -add $r2 (mul high u32 $r2 u32 $r3) $r2 -mul $r3 u32 $r1 u32 $r2 -add $r2 (mul high u32 $r2 u32 $r3) $r2 -mov b32 $r3 $r0 -mul high $r0 u32 $r0 u32 $r2 -cvt u32 $r2 neg u32 $r1 -add $r1 (mul u32 $r1 u32 $r0) $r3 -set $p0 0x1 ge u32 $r1 $r2 -$p0 sub b32 $r1 $r1 $r2 -$p0 add b32 $r0 $r0 0x1 -$p0 set $p0 0x1 ge u32 $r1 $r2 -$p0 sub b32 $r1 $r1 $r2 -$p0 add b32 $r0 $r0 0x1 -ret -// -// DIV S32, like DIV U32 after taking ABS(inputs) -// -// INPUT: $r0: dividend, $r1: divisor -// OUTPUT: $r0: result, $r1: modulus -// CLOBBER: $r2 - $r3, $p0 - $p3 -// -set $p2 0x1 lt s32 $r0 0x0 -set $p3 0x1 lt s32 $r1 0x0 xor $p2 -cvt s32 $r0 abs s32 $r0 -cvt s32 $r1 abs s32 $r1 -bfind u32 $r2 $r1 -xor b32 $r2 $r2 0x1f -mov b32 $r3 0x1 -shl b32 $r2 $r3 clamp $r2 -cvt u32 $r1 neg u32 $r1 -mul $r3 u32 $r1 u32 $r2 -add $r2 (mul high u32 $r2 u32 $r3) $r2 -mul $r3 u32 $r1 u32 $r2 -add $r2 (mul high u32 $r2 u32 $r3) $r2 -mul $r3 u32 $r1 u32 $r2 -add $r2 (mul high u32 $r2 u32 $r3) $r2 -mul $r3 u32 $r1 u32 $r2 -add $r2 (mul high u32 $r2 u32 $r3) $r2 -mul $r3 u32 $r1 u32 $r2 -add $r2 (mul high u32 $r2 u32 $r3) $r2 -mov b32 $r3 $r0 -mul high $r0 u32 $r0 u32 $r2 -cvt u32 $r2 neg u32 $r1 -add $r1 (mul u32 $r1 u32 $r0) $r3 -set $p0 0x1 ge u32 $r1 $r2 -$p0 sub b32 $r1 $r1 $r2 -$p0 add b32 $r0 $r0 0x1 -$p0 set $p0 0x1 ge u32 $r1 $r2 -$p0 sub b32 $r1 $r1 $r2 -$p0 add b32 $r0 $r0 0x1 -$p3 cvt s32 $r0 neg s32 $r0 -$p2 cvt s32 $r1 neg s32 $r1 -ret -// -// RCP F64: Newton Raphson reciprocal(x): r_{i+1} = r_i * (2.0 - x * r_i) -// -// INPUT: $r0d (x) -// OUTPUT: $r0d (rcp(x)) -// CLOBBER: $r2 - $r7 -// SIZE: 9 * 8 bytes -// -nop -ret -// RSQ F64: Newton Raphson rsqrt(x): r_{i+1} = r_i * (1.5 - 0.5 * x * r_i * r_i) -// -// INPUT: $r0d (x) -// OUTPUT: $r0d (rsqrt(x)) -// CLOBBER: $r2 - $r7 -// SIZE: 14 * 8 bytes -// -nop -ret diff --git a/src/gallium/drivers/nouveau/codegen/target_lib_nvc0.asm.h b/src/gallium/drivers/nouveau/codegen/target_lib_nvc0.asm.h deleted file mode 100644 index 3790504..0000000 --- a/src/gallium/drivers/nouveau/codegen/target_lib_nvc0.asm.h +++ /dev/null @@ -1,112 +0,0 @@ - -static const uint32_t nvc0_builtin_code[] = -{ - 0x04009c03, - 0x78000000, - 0x7c209cdd, - 0x0010dd18, - 0x08309c03, - 0x60000000, - 0x05605c18, - 0x0810dc2a, - 0x0c209c43, - 0x20040000, - 0x0810dc03, - 0x50000000, - 0x0c209c43, - 0x20040000, - 0x0810dc03, - 0x50000000, - 0x0c209c43, - 0x20040000, - 0x0810dc03, - 0x50000000, - 0x0c209c43, - 0x20040000, - 0x0810dc03, - 0x50000000, - 0x0c209c43, - 0x20040000, - 0x0000dde4, - 0x28000000, - 0x08001c43, - 0x50000000, - 0x05609c18, - 0x0010430d, - 0x0811dc03, - 0x1b0e0000, - 0x08104103, - 0x48000000, - 0x04000002, - 0x08000000, - 0x0811c003, - 0x1b0e0000, - 0x08104103, - 0x48000000, - 0x040000ac, - 0x90001dff, - 0xfc05dc23, - 0x188e0000, - 0xfc17dc23, - 0x18c40000, - 0x03301e18, - 0x07305e18, - 0x04009c03, - 0x78000000, - 0x7c209cdd, - 0x0010dd18, - 0x08309c03, - 0x60000000, - 0x05605c18, - 0x0810dc2a, - 0x0c209c43, - 0x20040000, - 0x0810dc03, - 0x50000000, - 0x0c209c43, - 0x20040000, - 0x0810dc03, - 0x50000000, - 0x0c209c43, - 0x20040000, - 0x0810dc03, - 0x50000000, - 0x0c209c43, - 0x20040000, - 0x0810dc03, - 0x50000000, - 0x0c209c43, - 0x20040000, - 0x0000dde4, - 0x28000000, - 0x08001c43, - 0x50000000, - 0x05609c18, - 0x0010430d, - 0x0811dc03, - 0x1b0e0000, - 0x08104103, - 0x48000000, - 0x04000002, - 0x08000000, - 0x0811c003, - 0x1b0e0000, - 0x08104103, - 0x48000000, - 0x040000ac, - 0x01700e18, - 0x05704a18, - 0x90001dff, - 0x00001c08, - 0x90001dff, - 0x00001c08, - 0x90001dff, -}; - -static const uint16_t nvc0_builtin_offsets[NVC0_BUILTIN_COUNT] = -{ - 0x0000, - 0x00b0, - 0x0180, - 0x0188 -}; diff --git a/src/gallium/drivers/nouveau/codegen/target_lib_nve4.asm.h b/src/gallium/drivers/nouveau/codegen/target_lib_nve4.asm.h deleted file mode 100644 index 53fa12c..0000000 --- a/src/gallium/drivers/nouveau/codegen/target_lib_nve4.asm.h +++ /dev/null @@ -1,592 +0,0 @@ - -// Assembled from target_lib_nve4.asm by envyas -m nvc0 -V nve4 -W. - -static const uint64_t nve4_builtin_code[] = -{ - 0x2282828042804287ULL, - 0x7800000004009c03ULL, - 0x380000007c209c82ULL, - 0x180000000400dde2ULL, - 0x6000000008309c03ULL, - 0x1c00000005205d04ULL, - 0x500000000810dc03ULL, - 0x200400000c209c43ULL, - 0x2282828282828287ULL, - 0x500000000810dc03ULL, - 0x200400000c209c43ULL, - 0x500000000810dc03ULL, - 0x200400000c209c43ULL, - 0x500000000810dc03ULL, - 0x200400000c209c43ULL, - 0x500000000810dc03ULL, - 0x2042c28280428047ULL, - 0x200400000c209c43ULL, - 0x280000000000dde4ULL, - 0x5000000008001c43ULL, - 0x1c00000005209d04ULL, - 0x2006000000105c03ULL, - 0x1b0e00000811dc03ULL, - 0x4800000008104103ULL, - 0x220282e20042c287ULL, - 0x0800000004000002ULL, - 0x1b0e00000811c003ULL, - 0x4800000008104103ULL, - 0x0800000004000002ULL, - 0x9000000000001de7ULL, - 0x188e0000fc05dc23ULL, - 0x18c40000fc17dc23ULL, - 0x2280428042828207ULL, - 0x1c00000001201ec4ULL, - 0x1c00000005205ec4ULL, - 0x7800000004009c03ULL, - 0x380000007c209c82ULL, - 0x180000000400dde2ULL, - 0x6000000008309c03ULL, - 0x1c00000005205d04ULL, - 0x2282828282828287ULL, - 0x500000000810dc03ULL, - 0x200400000c209c43ULL, - 0x500000000810dc03ULL, - 0x200400000c209c43ULL, - 0x500000000810dc03ULL, - 0x200400000c209c43ULL, - 0x500000000810dc03ULL, - 0x2282804280428287ULL, - 0x200400000c209c43ULL, - 0x500000000810dc03ULL, - 0x200400000c209c43ULL, - 0x280000000000dde4ULL, - 0x5000000008001c43ULL, - 0x1c00000005209d04ULL, - 0x2006000000105c03ULL, - 0x22028042c28042c7ULL, - 0x1b0e00000811dc03ULL, - 0x4800000008104103ULL, - 0x0800000004000002ULL, - 0x1b0e00000811c003ULL, - 0x4800000008104103ULL, - 0x0800000004000002ULL, - 0x1c00000001200f84ULL, - 0x22c200428042e047ULL, - 0x1c00000005204b84ULL, - 0x9000000000001de7ULL, - 0xd4004000084004c5ULL, - 0x0c5400000013dc04ULL, - 0xd4004000084009c5ULL, - 0xd4004000084007c5ULL, - 0x9000000000001de7ULL, - 0x2000000000000007ULL, - 0xd4004000084004c5ULL, - 0x0c5400000013dc04ULL, - 0xd4004000084009c5ULL, - 0xd4004000084007c5ULL, - 0x1900000004a0dc04ULL, - 0x1800000004a09c04ULL, - 0x30de0001d030dc02ULL, - 0x2000000000000007ULL, - 0x1900000000a05c04ULL, - 0x30de0001d0209c02ULL, - 0x1800000000a01c04ULL, - 0x30de0001d0105c02ULL, - 0x30de0001d0001c02ULL, - 0x9000000000001de7ULL, - 0xd4004000084004a5ULL, - 0x2000000000000007ULL, - 0x0c5400000013dc04ULL, - 0xd4004000084009a5ULL, - 0xd4004000084007a5ULL, - 0x1900000004a0de04ULL, - 0x1800000004a09e04ULL, - 0x30e000061c30dc02ULL, - 0x1900000000a05e04ULL, - 0x2000000000000007ULL, - 0x30e000061c209c02ULL, - 0x1800000000a01e04ULL, - 0x30e000061c105c02ULL, - 0x30e000061c001c02ULL, - 0x9000000000001de7ULL, - 0xd4004000084004a5ULL, - 0x0c5400000013dc04ULL, - 0x2000000000000007ULL, - 0xd4004000084009a5ULL, - 0xd4004000084007a5ULL, - 0x1d00000004a0de84ULL, - 0x1c00000004a09e84ULL, - 0x1d00000000a05e84ULL, - 0x1c00000000a01e84ULL, - 0x9000000000001de7ULL, - 0x2000000000000007ULL, - 0xd4004000084004a5ULL, - 0x0c5400000013dc04ULL, - 0xd4004000084009a5ULL, - 0xd4004000084007a5ULL, - 0x1d00000004a0dc04ULL, - 0x1c00000004a09c04ULL, - 0x1d00000000a05c04ULL, - 0x2000000000000007ULL, - 0x1c00000000a01c04ULL, - 0x9000000000001de7ULL, - 0xd4004000084004a5ULL, - 0x0c5400000013dc04ULL, - 0xd4004000084009a5ULL, - 0xd4004000084007a5ULL, - 0x1100000004a0dc04ULL, - 0x2000000000000007ULL, - 0x1000000004a09c04ULL, - 0x1100000000a05c04ULL, - 0x1000000000a01c04ULL, - 0x9000000000001de7ULL, - 0xd4004000084004a5ULL, - 0x0c5400000013dc04ULL, - 0xd4004000084009a5ULL, - 0x2000000000000007ULL, - 0xd4004000084007a5ULL, - 0x1800000000009de2ULL, - 0x18fe00000000dde2ULL, - 0x9000000000001de7ULL, - 0xd4004000084004a5ULL, - 0x0c5400000013dc04ULL, - 0xd4004000084009a5ULL, - 0x2000000000000007ULL, - 0xd4004000084007a5ULL, - 0x1800000000009de2ULL, - 0x180000000400dde2ULL, - 0x9000000000001de7ULL, - 0xd400400008400485ULL, - 0x0c5400000013dc04ULL, - 0xd400400008400985ULL, - 0x2000000000000007ULL, - 0xd400400008400785ULL, - 0x7000c02828005c03ULL, - 0x18fe00000000dde2ULL, - 0x7000c02850009c03ULL, - 0x3800000ffc001c02ULL, - 0x1800000008a09c04ULL, - 0x1800000004a05c04ULL, - 0x2000000000000007ULL, - 0x30ea00801c209c02ULL, - 0x1800000000a01c04ULL, - 0x30ea00801c105c02ULL, - 0x30ea00801c001c02ULL, - 0x9000000000001de7ULL, - 0xd400400008400485ULL, - 0x0c5400000013dc04ULL, - 0x2000000000000007ULL, - 0xd400400008400985ULL, - 0xd400400008400785ULL, - 0x7000c02828005c03ULL, - 0x180000000400dde2ULL, - 0x7000c02850009c03ULL, - 0x3800000ffc001c02ULL, - 0x9000000000001de7ULL, - 0x2000000000000007ULL, - 0xd400400008400485ULL, - 0x0c5400000013dc04ULL, - 0xd400400008400985ULL, - 0xd400400008400785ULL, - 0x198000000020dc04ULL, - 0x1900000000209c04ULL, - 0x30ee02020430dc02ULL, - 0x2000000000000007ULL, - 0x1880000000205c04ULL, - 0x30ee020204209c02ULL, - 0x1800000000201c04ULL, - 0x30ee020204105c02ULL, - 0x30ee020204001c02ULL, - 0x9000000000001de7ULL, - 0xd400400008400485ULL, - 0x2000000000000007ULL, - 0x0c5400000013dc04ULL, - 0xd400400008400985ULL, - 0xd400400008400785ULL, - 0x198000000020de04ULL, - 0x1900000000209e04ULL, - 0x30f004081030dc02ULL, - 0x1880000000205e04ULL, - 0x2000000000000007ULL, - 0x30f0040810209c02ULL, - 0x1800000000201e04ULL, - 0x30f0040810105c02ULL, - 0x30f0040810001c02ULL, - 0x9000000000001de7ULL, - 0xd400400008400485ULL, - 0x0c5400000013dc04ULL, - 0x2000000000000007ULL, - 0xd400400008400985ULL, - 0xd400400008400785ULL, - 0x1d8000000020de84ULL, - 0x1d00000000209e84ULL, - 0x1c80000000205e84ULL, - 0x1c00000000201e84ULL, - 0x9000000000001de7ULL, - 0x2000000000000007ULL, - 0xd400400008400485ULL, - 0x0c5400000013dc04ULL, - 0xd400400008400985ULL, - 0xd400400008400785ULL, - 0x1d8000000020dc04ULL, - 0x1d00000000209c04ULL, - 0x1c80000000205c04ULL, - 0x2000000000000007ULL, - 0x1c00000000201c04ULL, - 0x9000000000001de7ULL, - 0xd400400008400445ULL, - 0x0c5400000013dc04ULL, - 0xd400400008400945ULL, - 0xd400400008400745ULL, - 0x7000c01814005c03ULL, - 0x2000000000000007ULL, - 0x18fe00000000dde2ULL, - 0x7000c0142c009c03ULL, - 0x380000007c001c02ULL, - 0x1800000008209c04ULL, - 0x1800000004205c04ULL, - 0x30f4108420209c02ULL, - 0x1800000000201c04ULL, - 0x2000000000000007ULL, - 0x30f2082084105c02ULL, - 0x30f4108420001c02ULL, - 0x9000000000001de7ULL, - 0xd400400008400445ULL, - 0x0c5400000013dc04ULL, - 0xd400400008400945ULL, - 0xd400400008400745ULL, - 0x2000000000000007ULL, - 0x7000c01414005c03ULL, - 0x7000c01428009c03ULL, - 0x380000007c001c02ULL, - 0x18fe00000000dde2ULL, - 0x1800000008209c04ULL, - 0x1800000004205c04ULL, - 0x1800000000201c04ULL, - 0x2000000000000007ULL, - 0x30f4108420209c02ULL, - 0x30f4108420105c02ULL, - 0x30f4108420001c02ULL, - 0x9000000000001de7ULL, - 0xd400400008400485ULL, - 0x0c5400000013dc04ULL, - 0xd400400008400985ULL, - 0x2000000000000007ULL, - 0xd400400008400785ULL, - 0x1900000000a05c04ULL, - 0x1800000000a01c04ULL, - 0x30de0001d0105c02ULL, - 0x30de0001d0001c02ULL, - 0x1800000000009de2ULL, - 0x18fe00000000dde2ULL, - 0x2000000000000007ULL, - 0x9000000000001de7ULL, - 0xd400400008400485ULL, - 0x0c5400000013dc04ULL, - 0xd400400008400985ULL, - 0xd400400008400785ULL, - 0x18fe00000000dde2ULL, - 0x1900000000a05e04ULL, - 0x2000000000000007ULL, - 0x1800000000009de2ULL, - 0x1800000000a01e04ULL, - 0x30e000061c105c02ULL, - 0x30e000061c001c02ULL, - 0x9000000000001de7ULL, - 0xd400400008400485ULL, - 0x0c5400000013dc04ULL, - 0x2000000000000007ULL, - 0xd400400008400985ULL, - 0xd400400008400785ULL, - 0x180000000400dde2ULL, - 0x1d00000000a05e84ULL, - 0x1800000000009de2ULL, - 0x1c00000000a01e84ULL, - 0x9000000000001de7ULL, - 0x2000000000000007ULL, - 0xd400400008400485ULL, - 0x0c5400000013dc04ULL, - 0xd400400008400985ULL, - 0xd400400008400785ULL, - 0x180000000400dde2ULL, - 0x1d00000000a05c04ULL, - 0x1800000000009de2ULL, - 0x2000000000000007ULL, - 0x1c00000000a01c04ULL, - 0x9000000000001de7ULL, - 0xd400400008400485ULL, - 0x0c5400000013dc04ULL, - 0xd400400008400985ULL, - 0xd400400008400785ULL, - 0x18fe00000000dde2ULL, - 0x2000000000000007ULL, - 0x1100000000a05c04ULL, - 0x1800000000009de2ULL, - 0x1000000000a01c04ULL, - 0x9000000000001de7ULL, - 0xd400400008400485ULL, - 0x0c5400000013dc04ULL, - 0xd400400008400985ULL, - 0x2000000000000007ULL, - 0xd400400008400785ULL, - 0x18fe00000000dde2ULL, - 0x1800000000009de2ULL, - 0x1800000000005de2ULL, - 0x9000000000001de7ULL, - 0xd400400008400485ULL, - 0x0c5400000013dc04ULL, - 0x2000000000000007ULL, - 0xd400400008400985ULL, - 0xd400400008400785ULL, - 0x180000000400dde2ULL, - 0x1800000000009de2ULL, - 0x1800000000005de2ULL, - 0x9000000000001de7ULL, - 0xd400400008400445ULL, - 0x2000000000000007ULL, - 0x0c5400000013dc04ULL, - 0xd400400008400945ULL, - 0xd400400008400745ULL, - 0x18fe00000000dde2ULL, - 0x1880000000205c04ULL, - 0x1800000000009de2ULL, - 0x1800000000201c04ULL, - 0x2000000000000007ULL, - 0x30ee020204105c02ULL, - 0x30ee020204001c02ULL, - 0x9000000000001de7ULL, - 0xd400400008400445ULL, - 0x0c5400000013dc04ULL, - 0xd400400008400945ULL, - 0xd400400008400745ULL, - 0x2000000000000007ULL, - 0x18fe00000000dde2ULL, - 0x1880000000205e04ULL, - 0x1800000000009de2ULL, - 0x1800000000201e04ULL, - 0x30f0040810105c02ULL, - 0x30f0040810001c02ULL, - 0x9000000000001de7ULL, - 0x2000000000000007ULL, - 0xd400400008400445ULL, - 0x0c5400000013dc04ULL, - 0xd400400008400945ULL, - 0xd400400008400745ULL, - 0x180000000400dde2ULL, - 0x1c80000000205c04ULL, - 0x1800000000009de2ULL, - 0x2000000000000007ULL, - 0x1c00000000201c04ULL, - 0x9000000000001de7ULL, - 0xd400400008400445ULL, - 0x0c5400000013dc04ULL, - 0xd400400008400945ULL, - 0xd400400008400745ULL, - 0x180000000400dde2ULL, - 0x2000000000000007ULL, - 0x1c80000000205e84ULL, - 0x1800000000009de2ULL, - 0x1c00000000201e84ULL, - 0x9000000000001de7ULL, - 0xd400400008400445ULL, - 0x0c5400000013dc04ULL, - 0xd400400008400945ULL, - 0x2000000000000007ULL, - 0xd400400008400745ULL, - 0x18fe00000000dde2ULL, - 0x1800000000a01c04ULL, - 0x1800000000009de2ULL, - 0x1800000000005de2ULL, - 0x30de0001d0001c02ULL, - 0x9000000000001de7ULL, - 0x2000000000000007ULL, - 0xd400400008400445ULL, - 0x0c5400000013dc04ULL, - 0xd400400008400945ULL, - 0xd400400008400745ULL, - 0x18fe00000000dde2ULL, - 0x1800000000a01e04ULL, - 0x1800000000009de2ULL, - 0x2000000000000007ULL, - 0x1800000000005de2ULL, - 0x30e000061c001c02ULL, - 0x9000000000001de7ULL, - 0xd400400008400465ULL, - 0x0c5400000013dc04ULL, - 0xd400400008400965ULL, - 0xd400400008400765ULL, - 0x2000000000000007ULL, - 0x180000000400dde2ULL, - 0x1800000000009de2ULL, - 0x1800000000005de2ULL, - 0x9000000000001de7ULL, - 0xd400400008400445ULL, - 0x0c5400000013dc04ULL, - 0xd400400008400945ULL, - 0x2000000000000007ULL, - 0xd400400008400745ULL, - 0x180000000400dde2ULL, - 0x1800000000009de2ULL, - 0x1800000000005de2ULL, - 0x9000000000001de7ULL, - 0xd400400008400445ULL, - 0x0c5400000013dc04ULL, - 0x2000000000000007ULL, - 0xd400400008400945ULL, - 0xd400400008400745ULL, - 0x18fe00000000dde2ULL, - 0x1800000000009de2ULL, - 0x1000000000a01c04ULL, - 0x1800000000005de2ULL, - 0x9000000000001de7ULL, - 0x2000000000000007ULL, - 0xd400400008400405ULL, - 0x0c5400000013dc04ULL, - 0xd400400008400905ULL, - 0xd400400008400705ULL, - 0x18fe00000000dde2ULL, - 0x1800000000201c04ULL, - 0x1800000000009de2ULL, - 0x2000000000000007ULL, - 0x30ee020204001c02ULL, - 0x1800000000005de2ULL, - 0x9000000000001de7ULL, - 0xd400400008400405ULL, - 0x0c5400000013dc04ULL, - 0xd400400008400905ULL, - 0xd400400008400705ULL, - 0x2000000000000007ULL, - 0x18fe00000000dde2ULL, - 0x1800000000201e04ULL, - 0x1800000000009de2ULL, - 0x30f0040810001c02ULL, - 0x1800000000005de2ULL, - 0x9000000000001de7ULL, - 0xd400400008400425ULL, - 0x2000000000000007ULL, - 0x0c5400000013dc04ULL, - 0xd400400008400925ULL, - 0xd400400008400725ULL, - 0x180000000400dde2ULL, - 0x1800000000009de2ULL, - 0x1800000000005de2ULL, - 0x9000000000001de7ULL, - 0x2000000000000007ULL, - 0xd400400008400405ULL, - 0x0c5400000013dc04ULL, - 0xd400400008400905ULL, - 0xd400400008400705ULL, - 0x180000000400dde2ULL, - 0x1800000000009de2ULL, - 0x1800000000005de2ULL, - 0x2000000000000007ULL, - 0x9000000000001de7ULL, - 0xd40040000840c485ULL, - 0x0c5400000013dc04ULL, - 0xd40040000840c985ULL, - 0xd40040000840c785ULL, - 0x18fe00000000dde2ULL, - 0x4000000000001de4ULL, - 0x9000000000001de7ULL, - 0x4000000000001de4ULL, - 0x9000000000001de7ULL, - 0x4000000000001de4ULL, - 0x9000000000001de7ULL, - 0xc800000003f01cc5ULL, - 0x2c00000100005c04ULL, - 0x2c0000010800dc04ULL, - 0x3000c3fffff09c04ULL, - 0x680100000c1fdc03ULL, - 0x4000000a60001c47ULL, - 0x180000004000dde2ULL, - 0x78000009c0000007ULL, - 0x0c0000000430dd02ULL, - 0x4003ffffa0001ca7ULL, - 0x2800406400001de4ULL, - 0x2800406410005de4ULL, - 0x180000000400dde2ULL, - 0x547e18000000dd05ULL, - 0x60000008e0000007ULL, - 0x190ec0000431dc03ULL, - 0x40000000000001f4ULL, - 0x94000004c0009c85ULL, - 0x2c00000100009c04ULL, - 0x2c0000010800dc04ULL, - 0x9400000020009ca5ULL, - 0x9400000100011cc5ULL, - 0x9400000140021cc5ULL, - 0x9400000180031cc5ULL, - 0x94000001c0041cc5ULL, - 0x9400000200051cc5ULL, - 0x9400000240061cc5ULL, - 0x9400000280071cc5ULL, - 0x94000002c0081cc5ULL, - 0x9400000300091cc5ULL, - 0x94000003400a1cc5ULL, - 0x94000003800b1cc5ULL, - 0x94000003c00c1cc5ULL, - 0x94000004000d1cc5ULL, - 0x94000004400e1cc5ULL, - 0x94000004800f1cc5ULL, - 0xc000000003f09ea5ULL, - 0x94000000c0009ca5ULL, - 0xc000000023f09ea5ULL, - 0x94000000e0009ca5ULL, - 0x2c00000084009c04ULL, - 0x2c0000008800dc04ULL, - 0x9400000040009ca5ULL, - 0x2c0000008c009c04ULL, - 0x2c0000009400dc04ULL, - 0x9400000060009ca5ULL, - 0x2c00000098009c04ULL, - 0x2c0000009c00dc04ULL, - 0x9400000080009ca5ULL, - 0x2c000000c800dc04ULL, - 0x0c0000001030dd02ULL, - 0x4000000100001ea7ULL, - 0x480100000c001c03ULL, - 0x0800000000105c42ULL, - 0xc100000000309c85ULL, - 0x9400000500009c85ULL, - 0x0c00000010001d02ULL, - 0x0800000000105d42ULL, - 0x0c0000001030dd02ULL, - 0x4003ffff40001ca7ULL, - 0x2800406420001de4ULL, - 0x2800406430005de4ULL, - 0xe000000000001c45ULL, - 0xd000000003ffdcc5ULL, - 0x9c000000000fdcc5ULL, - 0x2c0000000c009c04ULL, - 0x7000c0205020dc03ULL, - 0x7000c01820209c03ULL, - 0x5000406450209c03ULL, - 0x500040644030dc03ULL, - 0x480000000c209c03ULL, - 0x4801000008001c03ULL, - 0x0800000000105c42ULL, - 0x280040646000dde4ULL, - 0x8400000020009f05ULL, - 0x190ec0002821dc03ULL, - 0x40000000800001e7ULL, - 0x0c00000040001c02ULL, - 0x0800000000105c42ULL, - 0x0c0000004030dd02ULL, - 0x00029dff0ffc5cbfULL, - 0x8400000000009f85ULL, - 0x2800406400001de4ULL, - 0x2800406410005de4ULL, - 0x9400000010009c85ULL, - 0x4000000000001df4ULL, - 0x9800000003ffdcc5ULL, - 0xd000000000008007ULL, - 0xa000000000004007ULL, - 0xd000000000008007ULL, - 0x3400c3fffc201c04ULL, - 0xc000000003f01ec5ULL, - 0xa000000000000007ULL -}; - -static const uint16_t nve4_builtin_offsets[NVC0_BUILTIN_COUNT] = -{ - 0x0000, - 0x00f0, - 0x0f08, - 0x0f18, -}; diff --git a/src/gallium/drivers/nouveau/codegen/target_lib_nvf0.asm b/src/gallium/drivers/nouveau/codegen/target_lib_nvf0.asm deleted file mode 100644 index a0c5497..0000000 --- a/src/gallium/drivers/nouveau/codegen/target_lib_nvf0.asm +++ /dev/null @@ -1,86 +0,0 @@ -// -// DIV U32 -// -// UNR recurrence (q = a / b): -// look for z such that 2^32 - b <= b * z < 2^32 -// then q - 1 <= (a * z) / 2^32 <= q -// -// INPUT: $r0: dividend, $r1: divisor -// OUTPUT: $r0: result, $r1: modulus -// CLOBBER: $r2 - $r3, $p0 - $p1 -// SIZE: 22 / 14 * 8 bytes -// -sched 0x28282804280428 -bfind u32 $r2 $r1 -xor b32 $r2 $r2 0x1f -mov b32 $r3 0x1 -shl b32 $r2 $r3 clamp $r2 -cvt u32 $r1 neg u32 $r1 -mul $r3 u32 $r1 u32 $r2 -add $r2 (mul high u32 $r2 u32 $r3) $r2 -sched 0x28282828282828 -mul $r3 u32 $r1 u32 $r2 -add $r2 (mul high u32 $r2 u32 $r3) $r2 -mul $r3 u32 $r1 u32 $r2 -add $r2 (mul high u32 $r2 u32 $r3) $r2 -mul $r3 u32 $r1 u32 $r2 -add $r2 (mul high u32 $r2 u32 $r3) $r2 -mul $r3 u32 $r1 u32 $r2 -sched 0x042c2828042804 -add $r2 (mul high u32 $r2 u32 $r3) $r2 -mov b32 $r3 $r0 -mul high $r0 u32 $r0 u32 $r2 -cvt u32 $r2 neg u32 $r1 -add $r1 (mul u32 $r1 u32 $r0) $r3 -set $p0 0x1 ge u32 $r1 $r2 -$p0 sub b32 $r1 $r1 $r2 -sched 0x20282e20042c28 -$p0 add b32 $r0 $r0 0x1 -$p0 set $p0 0x1 ge u32 $r1 $r2 -$p0 sub b32 $r1 $r1 $r2 -$p0 add b32 $r0 $r0 0x1 -ret -// -// DIV S32, like DIV U32 after taking ABS(inputs) -// -// INPUT: $r0: dividend, $r1: divisor -// OUTPUT: $r0: result, $r1: modulus -// CLOBBER: $r2 - $r3, $p0 - $p3 -// -set $p2 0x1 lt s32 $r0 0x0 -set $p3 0x1 lt s32 $r1 0x0 xor $p2 -sched 0x28042804282820 -cvt s32 $r0 abs s32 $r0 -cvt s32 $r1 abs s32 $r1 -bfind u32 $r2 $r1 -xor b32 $r2 $r2 0x1f -mov b32 $r3 0x1 -shl b32 $r2 $r3 clamp $r2 -cvt u32 $r1 neg u32 $r1 -sched 0x28282828282828 -mul $r3 u32 $r1 u32 $r2 -add $r2 (mul high u32 $r2 u32 $r3) $r2 -mul $r3 u32 $r1 u32 $r2 -add $r2 (mul high u32 $r2 u32 $r3) $r2 -mul $r3 u32 $r1 u32 $r2 -add $r2 (mul high u32 $r2 u32 $r3) $r2 -mul $r3 u32 $r1 u32 $r2 -sched 0x28280428042828 -add $r2 (mul high u32 $r2 u32 $r3) $r2 -mul $r3 u32 $r1 u32 $r2 -add $r2 (mul high u32 $r2 u32 $r3) $r2 -mov b32 $r3 $r0 -mul high $r0 u32 $r0 u32 $r2 -cvt u32 $r2 neg u32 $r1 -add $r1 (mul u32 $r1 u32 $r0) $r3 -sched 0x2028042c28042c -set $p0 0x1 ge u32 $r1 $r2 -$p0 sub b32 $r1 $r1 $r2 -$p0 add b32 $r0 $r0 0x1 -$p0 set $p0 0x1 ge u32 $r1 $r2 -$p0 sub b32 $r1 $r1 $r2 -$p0 add b32 $r0 $r0 0x1 -$p3 cvt s32 $r0 neg s32 $r0 -sched 0x2c200428042e04 -$p2 cvt s32 $r1 neg s32 $r1 -ret diff --git a/src/gallium/drivers/nouveau/codegen/target_lib_nvf0.asm.h b/src/gallium/drivers/nouveau/codegen/target_lib_nvf0.asm.h deleted file mode 100644 index 02c1ec6..0000000 --- a/src/gallium/drivers/nouveau/codegen/target_lib_nvf0.asm.h +++ /dev/null @@ -1,84 +0,0 @@ - -// Assembled from target_lib_nvf0.asm by envyas -m gk110 -W. - -static const uint64_t nvf0_builtin_code[] = -{ -// DIV U32 -0x08a0a0a010a010a0ULL, -0xe1800000009c000aULL, -0x220000000f9c0808ULL, -0x74000000009fc00eULL, -0xe2400000011c0c0aULL, -0xe6010000009c2806ULL, -0xe1c00000011c040eULL, -0xd2000800019c080aULL, -0x08a0a0a0a0a0a0a0ULL, -0xe1c00000011c040eULL, -0xd2000800019c080aULL, -0xe1c00000011c040eULL, -0xd2000800019c080aULL, -0xe1c00000011c040eULL, -0xd2000800019c080aULL, -0xe1c00000011c040eULL, -0x0810b0a0a010a010ULL, -0xd2000800019c080aULL, -0xe4c03c00001c000eULL, -0xe1c00400011c0002ULL, -0xe6010000009c280aULL, -0xd0000c00001c0406ULL, -0xdb601c00011c041eULL, -0xe088000001000406ULL, -0x0880a0b88010b0a0ULL, -0x4000000000800001ULL, -0xdb601c000100041eULL, -0xe088000001000406ULL, -0x4000000000800001ULL, -0x19000000001c003cULL, -// DIV S32 -0xdb181c007f9c005eULL, -0xdb1a08007f9c047eULL, -0x08a010a010a0a080ULL, -0xe6100000001ce802ULL, -0xe6100000009ce806ULL, -0xe1800000009c000aULL, -0x220000000f9c0808ULL, -0x74000000009fc00eULL, -0xe2400000011c0c0aULL, -0xe6010000009c2806ULL, -0x08a0a0a0a0a0a0a0ULL, -0xe1c00000011c040eULL, -0xd2000800019c080aULL, -0xe1c00000011c040eULL, -0xd2000800019c080aULL, -0xe1c00000011c040eULL, -0xd2000800019c080aULL, -0xe1c00000011c040eULL, -0x08a0a010a010a0a0ULL, -0xd2000800019c080aULL, -0xe1c00000011c040eULL, -0xd2000800019c080aULL, -0xe4c03c00001c000eULL, -0xe1c00400011c0002ULL, -0xe6010000009c280aULL, -0xd0000c00001c0406ULL, -0x0880a010b0a010b0ULL, -0xdb601c00011c041eULL, -0xe088000001000406ULL, -0x4000000000800001ULL, -0xdb601c000100041eULL, -0xe088000001000406ULL, -0x4000000000800001ULL, -0xe6010000000ce802ULL, -0x08b08010a010b810ULL, -0xe60100000088e806ULL, -0x19000000001c003cULL, -}; - -static const uint16_t nvf0_builtin_offsets[NVC0_BUILTIN_COUNT] = -{ - 0x0000, - 0x00f0, - /* Just point at a ret instruction for now. */ - 0x00f0 - 8, - 0x00f0 - 8 -}; -- 2.7.4