// produce a result where res[63:32]=0 and res[31]=1.
def : Pat<(srem (sexti32 (i64 GPR:$rs1)), (sexti32 (i64 GPR:$rs2))),
          (REMW GPR:$rs1, GPR:$rs2)>;
+
+// Special case for calculating the full 64-bit product of a 32x32 unsigned
+// multiply where the inputs aren't known to be zero extended. We can shift the
+// inputs left by 32 and use a MULHU. This saves the two SRLIs that would
+// otherwise be needed to finish zeroing the upper 32 bits.
+// TODO: If one of the operands is zero extended and the other isn't, we might
+// still be better off shifting both left by 32.
+def : Pat<(i64 (mul (and GPR:$rs1, 0xffffffff), (and GPR:$rs2, 0xffffffff))),
+          (MULHU (SLLI GPR:$rs1, 32), (SLLI GPR:$rs2, 32))>;
} // Predicates = [HasStdExtM, IsRV64]
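
To see why the rewrite is correct: SLLI moves the low 32 bits of each register to the top, so the 128-bit product of the two shifted values is (rs1 mod 2^32) * (rs2 mod 2^32) * 2^64, and MULHU returns the upper 64 bits of that product, which is exactly the full 32x32 product (a 32x32 unsigned product always fits in 64 bits). A minimal IR sketch that should exercise the new pattern follows; the function name is hypothetical and not part of this patch:

define i64 @mul32x32_full(i64 %a, i64 %b) {
  ; Both operands are explicitly masked to 32 bits, but the incoming
  ; registers' upper bits are unknown, so the masks cannot be dropped.
  ; With the new pattern, isel selects slli+slli+mulhu here instead of
  ; zeroing each operand with slli+srli before a mul.
  %ma = and i64 %a, 4294967295
  %mb = and i64 %b, 4294967295
  %p = mul i64 %ma, %mb
  ret i64 %p
}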
; CHECK-NEXT: add a2, a2, a1
; CHECK-NEXT: addi a3, a0, 1
; CHECK-NEXT: mul a3, a2, a3
-; CHECK-NEXT: slli a2, a2, 32
-; CHECK-NEXT: srli a2, a2, 32
; CHECK-NEXT: sub a1, a1, a0
; CHECK-NEXT: addi a1, a1, -2
; CHECK-NEXT: slli a1, a1, 32
-; CHECK-NEXT: srli a1, a1, 32
-; CHECK-NEXT: mul a1, a2, a1
+; CHECK-NEXT: slli a2, a2, 32
+; CHECK-NEXT: mulhu a1, a2, a1
; CHECK-NEXT: srli a1, a1, 1
; CHECK-NEXT: add a0, a3, a0
; CHECK-NEXT: addw a0, a0, a1
; CHECK-NEXT: not a2, a0
; CHECK-NEXT: add a3, a2, a1
; CHECK-NEXT: mul a2, a3, a2
-; CHECK-NEXT: slli a3, a3, 32
-; CHECK-NEXT: srli a3, a3, 32
; CHECK-NEXT: sub a1, a1, a0
; CHECK-NEXT: addi a1, a1, -2
; CHECK-NEXT: slli a1, a1, 32
-; CHECK-NEXT: srli a1, a1, 32
-; CHECK-NEXT: mul a1, a3, a1
+; CHECK-NEXT: slli a3, a3, 32
+; CHECK-NEXT: mulhu a1, a3, a1
; CHECK-NEXT: srli a1, a1, 1
; CHECK-NEXT: sub a0, a2, a0
; CHECK-NEXT: subw a0, a0, a1
; RV64-LABEL: umulo.i32:
; RV64: # %bb.0: # %entry
; RV64-NEXT: slli a1, a1, 32
-; RV64-NEXT: srli a1, a1, 32
; RV64-NEXT: slli a0, a0, 32
-; RV64-NEXT: srli a0, a0, 32
-; RV64-NEXT: mul a1, a0, a1
+; RV64-NEXT: mulhu a1, a0, a1
; RV64-NEXT: srli a0, a1, 32
; RV64-NEXT: snez a0, a0
; RV64-NEXT: sw a1, 0(a2)
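
For reference, the umulo tests wrap llvm.umul.with.overflow.i32, which needs the full 64-bit product: the multiply overflows i32 exactly when the high 32 bits of that product are nonzero, hence the srli-by-32 plus snez above. A sketch of the shape of such a test (the function name comes from the label above; the body is an assumption, not copied from the test file):

declare { i32, i1 } @llvm.umul.with.overflow.i32(i32, i32)

define zeroext i1 @umulo.i32(i32 %v1, i32 %v2, i32* %res) {
entry:
  ; The full product is now formed with slli+slli+mulhu; the overflow
  ; bit is the high 32 bits of that product being nonzero.
  %t = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %v1, i32 %v2)
  %val = extractvalue { i32, i1 } %t, 0
  %obit = extractvalue { i32, i1 } %t, 1
  store i32 %val, i32* %res
  ret i1 %obit
}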
; RV64-LABEL: umulo.select.i32:
; RV64: # %bb.0: # %entry
; RV64-NEXT: slli a2, a1, 32
-; RV64-NEXT: srli a2, a2, 32
; RV64-NEXT: slli a3, a0, 32
-; RV64-NEXT: srli a3, a3, 32
-; RV64-NEXT: mul a2, a3, a2
+; RV64-NEXT: mulhu a2, a3, a2
; RV64-NEXT: srli a2, a2, 32
; RV64-NEXT: bnez a2, .LBB42_2
; RV64-NEXT: # %bb.1: # %entry
; RV64-LABEL: umulo.not.i32:
; RV64: # %bb.0: # %entry
; RV64-NEXT: slli a1, a1, 32
-; RV64-NEXT: srli a1, a1, 32
; RV64-NEXT: slli a0, a0, 32
-; RV64-NEXT: srli a0, a0, 32
-; RV64-NEXT: mul a0, a0, a1
+; RV64-NEXT: mulhu a0, a0, a1
; RV64-NEXT: srli a0, a0, 32
; RV64-NEXT: seqz a0, a0
; RV64-NEXT: ret
; RV64-LABEL: umulo.br.i32:
; RV64: # %bb.0: # %entry
; RV64-NEXT: slli a1, a1, 32
-; RV64-NEXT: srli a1, a1, 32
; RV64-NEXT: slli a0, a0, 32
-; RV64-NEXT: srli a0, a0, 32
-; RV64-NEXT: mul a0, a0, a1
+; RV64-NEXT: mulhu a0, a0, a1
; RV64-NEXT: srli a0, a0, 32
; RV64-NEXT: beqz a0, .LBB57_2
; RV64-NEXT: # %bb.1: # %overflow