From 9c4cddb53a7b94d83d1a7417c9a1aea00a139545 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Thu, 10 Dec 2020 16:34:55 +0000
Subject: [PATCH] [Clang] Add vcmla and rotated variants for Arm ACLE.

This patch adds vcmla and the rotated variants as defined in
"Arm Neon Intrinsics Reference for ACLE Q3 2020" [1]

The *_lane_* are still missing, but they can be added separately.

This patch only adds the builtin mapping for AArch64.

[1] https://developer.arm.com/documentation/ihi0073/latest

Reviewed By: t.p.northover

Differential Revision: https://reviews.llvm.org/D92930
---
 clang/include/clang/Basic/arm_neon.td   |  12 +++
 clang/lib/CodeGen/CGBuiltin.cpp         |   8 ++
 clang/test/CodeGen/aarch64-neon-vcmla.c | 146 ++++++++++++++++++++++++++++++++
 3 files changed, 166 insertions(+)
 create mode 100644 clang/test/CodeGen/aarch64-neon-vcmla.c
diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td
index feccf2e..4d4e42d 100644
--- a/clang/include/clang/Basic/arm_neon.td
+++ b/clang/include/clang/Basic/arm_neon.td
@@ -1902,22 +1902,34 @@ let ArchGuard = "defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC)" in {
   def VFMLALT_LANEQ_BF : SOpInst<"vbfmlalt_laneq", "..B(BQ)I", "Qf", OP_BFMLALT_LN>;
 }
 
+multiclass VCMLA_ROTS<string type, string lanety, string laneqty> {
+  foreach ROT = ["", "_rot90", "_rot180", "_rot270" ] in {
+    def   : SInst<"vcmla" # ROT, "....", type # "Q" # type>;
+  }
+}
+
 // v8.3-A Vector complex addition intrinsics
 let ArchGuard = "defined(__ARM_FEATURE_COMPLEX) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)" in {
   def VCADD_ROT90_FP16   : SInst<"vcadd_rot90", "...", "h">;
   def VCADD_ROT270_FP16  : SInst<"vcadd_rot270", "...", "h">;
   def VCADDQ_ROT90_FP16  : SInst<"vcaddq_rot90", "QQQ", "h">;
   def VCADDQ_ROT270_FP16 : SInst<"vcaddq_rot270", "QQQ", "h">;
+
+  defm VCMLA_FP16  : VCMLA_ROTS<"h", "uint32x2_t", "uint32x4_t">;
 }
 let ArchGuard = "defined(__ARM_FEATURE_COMPLEX)" in {
   def VCADD_ROT90   : SInst<"vcadd_rot90", "...", "f">;
   def VCADD_ROT270  : SInst<"vcadd_rot270", "...", "f">;
   def VCADDQ_ROT90  : SInst<"vcaddq_rot90", "QQQ", "f">;
   def VCADDQ_ROT270 : SInst<"vcaddq_rot270", "QQQ", "f">;
+
+  defm VCMLA_F32        : VCMLA_ROTS<"f", "uint64x1_t", "uint64x2_t">;
 }
 let ArchGuard = "defined(__ARM_FEATURE_COMPLEX) && defined(__aarch64__)" in {
   def VCADDQ_ROT90_FP64  : SInst<"vcaddq_rot90", "QQQ", "d">;
   def VCADDQ_ROT270_FP64 : SInst<"vcaddq_rot270", "QQQ", "d">;
+
+  defm VCMLA_FP64 : VCMLA_ROTS<"d", "uint64x2_t", "uint64x2_t">;
 }
 
 // V8.2-A BFloat intrinsics
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 73897a2..db7ae58 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -5548,6 +5548,14 @@ static const ARMVectorIntrinsicInfo AArch64SIMDIntrinsicMap[] = {
   NEONMAP0(vcltzq_v),
   NEONMAP1(vclz_v, ctlz, Add1ArgType),
   NEONMAP1(vclzq_v, ctlz, Add1ArgType),
+  NEONMAP1(vcmla_rot180_v, aarch64_neon_vcmla_rot180, Add1ArgType),
+  NEONMAP1(vcmla_rot270_v, aarch64_neon_vcmla_rot270, Add1ArgType),
+  NEONMAP1(vcmla_rot90_v, aarch64_neon_vcmla_rot90, Add1ArgType),
+  NEONMAP1(vcmla_v, aarch64_neon_vcmla_rot0, Add1ArgType),
+  NEONMAP1(vcmlaq_rot180_v, aarch64_neon_vcmla_rot180, Add1ArgType),
+  NEONMAP1(vcmlaq_rot270_v, aarch64_neon_vcmla_rot270, Add1ArgType),
+  NEONMAP1(vcmlaq_rot90_v, aarch64_neon_vcmla_rot90, Add1ArgType),
+  NEONMAP1(vcmlaq_v, aarch64_neon_vcmla_rot0, Add1ArgType),
   NEONMAP1(vcnt_v, ctpop, Add1ArgType),
   NEONMAP1(vcntq_v, ctpop, Add1ArgType),
   NEONMAP1(vcvt_f16_f32, aarch64_neon_vcvtfp2hf, 0),
diff --git a/clang/test/CodeGen/aarch64-neon-vcmla.c b/clang/test/CodeGen/aarch64-neon-vcmla.c
new file mode 100644
index 0000000..2ecc1d5
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-neon-vcmla.c
@@ -0,0 +1,146 @@
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -triple arm64-apple-ios -target-feature +neon \
+// RUN:        -target-feature +v8.3a \
+// RUN:        -target-feature +fullfp16 \
+// RUN:        -disable-O0-optnone -emit-llvm -o - %s | opt -S -O1 | FileCheck %s
+#include <arm_neon.h>
+
+// CHECK-LABEL: @test_vcmla_f16(
+// CHECK: [[RES:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot0.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> %rhs)
+// CHECK: ret <4 x half> [[RES]]
+float16x4_t test_vcmla_f16(float16x4_t acc, float16x4_t lhs, float16x4_t rhs) {
+  return vcmla_f16(acc, lhs, rhs);
+}
+
+// CHECK-LABEL: @test_vcmla_f32(
+// CHECK: [[RES:%.*]] = call <2 x float> @llvm.aarch64.neon.vcmla.rot0.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> %rhs)
+// CHECK: ret <2 x float> [[RES]]
+float32x2_t test_vcmla_f32(float32x2_t acc, float32x2_t lhs, float32x2_t rhs) {
+  return vcmla_f32(acc, lhs, rhs);
+}
+
+// CHECK-LABEL: @test_vcmlaq_f16(
+// CHECK: [[RES:%.*]] = call <8 x half> @llvm.aarch64.neon.vcmla.rot0.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> %rhs)
+// CHECK: ret <8 x half> [[RES]]
+float16x8_t test_vcmlaq_f16(float16x8_t acc, float16x8_t lhs, float16x8_t rhs) {
+  return vcmlaq_f16(acc, lhs, rhs);
+}
+
+// CHECK-LABEL: @test_vcmlaq_f32(
+// CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot0.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> %rhs)
+// CHECK: ret <4 x float> [[RES]]
+float32x4_t test_vcmlaq_f32(float32x4_t acc, float32x4_t lhs, float32x4_t rhs) {
+  return vcmlaq_f32(acc, lhs, rhs);
+}
+
+// CHECK-LABEL: @test_vcmlaq_f64(
+// CHECK: [[RES:%.*]] = call <2 x double> @llvm.aarch64.neon.vcmla.rot0.v2f64(<2 x double> %acc, <2 x double> %lhs, <2 x double> %rhs)
+// CHECK: ret <2 x double> [[RES]]
+float64x2_t test_vcmlaq_f64(float64x2_t acc, float64x2_t lhs, float64x2_t rhs) {
+  return vcmlaq_f64(acc, lhs, rhs);
+}
+
+// CHECK-LABEL: @test_vcmla_rot90_f16(
+// CHECK: [[RES:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot90.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> %rhs)
+// CHECK: ret <4 x half> [[RES]]
+float16x4_t test_vcmla_rot90_f16(float16x4_t acc, float16x4_t lhs, float16x4_t rhs) {
+  return vcmla_rot90_f16(acc, lhs, rhs);
+}
+
+// CHECK-LABEL: @test_vcmla_rot90_f32(
+// CHECK: [[RES:%.*]] = call <2 x float> @llvm.aarch64.neon.vcmla.rot90.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> %rhs)
+// CHECK: ret <2 x float> [[RES]]
+float32x2_t test_vcmla_rot90_f32(float32x2_t acc, float32x2_t lhs, float32x2_t rhs) {
+  return vcmla_rot90_f32(acc, lhs, rhs);
+}
+
+// CHECK-LABEL: @test_vcmlaq_rot90_f16(
+// CHECK: [[RES:%.*]] = call <8 x half> @llvm.aarch64.neon.vcmla.rot90.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> %rhs)
+// CHECK: ret <8 x half> [[RES]]
+float16x8_t test_vcmlaq_rot90_f16(float16x8_t acc, float16x8_t lhs, float16x8_t rhs) {
+  return vcmlaq_rot90_f16(acc, lhs, rhs);
+}
+
+// CHECK-LABEL: @test_vcmlaq_rot90_f32(
+// CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot90.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> %rhs)
+// CHECK: ret <4 x float> [[RES]]
+float32x4_t test_vcmlaq_rot90_f32(float32x4_t acc, float32x4_t lhs, float32x4_t rhs) {
+  return vcmlaq_rot90_f32(acc, lhs, rhs);
+}
+
+// CHECK-LABEL: @test_vcmlaq_rot90_f64(
+// CHECK: [[RES:%.*]] = call <2 x double> @llvm.aarch64.neon.vcmla.rot90.v2f64(<2 x double> %acc, <2 x double> %lhs, <2 x double> %rhs)
+// CHECK: ret <2 x double> [[RES]]
+float64x2_t test_vcmlaq_rot90_f64(float64x2_t acc, float64x2_t lhs, float64x2_t rhs) {
+  return vcmlaq_rot90_f64(acc, lhs, rhs);
+}
+
+// CHECK-LABEL: @test_vcmla_rot180_f16(
+// CHECK: [[RES:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot180.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> %rhs)
+// CHECK: ret <4 x half> [[RES]]
+float16x4_t test_vcmla_rot180_f16(float16x4_t acc, float16x4_t lhs, float16x4_t rhs) {
+  return vcmla_rot180_f16(acc, lhs, rhs);
+}
+
+// CHECK-LABEL: @test_vcmla_rot180_f32(
+// CHECK: [[RES:%.*]] = call <2 x float> @llvm.aarch64.neon.vcmla.rot180.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> %rhs)
+// CHECK: ret <2 x float> [[RES]]
+float32x2_t test_vcmla_rot180_f32(float32x2_t acc, float32x2_t lhs, float32x2_t rhs) {
+  return vcmla_rot180_f32(acc, lhs, rhs);
+}
+
+// CHECK-LABEL: @test_vcmlaq_rot180_f16(
+// CHECK: [[RES:%.*]] = call <8 x half> @llvm.aarch64.neon.vcmla.rot180.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> %rhs)
+// CHECK: ret <8 x half> [[RES]]
+float16x8_t test_vcmlaq_rot180_f16(float16x8_t acc, float16x8_t lhs, float16x8_t rhs) {
+  return vcmlaq_rot180_f16(acc, lhs, rhs);
+}
+
+// CHECK-LABEL: @test_vcmlaq_rot180_f32(
+// CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot180.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> %rhs)
+// CHECK: ret <4 x float> [[RES]]
+float32x4_t test_vcmlaq_rot180_f32(float32x4_t acc, float32x4_t lhs, float32x4_t rhs) {
+  return vcmlaq_rot180_f32(acc, lhs, rhs);
+}
+
+// CHECK-LABEL: @test_vcmlaq_rot180_f64(
+// CHECK: [[RES:%.*]] = call <2 x double> @llvm.aarch64.neon.vcmla.rot180.v2f64(<2 x double> %acc, <2 x double> %lhs, <2 x double> %rhs)
+// CHECK: ret <2 x double> [[RES]]
+float64x2_t test_vcmlaq_rot180_f64(float64x2_t acc, float64x2_t lhs, float64x2_t rhs) {
+  return vcmlaq_rot180_f64(acc, lhs, rhs);
+}
+
+// CHECK-LABEL: @test_vcmla_rot270_f16(
+// CHECK: [[RES:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot270.v4f16(<4 x half> %acc, <4 x half> %lhs, <4 x half> %rhs)
+// CHECK: ret <4 x half> [[RES]]
+float16x4_t test_vcmla_rot270_f16(float16x4_t acc, float16x4_t lhs, float16x4_t rhs) {
+  return vcmla_rot270_f16(acc, lhs, rhs);
+}
+
+// CHECK-LABEL: @test_vcmla_rot270_f32(
+// CHECK: [[RES:%.*]] = call <2 x float> @llvm.aarch64.neon.vcmla.rot270.v2f32(<2 x float> %acc, <2 x float> %lhs, <2 x float> %rhs)
+// CHECK: ret <2 x float> [[RES]]
+float32x2_t test_vcmla_rot270_f32(float32x2_t acc, float32x2_t lhs, float32x2_t rhs) {
+  return vcmla_rot270_f32(acc, lhs, rhs);
+}
+
+// CHECK-LABEL: @test_vcmlaq_rot270_f16(
+// CHECK: [[RES:%.*]] = call <8 x half> @llvm.aarch64.neon.vcmla.rot270.v8f16(<8 x half> %acc, <8 x half> %lhs, <8 x half> %rhs)
+// CHECK: ret <8 x half> [[RES]]
+float16x8_t test_vcmlaq_rot270_f16(float16x8_t acc, float16x8_t lhs, float16x8_t rhs) {
+  return vcmlaq_rot270_f16(acc, lhs, rhs);
+}
+
+// CHECK-LABEL: @test_vcmlaq_rot270_f32(
+// CHECK: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot270.v4f32(<4 x float> %acc, <4 x float> %lhs, <4 x float> %rhs)
+// CHECK: ret <4 x float> [[RES]]
+float32x4_t test_vcmlaq_rot270_f32(float32x4_t acc, float32x4_t lhs, float32x4_t rhs) {
+  return vcmlaq_rot270_f32(acc, lhs, rhs);
+}
+
+// CHECK-LABEL: @test_vcmlaq_rot270_f64(
+// CHECK: [[RES:%.*]] = call <2 x double> @llvm.aarch64.neon.vcmla.rot270.v2f64(<2 x double> %acc, <2 x double> %lhs, <2 x double> %rhs)
+// CHECK: ret <2 x double> [[RES]]
+float64x2_t test_vcmlaq_rot270_f64(float64x2_t acc, float64x2_t lhs, float64x2_t rhs) {
+  return vcmlaq_rot270_f64(acc, lhs, rhs);
+}
-- 
2.7.4