From 2fcee8bd52dd768f108656422a88e5a5fa3ed5b3 Mon Sep 17 00:00:00 2001
From: Oliver Stannard <oliver.stannard@arm.com>
Date: Fri, 27 Apr 2018 14:03:32 +0000
Subject: [PATCH] [ARM,AArch64] Add intrinsics for dot product instructions

The ACLE spec which describes these intrinsics hasn't been published yet, but
this is based on the final draft which will be published soon, and these have
already been implemented by GCC.

Differential revision: https://reviews.llvm.org/D46109

llvm-svn: 331039
---
 clang/include/clang/Basic/arm_neon.td         |  17 ++++
 clang/include/clang/Basic/arm_neon_incl.td    |   3 +
 clang/lib/CodeGen/CGBuiltin.cpp               |  12 +++
 clang/test/CodeGen/aarch64-neon-dot-product.c | 117 ++++++++++++++++++++++++++
 clang/test/CodeGen/arm-neon-dot-product.c     |  76 +++++++++++++++++
 clang/utils/TableGen/NeonEmitter.cpp          |  13 +++
 6 files changed, 238 insertions(+)
 create mode 100644 clang/test/CodeGen/aarch64-neon-dot-product.c
 create mode 100644 clang/test/CodeGen/arm-neon-dot-product.c
diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td
index 32830b3..fc9b6a4 100644
--- a/clang/include/clang/Basic/arm_neon.td
+++ b/clang/include/clang/Basic/arm_neon.td
@@ -199,6 +199,13 @@ def OP_SCALAR_HALF_SET_LNQ : Op<(bitcast "float16x8_t",
                                           (bitcast "int16_t", $p0),
                                           (bitcast "int16x8_t", $p1), $p2))>;
 
+def OP_DOT_LN
+    : Op<(call "vdot", $p0, $p1,
+          (bitcast $p1, (splat(bitcast "uint32x2_t", $p2), $p3)))>;
+def OP_DOT_LNQ
+    : Op<(call "vdot", $p0, $p1,
+          (bitcast $p1, (splat(bitcast "uint32x4_t", $p2), $p3)))>;
+
 //===----------------------------------------------------------------------===//
 // Instructions
 //===----------------------------------------------------------------------===//
@@ -1579,3 +1586,13 @@ let ArchGuard = "defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(__aarc
   def SCALAR_VDUP_LANEH  : IInst<"vdup_lane", "sdi", "Sh">;
   def SCALAR_VDUP_LANEQH : IInst<"vdup_laneq", "sji", "Sh">;
 }
+
+// v8.2-A dot product instructions.
+let ArchGuard = "defined(__ARM_FEATURE_DOTPROD)" in {
+  def DOT : SInst<"vdot", "dd88", "iQiUiQUi">;
+  def DOT_LANE : SOpInst<"vdot_lane", "dd87i", "iUiQiQUi", OP_DOT_LN>;
+}
+let ArchGuard = "defined(__ARM_FEATURE_DOTPROD) && defined(__aarch64__)" in {
+  // Variants indexing into a 128-bit vector are A64 only.
+  def UDOT_LANEQ : SOpInst<"vdot_laneq", "dd89i", "iUiQiQUi", OP_DOT_LNQ>;
+}
diff --git a/clang/include/clang/Basic/arm_neon_incl.td b/clang/include/clang/Basic/arm_neon_incl.td
index e6a6d25..46708a52 100644
--- a/clang/include/clang/Basic/arm_neon_incl.td
+++ b/clang/include/clang/Basic/arm_neon_incl.td
@@ -253,6 +253,9 @@ def OP_UNAVAILABLE : Operation {
 // B,C,D: array of default elts, force 'Q' size modifier.
 // p: pointer type
 // c: const pointer type
+// 7: vector of 8-bit elements, ignore 'Q' size modifier
+// 8: vector of 8-bit elements, same width as default type
+// 9: vector of 8-bit elements, force 'Q' size modifier
 
 // Every intrinsic subclasses Inst.
 class Inst <string n, string p, string t, Operation o> {
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 3987db9..2740c17 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -3867,6 +3867,8 @@ static const NeonIntrinsicInfo ARMSIMDIntrinsicMap [] = {
   NEONMAP0(vcvtq_u16_v),
   NEONMAP0(vcvtq_u32_v),
   NEONMAP0(vcvtq_u64_v),
+  NEONMAP2(vdot_v, arm_neon_udot, arm_neon_sdot, 0),
+  NEONMAP2(vdotq_v, arm_neon_udot, arm_neon_sdot, 0),
   NEONMAP0(vext_v),
   NEONMAP0(vextq_v),
   NEONMAP0(vfma_v),
@@ -4061,6 +4063,8 @@ static const NeonIntrinsicInfo AArch64SIMDIntrinsicMap[] = {
   NEONMAP1(vcvtq_n_u32_v, aarch64_neon_vcvtfp2fxu, 0),
   NEONMAP1(vcvtq_n_u64_v, aarch64_neon_vcvtfp2fxu, 0),
   NEONMAP1(vcvtx_f32_v, aarch64_neon_fcvtxn, AddRetType | Add1ArgType),
+  NEONMAP2(vdot_v, aarch64_neon_udot, aarch64_neon_sdot, 0),
+  NEONMAP2(vdotq_v, aarch64_neon_udot, aarch64_neon_sdot, 0),
   NEONMAP0(vext_v),
   NEONMAP0(vextq_v),
   NEONMAP0(vfma_v),
@@ -4974,6 +4978,14 @@ Value *CodeGenFunction::EmitCommonNeonBuiltinExpr(
     }
     return SV;
   }
+  case NEON::BI__builtin_neon_vdot_v:
+  case NEON::BI__builtin_neon_vdotq_v: {
+    llvm::Type *InputTy =
+        llvm::VectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8);
+    llvm::Type *Tys[2] = { Ty, InputTy };
+    Int = Usgn ? LLVMIntrinsic : AltLLVMIntrinsic;
+    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vdot");
+  }
   }
 
   assert(Int && "Expected valid intrinsic number");
diff --git a/clang/test/CodeGen/aarch64-neon-dot-product.c b/clang/test/CodeGen/aarch64-neon-dot-product.c
new file mode 100644
index 0000000..de18db5
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-neon-dot-product.c
@@ -0,0 +1,117 @@
+// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +dotprod \
+// RUN: -disable-O0-optnone  -emit-llvm -o - %s | opt -S -instcombine | FileCheck %s
+
+// REQUIRES: aarch64-registered-target
+
+// Test AArch64 Armv8.2-A dot product intrinsics
+
+#include <arm_neon.h>
+
+uint32x2_t test_vdot_u32(uint32x2_t a, uint8x8_t b, uint8x8_t c) {
+// CHECK-LABEL: define <2 x i32> @test_vdot_u32(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c)
+// CHECK: [[RESULT:%.*]] = call <2 x i32> @llvm.aarch64.neon.udot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c)
+// CHECK: ret <2 x i32> [[RESULT]]
+  return vdot_u32(a, b, c);
+}
+
+uint32x4_t test_vdotq_u32(uint32x4_t a, uint8x16_t b, uint8x16_t c) {
+// CHECK-LABEL: define <4 x i32> @test_vdotq_u32(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c)
+// CHECK: [[RESULT:%.*]] = call <4 x i32> @llvm.aarch64.neon.udot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c)
+// CHECK: ret <4 x i32> [[RESULT]]
+  return vdotq_u32(a, b, c);
+}
+
+int32x2_t test_vdot_s32(int32x2_t a, int8x8_t b, int8x8_t c) {
+// CHECK-LABEL: define <2 x i32> @test_vdot_s32(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c)
+// CHECK: [[RESULT:%.*]] = call <2 x i32> @llvm.aarch64.neon.sdot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c)
+// CHECK: ret <2 x i32> [[RESULT]]
+  return vdot_s32(a, b, c);
+}
+
+int32x4_t test_vdotq_s32(int32x4_t a, int8x16_t b, int8x16_t c) {
+// CHECK-LABEL: define <4 x i32> @test_vdotq_s32(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c)
+// CHECK: [[RESULT:%.*]] = call <4 x i32> @llvm.aarch64.neon.sdot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c)
+// CHECK: ret <4 x i32> [[RESULT]]
+  return vdotq_s32(a, b, c);
+}
+
+uint32x2_t test_vdot_lane_u32(uint32x2_t a, uint8x8_t b, uint8x8_t c) {
+// CHECK-LABEL: define <2 x i32> @test_vdot_lane_u32(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c)
+// CHECK: [[CAST1:%.*]] = bitcast <8 x i8> %c to <2 x i32>
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[CAST1]], <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+// CHECK: [[CAST2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK: [[RESULT:%.*]] = call <2 x i32> @llvm.aarch64.neon.udot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> [[CAST2]])
+// CHECK: ret <2 x i32> [[RESULT]]
+  return vdot_lane_u32(a, b, c, 1);
+}
+
+uint32x4_t test_vdotq_lane_u32(uint32x4_t a, uint8x16_t b, uint8x8_t c) {
+// CHECK-LABEL: define <4 x i32> @test_vdotq_lane_u32(<4 x i32> %a, <16 x i8> %b, <8 x i8> %c)
+// CHECK: [[CAST1:%.*]] = bitcast <8 x i8> %c to <2 x i32>
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[CAST1]], <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK: [[CAST2:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
+// CHECK: [[RESULT:%.*]] = call <4 x i32> @llvm.aarch64.neon.udot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> [[CAST2]])
+// CHECK: ret <4 x i32> [[RESULT]]
+  return vdotq_lane_u32(a, b, c, 1);
+}
+
+uint32x2_t test_vdot_laneq_u32(uint32x2_t a, uint8x8_t b, uint8x16_t c) {
+// CHECK-LABEL: define <2 x i32> @test_vdot_laneq_u32(<2 x i32> %a, <8 x i8> %b, <16 x i8> %c)
+// CHECK: [[CAST1:%.*]] = bitcast <16 x i8> %c to <4 x i32>
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[CAST1]], <4 x i32> undef, <2 x i32> <i32 1, i32 1>
+// CHECK: [[CAST2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK: [[RESULT:%.*]] = call <2 x i32> @llvm.aarch64.neon.udot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> [[CAST2]])
+// CHECK: ret <2 x i32> [[RESULT]]
+  return vdot_laneq_u32(a, b, c, 1);
+}
+
+uint32x4_t test_vdotq_laneq_u32(uint32x4_t a, uint8x16_t b, uint8x16_t c) {
+// CHECK-LABEL: define <4 x i32> @test_vdotq_laneq_u32(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c)
+// CHECK: [[CAST1:%.*]] = bitcast <16 x i8> %c to <4 x i32>
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[CAST1]], <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK: [[CAST2:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
+// CHECK: [[RESULT:%.*]] = call <4 x i32> @llvm.aarch64.neon.udot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> [[CAST2]])
+// CHECK: ret <4 x i32> [[RESULT]]
+  return vdotq_laneq_u32(a, b, c, 1);
+}
+
+int32x2_t test_vdot_lane_s32(int32x2_t a, int8x8_t b, int8x8_t c) {
+// CHECK-LABEL: define <2 x i32> @test_vdot_lane_s32(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c)
+// CHECK: [[CAST1:%.*]] = bitcast <8 x i8> %c to <2 x i32>
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[CAST1]], <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+// CHECK: [[CAST2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK: [[RESULT:%.*]] = call <2 x i32> @llvm.aarch64.neon.sdot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> [[CAST2]])
+// CHECK: ret <2 x i32> [[RESULT]]
+  return vdot_lane_s32(a, b, c, 1);
+}
+
+int32x4_t test_vdotq_lane_s32(int32x4_t a, int8x16_t b, int8x8_t c) {
+// CHECK-LABEL: define <4 x i32> @test_vdotq_lane_s32(<4 x i32> %a, <16 x i8> %b, <8 x i8> %c)
+// CHECK: [[CAST1:%.*]] = bitcast <8 x i8> %c to <2 x i32>
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[CAST1]], <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK: [[CAST2:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
+// CHECK: [[RESULT:%.*]] = call <4 x i32> @llvm.aarch64.neon.sdot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> [[CAST2]])
+// CHECK: ret <4 x i32> [[RESULT]]
+  return vdotq_lane_s32(a, b, c, 1);
+}
+
+int32x2_t test_vdot_laneq_s32(int32x2_t a, int8x8_t b, int8x16_t c) {
+// CHECK-LABEL: define <2 x i32> @test_vdot_laneq_s32(<2 x i32> %a, <8 x i8> %b, <16 x i8> %c)
+// CHECK: [[CAST1:%.*]] = bitcast <16 x i8> %c to <4 x i32>
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[CAST1]], <4 x i32> undef, <2 x i32> <i32 1, i32 1>
+// CHECK: [[CAST2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK: [[RESULT:%.*]] = call <2 x i32> @llvm.aarch64.neon.sdot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> [[CAST2]])
+// CHECK: ret <2 x i32> [[RESULT]]
+  return vdot_laneq_s32(a, b, c, 1);
+}
+
+int32x4_t test_vdotq_laneq_s32(int32x4_t a, int8x16_t b, int8x16_t c) {
+// CHECK-LABEL: define <4 x i32> @test_vdotq_laneq_s32(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c)
+// CHECK: [[CAST1:%.*]] = bitcast <16 x i8> %c to <4 x i32>
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[CAST1]], <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK: [[CAST2:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
+// CHECK: [[RESULT:%.*]] = call <4 x i32> @llvm.aarch64.neon.sdot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> [[CAST2]])
+// CHECK: ret <4 x i32> [[RESULT]]
+  return vdotq_laneq_s32(a, b, c, 1);
+}
+
diff --git a/clang/test/CodeGen/arm-neon-dot-product.c b/clang/test/CodeGen/arm-neon-dot-product.c
new file mode 100644
index 0000000..34acd02
--- /dev/null
+++ b/clang/test/CodeGen/arm-neon-dot-product.c
@@ -0,0 +1,76 @@
+// RUN: %clang_cc1 -triple armv8-linux-gnueabihf -target-cpu cortex-a75 -target-feature +dotprod \
+// RUN: -disable-O0-optnone  -emit-llvm -o - %s | opt -S -instcombine | FileCheck %s
+
+// REQUIRES: arm-registered-target
+
+// Test ARM v8.2-A dot product intrinsics
+
+#include <arm_neon.h>
+
+uint32x2_t test_vdot_u32(uint32x2_t a, uint8x8_t b, uint8x8_t c) {
+// CHECK-LABEL: define <2 x i32> @test_vdot_u32(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c)
+// CHECK: [[RESULT:%.*]] = call <2 x i32> @llvm.arm.neon.udot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c)
+// CHECK: ret <2 x i32> [[RESULT]]
+  return vdot_u32(a, b, c);
+}
+
+uint32x4_t test_vdotq_u32(uint32x4_t a, uint8x16_t b, uint8x16_t c) {
+// CHECK-LABEL: define <4 x i32> @test_vdotq_u32(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c)
+// CHECK: [[RESULT:%.*]] = call <4 x i32> @llvm.arm.neon.udot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c)
+// CHECK: ret <4 x i32> [[RESULT]]
+  return vdotq_u32(a, b, c);
+}
+
+int32x2_t test_vdot_s32(int32x2_t a, int8x8_t b, int8x8_t c) {
+// CHECK-LABEL: define <2 x i32> @test_vdot_s32(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c)
+// CHECK: [[RESULT:%.*]] = call <2 x i32> @llvm.arm.neon.sdot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c)
+// CHECK: ret <2 x i32> [[RESULT]]
+  return vdot_s32(a, b, c);
+}
+
+int32x4_t test_vdotq_s32(int32x4_t a, int8x16_t b, int8x16_t c) {
+// CHECK-LABEL: define <4 x i32> @test_vdotq_s32(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c)
+// CHECK: [[RESULT:%.*]] = call <4 x i32> @llvm.arm.neon.sdot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c)
+// CHECK: ret <4 x i32> [[RESULT]]
+  return vdotq_s32(a, b, c);
+}
+
+uint32x2_t test_vdot_lane_u32(uint32x2_t a, uint8x8_t b, uint8x8_t c) {
+// CHECK-LABEL: define <2 x i32> @test_vdot_lane_u32(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c)
+// CHECK: [[CAST1:%.*]] = bitcast <8 x i8> %c to <2 x i32>
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[CAST1]], <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+// CHECK: [[CAST2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK: [[RESULT:%.*]] = call <2 x i32> @llvm.arm.neon.udot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> [[CAST2]])
+// CHECK: ret <2 x i32> [[RESULT]]
+  return vdot_lane_u32(a, b, c, 1);
+}
+
+uint32x4_t test_vdotq_lane_u32(uint32x4_t a, uint8x16_t b, uint8x8_t c) {
+// CHECK-LABEL: define <4 x i32> @test_vdotq_lane_u32(<4 x i32> %a, <16 x i8> %b, <8 x i8> %c)
+// CHECK: [[CAST1:%.*]] = bitcast <8 x i8> %c to <2 x i32>
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[CAST1]], <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK: [[CAST2:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
+// CHECK: [[RESULT:%.*]] = call <4 x i32> @llvm.arm.neon.udot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> [[CAST2]])
+// CHECK: ret <4 x i32> [[RESULT]]
+  return vdotq_lane_u32(a, b, c, 1);
+}
+
+int32x2_t test_vdot_lane_s32(int32x2_t a, int8x8_t b, int8x8_t c) {
+// CHECK-LABEL: define <2 x i32> @test_vdot_lane_s32(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c)
+// CHECK: [[CAST1:%.*]] = bitcast <8 x i8> %c to <2 x i32>
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[CAST1]], <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+// CHECK: [[CAST2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK: [[RESULT:%.*]] = call <2 x i32> @llvm.arm.neon.sdot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> [[CAST2]])
+// CHECK: ret <2 x i32> [[RESULT]]
+  return vdot_lane_s32(a, b, c, 1);
+}
+
+int32x4_t test_vdotq_lane_s32(int32x4_t a, int8x16_t b, int8x8_t c) {
+// CHECK-LABEL: define <4 x i32> @test_vdotq_lane_s32(<4 x i32> %a, <16 x i8> %b, <8 x i8> %c)
+// CHECK: [[CAST1:%.*]] = bitcast <8 x i8> %c to <2 x i32>
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[CAST1]], <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK: [[CAST2:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
+// CHECK: [[RESULT:%.*]] = call <4 x i32> @llvm.arm.neon.sdot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> [[CAST2]])
+// CHECK: ret <4 x i32> [[RESULT]]
+  return vdotq_lane_s32(a, b, c, 1);
+}
diff --git a/clang/utils/TableGen/NeonEmitter.cpp b/clang/utils/TableGen/NeonEmitter.cpp
index 62f9740..f8af405 100644
--- a/clang/utils/TableGen/NeonEmitter.cpp
+++ b/clang/utils/TableGen/NeonEmitter.cpp
@@ -995,6 +995,19 @@ void Type::applyModifier(char Mod) {
     if (!AppliedQuad)
       Bitwidth *= 2;
     break;
+  case '7':
+    if (AppliedQuad)
+      Bitwidth /= 2;
+    ElementBitwidth = 8;
+    break;
+  case '8':
+    ElementBitwidth = 8;
+    break;
+  case '9':
+    if (!AppliedQuad)
+      Bitwidth *= 2;
+    ElementBitwidth = 8;
+    break;
   default:
     llvm_unreachable("Unhandled character!");
   }
-- 
2.7.4