[ARM] Fix Crashes in fp16/bf16 Inline Asm
author Archibald Elliott <archibald.elliott@arm.com>
Thu, 13 Apr 2023 10:51:59 +0000 (11:51 +0100)
committer Archibald Elliott <archibald.elliott@arm.com>
Thu, 13 Apr 2023 14:34:04 +0000 (15:34 +0100)
We were still seeing occasional crashes with inline assembly blocks
using fp16/bf16 after my previous patches:
- https://reviews.llvm.org/rGff4027d152d0
- https://reviews.llvm.org/rG7d15212b8c0c
- https://reviews.llvm.org/rG20b2d11896d9

It turns out:
- The first two commits listed above were wrong: we should always have
  chosen the SPR register class, not the HPR register class, so that
  LLVM's SelectionDAGBuilder performs the correct splits/joins.
- The `splitValueIntoRegisterParts`/`joinRegisterPartsIntoValue` changes
  from rG20b2d11896d9 are still correct, even though they sometimes
  result in inefficient codegen of casts between fp16/bf16 and i32/f32
  (which is visible in these tests).

This patch fixes crashes in `getCopyToParts` and when trying to select
`(bf16 (bitconvert (fp16 ...)))` dags when Neon is enabled.

This patch also adds support for passing fp16/bf16 values using the
LLVM-specific 'x' constraint. This should broadly match how values are
passed with 't' and 'w', but with a different set of valid S registers.

Differential Revision: https://reviews.llvm.org/D147715

llvm/lib/Target/ARM/ARMISelLowering.cpp
llvm/test/CodeGen/ARM/inlineasm-fp-half.ll

index a6b9259..3551287 100644 (file)
@@ -20347,13 +20347,7 @@ RCPair ARMTargetLowering::getRegForInlineAsmConstraint(
     case 'w':
       if (VT == MVT::Other)
         break;
-      if (VT == MVT::f16)
-        return RCPair(0U, Subtarget->hasFullFP16() ? &ARM::HPRRegClass
-                                                   : &ARM::SPRRegClass);
-      if (VT == MVT::bf16)
-        return RCPair(0U, Subtarget->hasBF16() ? &ARM::HPRRegClass
-                                               : &ARM::SPRRegClass);
-      if (VT == MVT::f32)
+      if (VT == MVT::f32 || VT == MVT::f16 || VT == MVT::bf16)
         return RCPair(0U, &ARM::SPRRegClass);
       if (VT.getSizeInBits() == 64)
         return RCPair(0U, &ARM::DPRRegClass);
@@ -20363,7 +20357,7 @@ RCPair ARMTargetLowering::getRegForInlineAsmConstraint(
     case 'x':
       if (VT == MVT::Other)
         break;
-      if (VT == MVT::f32)
+      if (VT == MVT::f32 || VT == MVT::f16 || VT == MVT::bf16)
         return RCPair(0U, &ARM::SPR_8RegClass);
       if (VT.getSizeInBits() == 64)
         return RCPair(0U, &ARM::DPR_8RegClass);
@@ -20373,13 +20367,7 @@ RCPair ARMTargetLowering::getRegForInlineAsmConstraint(
     case 't':
       if (VT == MVT::Other)
         break;
-      if (VT == MVT::f16)
-        return RCPair(0U, Subtarget->hasFullFP16() ? &ARM::HPRRegClass
-                                                   : &ARM::SPRRegClass);
-      if (VT == MVT::bf16)
-        return RCPair(0U, Subtarget->hasBF16() ? &ARM::HPRRegClass
-                                               : &ARM::SPRRegClass);
-      if (VT == MVT::f32 || VT == MVT::i32)
+      if (VT == MVT::f32 || VT == MVT::i32 || VT == MVT::f16 || VT == MVT::bf16)
         return RCPair(0U, &ARM::SPRRegClass);
       if (VT.getSizeInBits() == 64)
         return RCPair(0U, &ARM::DPR_VFP2RegClass);
index 9840e3f..554e5ba 100644 (file)
@@ -1,21 +1,34 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+
 ; No FP16/BF16
-; RUN: llc -mtriple=arm-none-eabi -mattr=+armv8.2-a,+fp-armv8,-fullfp16,-bf16,-neon %s -o - | FileCheck %s --check-prefixes=NO-FP16-SOFTFP
-; RUN: llc -mtriple=thumb-none-eabi -mattr=+armv8.2-a,+fp-armv8,-fullfp16,-bf16,-neon %s -o - | FileCheck %s --check-prefixes=NO-FP16-SOFTFP
-; RUN: llc -mtriple=arm-none-eabihf -mattr=+armv8.2-a,+fp-armv8,-fullfp16,-bf16,-neon %s -o - | FileCheck %s --check-prefixes=NO-FP16-HARD
-; RUN: llc -mtriple=thumb-none-eabihf -mattr=+armv8.2-a,+fp-armv8,-fullfp16,-bf16,-neon %s -o - | FileCheck %s --check-prefixes=NO-FP16-HARD
+; RUN: llc -mtriple=arm-none-eabi     -mattr=+armv8-a,+fp-armv8,-fullfp16,-bf16,-neon %s -o - | FileCheck %s --check-prefix=NO-FP16-SOFTFP
+; RUN: llc -mtriple=thumb-none-eabi   -mattr=+armv8-a,+fp-armv8,-fullfp16,-bf16,-neon %s -o - | FileCheck %s --check-prefix=NO-FP16-SOFTFP
+; RUN: llc -mtriple=arm-none-eabi     -mattr=+armv8-a,+fp-armv8,-fullfp16,-bf16,+neon %s -o - | FileCheck %s --check-prefix=NO-FP16-SOFTFP
+; RUN: llc -mtriple=thumb-none-eabi   -mattr=+armv8-a,+fp-armv8,-fullfp16,-bf16,+neon %s -o - | FileCheck %s --check-prefix=NO-FP16-SOFTFP
+; RUN: llc -mtriple=arm-none-eabihf   -mattr=+armv8-a,+fp-armv8,-fullfp16,-bf16,-neon %s -o - | FileCheck %s --check-prefix=NO-FP16-HARD
+; RUN: llc -mtriple=thumb-none-eabihf -mattr=+armv8-a,+fp-armv8,-fullfp16,-bf16,-neon %s -o - | FileCheck %s --check-prefix=NO-FP16-HARD
+; RUN: llc -mtriple=arm-none-eabihf   -mattr=+armv8-a,+fp-armv8,-fullfp16,-bf16,+neon %s -o - | FileCheck %s --check-prefix=NO-FP16-HARD
+; RUN: llc -mtriple=thumb-none-eabihf -mattr=+armv8-a,+fp-armv8,-fullfp16,-bf16,+neon %s -o - | FileCheck %s --check-prefix=NO-FP16-HARD
 
 ; With FP16, Without BF16
-; RUN: llc -mtriple=arm-none-eabi -mattr=+armv8.2-a,+fp-armv8,+fullfp16,-bf16,-neon %s -o - | FileCheck %s --check-prefixes=FP16-SOFTFP
-; RUN: llc -mtriple=thumb-none-eabi -mattr=+armv8.2-a,+fp-armv8,+fullfp16,-bf16,-neon %s -o - | FileCheck %s --check-prefixes=FP16-SOFTFP
-; RUN: llc -mtriple=arm-none-eabihf -mattr=+armv8.2-a,+fp-armv8,+fullfp16,-bf16,-neon %s -o - | FileCheck %s --check-prefixes=FP16-HARD
-; RUN: llc -mtriple=thumb-none-eabihf -mattr=+armv8.2-a,+fp-armv8,+fullfp16,-bf16,-neon %s -o - | FileCheck %s --check-prefixes=FP16-HARD
+; RUN: llc -mtriple=arm-none-eabi     -mattr=+armv8-a,+fp-armv8,+fullfp16,-bf16,-neon %s -o - | FileCheck %s --check-prefix=FP16-SOFTFP
+; RUN: llc -mtriple=thumb-none-eabi   -mattr=+armv8-a,+fp-armv8,+fullfp16,-bf16,-neon %s -o - | FileCheck %s --check-prefix=FP16-SOFTFP
+; RUN: llc -mtriple=arm-none-eabi     -mattr=+armv8-a,+fp-armv8,+fullfp16,-bf16,+neon %s -o - | FileCheck %s --check-prefix=FP16-SOFTFP
+; RUN: llc -mtriple=thumb-none-eabi   -mattr=+armv8-a,+fp-armv8,+fullfp16,-bf16,+neon %s -o - | FileCheck %s --check-prefix=FP16-SOFTFP
+; RUN: llc -mtriple=arm-none-eabihf   -mattr=+armv8-a,+fp-armv8,+fullfp16,-bf16,-neon %s -o - | FileCheck %s --check-prefix=FP16-HARD
+; RUN: llc -mtriple=thumb-none-eabihf -mattr=+armv8-a,+fp-armv8,+fullfp16,-bf16,-neon %s -o - | FileCheck %s --check-prefix=FP16-HARD
+; RUN: llc -mtriple=arm-none-eabihf   -mattr=+armv8-a,+fp-armv8,+fullfp16,-bf16,+neon %s -o - | FileCheck %s --check-prefix=FP16-HARD
+; RUN: llc -mtriple=thumb-none-eabihf -mattr=+armv8-a,+fp-armv8,+fullfp16,-bf16,+neon %s -o - | FileCheck %s --check-prefix=FP16-HARD
 
 ; With FP16/BF16
-; RUN: llc -mtriple=arm-none-eabi -mattr=+armv8.2-a,+fp-armv8,+fullfp16,+bf16,-neon %s -o - | FileCheck %s --check-prefixes=FP16-SOFTFP,BF16-SOFTFP
-; RUN: llc -mtriple=thumb-none-eabi -mattr=+armv8.2-a,+fp-armv8,+fullfp16,+bf16,-neon %s -o - | FileCheck %s --check-prefixes=FP16-SOFTFP,BF16-SOFTFP
-; RUN: llc -mtriple=arm-none-eabihf -mattr=+armv8.2-a,+fp-armv8,+fullfp16,+bf16,-neon %s -o - | FileCheck %s --check-prefixes=FP16-HARD,BF16-HARD
-; RUN: llc -mtriple=thumb-none-eabihf -mattr=+armv8.2-a,+fp-armv8,+fullfp16,+bf16,-neon %s -o - | FileCheck %s --check-prefixes=FP16-HARD,BF16-HARD
+; RUN: llc -mtriple=arm-none-eabi     -mattr=+armv8-a,+fp-armv8,+fullfp16,+bf16,-neon %s -o - | FileCheck %s --check-prefix=BF16-SOFTFP
+; RUN: llc -mtriple=thumb-none-eabi   -mattr=+armv8-a,+fp-armv8,+fullfp16,+bf16,-neon %s -o - | FileCheck %s --check-prefix=BF16-SOFTFP
+; RUN: llc -mtriple=arm-none-eabi     -mattr=+armv8-a,+fp-armv8,+fullfp16,+bf16,+neon %s -o - | FileCheck %s --check-prefix=SIMD-BF16-SOFTFP
+; RUN: llc -mtriple=thumb-none-eabi   -mattr=+armv8-a,+fp-armv8,+fullfp16,+bf16,+neon %s -o - | FileCheck %s --check-prefix=SIMD-BF16-SOFTFP
+; RUN: llc -mtriple=arm-none-eabihf   -mattr=+armv8-a,+fp-armv8,+fullfp16,+bf16,-neon %s -o - | FileCheck %s --check-prefix=BF16-HARD
+; RUN: llc -mtriple=thumb-none-eabihf -mattr=+armv8-a,+fp-armv8,+fullfp16,+bf16,-neon %s -o - | FileCheck %s --check-prefix=BF16-HARD
+; RUN: llc -mtriple=arm-none-eabihf   -mattr=+armv8-a,+fp-armv8,+fullfp16,+bf16,+neon %s -o - | FileCheck %s --check-prefix=SIMD-BF16-HARD
+; RUN: llc -mtriple=thumb-none-eabihf -mattr=+armv8-a,+fp-armv8,+fullfp16,+bf16,+neon %s -o - | FileCheck %s --check-prefix=SIMD-BF16-HARD
 
 ; This test ensures that we can use `w` and `t` constraints to allocate
 ; S-registers for 16-bit FP inputs and outputs for inline assembly, with either
@@ -41,6 +54,8 @@ define half @half_t(half %x) nounwind {
 ; FP16-SOFTFP-LABEL: half_t:
 ; FP16-SOFTFP:       @ %bb.0: @ %entry
 ; FP16-SOFTFP-NEXT:    vmov.f16 s0, r0
+; FP16-SOFTFP-NEXT:    vmov.f16 r0, s0
+; FP16-SOFTFP-NEXT:    vmov s0, r0
 ; FP16-SOFTFP-NEXT:    @APP
 ; FP16-SOFTFP-NEXT:    vmov.f32 s0, s0
 ; FP16-SOFTFP-NEXT:    @NO_APP
@@ -49,10 +64,52 @@ define half @half_t(half %x) nounwind {
 ;
 ; FP16-HARD-LABEL: half_t:
 ; FP16-HARD:       @ %bb.0: @ %entry
+; FP16-HARD-NEXT:    vmov.f16 r0, s0
+; FP16-HARD-NEXT:    vmov s0, r0
 ; FP16-HARD-NEXT:    @APP
 ; FP16-HARD-NEXT:    vmov.f32 s0, s0
 ; FP16-HARD-NEXT:    @NO_APP
 ; FP16-HARD-NEXT:    bx lr
+;
+; BF16-SOFTFP-LABEL: half_t:
+; BF16-SOFTFP:       @ %bb.0: @ %entry
+; BF16-SOFTFP-NEXT:    vmov.f16 s0, r0
+; BF16-SOFTFP-NEXT:    vmov.f16 r0, s0
+; BF16-SOFTFP-NEXT:    vmov s0, r0
+; BF16-SOFTFP-NEXT:    @APP
+; BF16-SOFTFP-NEXT:    vmov.f32 s0, s0
+; BF16-SOFTFP-NEXT:    @NO_APP
+; BF16-SOFTFP-NEXT:    vmov r0, s0
+; BF16-SOFTFP-NEXT:    bx lr
+;
+; SIMD-BF16-SOFTFP-LABEL: half_t:
+; SIMD-BF16-SOFTFP:       @ %bb.0: @ %entry
+; SIMD-BF16-SOFTFP-NEXT:    vmov.f16 s0, r0
+; SIMD-BF16-SOFTFP-NEXT:    vmov.f16 r0, s0
+; SIMD-BF16-SOFTFP-NEXT:    vmov s0, r0
+; SIMD-BF16-SOFTFP-NEXT:    @APP
+; SIMD-BF16-SOFTFP-NEXT:    vmov.f32 s0, s0
+; SIMD-BF16-SOFTFP-NEXT:    @NO_APP
+; SIMD-BF16-SOFTFP-NEXT:    vmov r0, s0
+; SIMD-BF16-SOFTFP-NEXT:    bx lr
+;
+; BF16-HARD-LABEL: half_t:
+; BF16-HARD:       @ %bb.0: @ %entry
+; BF16-HARD-NEXT:    vmov.f16 r0, s0
+; BF16-HARD-NEXT:    vmov s0, r0
+; BF16-HARD-NEXT:    @APP
+; BF16-HARD-NEXT:    vmov.f32 s0, s0
+; BF16-HARD-NEXT:    @NO_APP
+; BF16-HARD-NEXT:    bx lr
+;
+; SIMD-BF16-HARD-LABEL: half_t:
+; SIMD-BF16-HARD:       @ %bb.0: @ %entry
+; SIMD-BF16-HARD-NEXT:    vmov.f16 r0, s0
+; SIMD-BF16-HARD-NEXT:    vmov s0, r0
+; SIMD-BF16-HARD-NEXT:    @APP
+; SIMD-BF16-HARD-NEXT:    vmov.f32 s0, s0
+; SIMD-BF16-HARD-NEXT:    @NO_APP
+; SIMD-BF16-HARD-NEXT:    bx lr
 entry:
   %0 = tail call half asm "vmov $0, $1", "=t,t"(half %x)
   ret half %0
@@ -78,6 +135,8 @@ define half @half_w(half %x) nounwind {
 ; FP16-SOFTFP-LABEL: half_w:
 ; FP16-SOFTFP:       @ %bb.0: @ %entry
 ; FP16-SOFTFP-NEXT:    vmov.f16 s0, r0
+; FP16-SOFTFP-NEXT:    vmov.f16 r0, s0
+; FP16-SOFTFP-NEXT:    vmov s0, r0
 ; FP16-SOFTFP-NEXT:    @APP
 ; FP16-SOFTFP-NEXT:    vmov.f32 s0, s0
 ; FP16-SOFTFP-NEXT:    @NO_APP
@@ -86,15 +145,138 @@ define half @half_w(half %x) nounwind {
 ;
 ; FP16-HARD-LABEL: half_w:
 ; FP16-HARD:       @ %bb.0: @ %entry
+; FP16-HARD-NEXT:    vmov.f16 r0, s0
+; FP16-HARD-NEXT:    vmov s0, r0
 ; FP16-HARD-NEXT:    @APP
 ; FP16-HARD-NEXT:    vmov.f32 s0, s0
 ; FP16-HARD-NEXT:    @NO_APP
 ; FP16-HARD-NEXT:    bx lr
+;
+; BF16-SOFTFP-LABEL: half_w:
+; BF16-SOFTFP:       @ %bb.0: @ %entry
+; BF16-SOFTFP-NEXT:    vmov.f16 s0, r0
+; BF16-SOFTFP-NEXT:    vmov.f16 r0, s0
+; BF16-SOFTFP-NEXT:    vmov s0, r0
+; BF16-SOFTFP-NEXT:    @APP
+; BF16-SOFTFP-NEXT:    vmov.f32 s0, s0
+; BF16-SOFTFP-NEXT:    @NO_APP
+; BF16-SOFTFP-NEXT:    vmov r0, s0
+; BF16-SOFTFP-NEXT:    bx lr
+;
+; SIMD-BF16-SOFTFP-LABEL: half_w:
+; SIMD-BF16-SOFTFP:       @ %bb.0: @ %entry
+; SIMD-BF16-SOFTFP-NEXT:    vmov.f16 s0, r0
+; SIMD-BF16-SOFTFP-NEXT:    vmov.f16 r0, s0
+; SIMD-BF16-SOFTFP-NEXT:    vmov s0, r0
+; SIMD-BF16-SOFTFP-NEXT:    @APP
+; SIMD-BF16-SOFTFP-NEXT:    vmov.f32 s0, s0
+; SIMD-BF16-SOFTFP-NEXT:    @NO_APP
+; SIMD-BF16-SOFTFP-NEXT:    vmov r0, s0
+; SIMD-BF16-SOFTFP-NEXT:    bx lr
+;
+; BF16-HARD-LABEL: half_w:
+; BF16-HARD:       @ %bb.0: @ %entry
+; BF16-HARD-NEXT:    vmov.f16 r0, s0
+; BF16-HARD-NEXT:    vmov s0, r0
+; BF16-HARD-NEXT:    @APP
+; BF16-HARD-NEXT:    vmov.f32 s0, s0
+; BF16-HARD-NEXT:    @NO_APP
+; BF16-HARD-NEXT:    bx lr
+;
+; SIMD-BF16-HARD-LABEL: half_w:
+; SIMD-BF16-HARD:       @ %bb.0: @ %entry
+; SIMD-BF16-HARD-NEXT:    vmov.f16 r0, s0
+; SIMD-BF16-HARD-NEXT:    vmov s0, r0
+; SIMD-BF16-HARD-NEXT:    @APP
+; SIMD-BF16-HARD-NEXT:    vmov.f32 s0, s0
+; SIMD-BF16-HARD-NEXT:    @NO_APP
+; SIMD-BF16-HARD-NEXT:    bx lr
 entry:
   %0 = tail call half asm "vmov $0, $1", "=w,w"(half %x)
   ret half %0
 }
 
+define half @half_x(half %x) nounwind {
+; NO-FP16-SOFTFP-LABEL: half_x:
+; NO-FP16-SOFTFP:       @ %bb.0: @ %entry
+; NO-FP16-SOFTFP-NEXT:    vmov s0, r0
+; NO-FP16-SOFTFP-NEXT:    @APP
+; NO-FP16-SOFTFP-NEXT:    vmov.f32 s0, s0
+; NO-FP16-SOFTFP-NEXT:    @NO_APP
+; NO-FP16-SOFTFP-NEXT:    vmov r0, s0
+; NO-FP16-SOFTFP-NEXT:    bx lr
+;
+; NO-FP16-HARD-LABEL: half_x:
+; NO-FP16-HARD:       @ %bb.0: @ %entry
+; NO-FP16-HARD-NEXT:    @APP
+; NO-FP16-HARD-NEXT:    vmov.f32 s0, s0
+; NO-FP16-HARD-NEXT:    @NO_APP
+; NO-FP16-HARD-NEXT:    bx lr
+;
+; FP16-SOFTFP-LABEL: half_x:
+; FP16-SOFTFP:       @ %bb.0: @ %entry
+; FP16-SOFTFP-NEXT:    vmov.f16 s0, r0
+; FP16-SOFTFP-NEXT:    vmov.f16 r0, s0
+; FP16-SOFTFP-NEXT:    vmov s0, r0
+; FP16-SOFTFP-NEXT:    @APP
+; FP16-SOFTFP-NEXT:    vmov.f32 s0, s0
+; FP16-SOFTFP-NEXT:    @NO_APP
+; FP16-SOFTFP-NEXT:    vmov r0, s0
+; FP16-SOFTFP-NEXT:    bx lr
+;
+; FP16-HARD-LABEL: half_x:
+; FP16-HARD:       @ %bb.0: @ %entry
+; FP16-HARD-NEXT:    vmov.f16 r0, s0
+; FP16-HARD-NEXT:    vmov s0, r0
+; FP16-HARD-NEXT:    @APP
+; FP16-HARD-NEXT:    vmov.f32 s0, s0
+; FP16-HARD-NEXT:    @NO_APP
+; FP16-HARD-NEXT:    bx lr
+;
+; BF16-SOFTFP-LABEL: half_x:
+; BF16-SOFTFP:       @ %bb.0: @ %entry
+; BF16-SOFTFP-NEXT:    vmov.f16 s0, r0
+; BF16-SOFTFP-NEXT:    vmov.f16 r0, s0
+; BF16-SOFTFP-NEXT:    vmov s0, r0
+; BF16-SOFTFP-NEXT:    @APP
+; BF16-SOFTFP-NEXT:    vmov.f32 s0, s0
+; BF16-SOFTFP-NEXT:    @NO_APP
+; BF16-SOFTFP-NEXT:    vmov r0, s0
+; BF16-SOFTFP-NEXT:    bx lr
+;
+; SIMD-BF16-SOFTFP-LABEL: half_x:
+; SIMD-BF16-SOFTFP:       @ %bb.0: @ %entry
+; SIMD-BF16-SOFTFP-NEXT:    vmov.f16 s0, r0
+; SIMD-BF16-SOFTFP-NEXT:    vmov.f16 r0, s0
+; SIMD-BF16-SOFTFP-NEXT:    vmov s0, r0
+; SIMD-BF16-SOFTFP-NEXT:    @APP
+; SIMD-BF16-SOFTFP-NEXT:    vmov.f32 s0, s0
+; SIMD-BF16-SOFTFP-NEXT:    @NO_APP
+; SIMD-BF16-SOFTFP-NEXT:    vmov r0, s0
+; SIMD-BF16-SOFTFP-NEXT:    bx lr
+;
+; BF16-HARD-LABEL: half_x:
+; BF16-HARD:       @ %bb.0: @ %entry
+; BF16-HARD-NEXT:    vmov.f16 r0, s0
+; BF16-HARD-NEXT:    vmov s0, r0
+; BF16-HARD-NEXT:    @APP
+; BF16-HARD-NEXT:    vmov.f32 s0, s0
+; BF16-HARD-NEXT:    @NO_APP
+; BF16-HARD-NEXT:    bx lr
+;
+; SIMD-BF16-HARD-LABEL: half_x:
+; SIMD-BF16-HARD:       @ %bb.0: @ %entry
+; SIMD-BF16-HARD-NEXT:    vmov.f16 r0, s0
+; SIMD-BF16-HARD-NEXT:    vmov s0, r0
+; SIMD-BF16-HARD-NEXT:    @APP
+; SIMD-BF16-HARD-NEXT:    vmov.f32 s0, s0
+; SIMD-BF16-HARD-NEXT:    @NO_APP
+; SIMD-BF16-HARD-NEXT:    bx lr
+entry:
+  %0 = tail call half asm "vmov $0, $1", "=x,x"(half %x)
+  ret half %0
+}
+
 define bfloat @bf16_t(bfloat %x) nounwind {
 ; NO-FP16-SOFTFP-LABEL: bf16_t:
 ; NO-FP16-SOFTFP:       @ %bb.0: @ %entry
@@ -127,6 +309,42 @@ define bfloat @bf16_t(bfloat %x) nounwind {
 ; FP16-HARD-NEXT:    vmov.f32 s0, s0
 ; FP16-HARD-NEXT:    @NO_APP
 ; FP16-HARD-NEXT:    bx lr
+;
+; BF16-SOFTFP-LABEL: bf16_t:
+; BF16-SOFTFP:       @ %bb.0: @ %entry
+; BF16-SOFTFP-NEXT:    vmov s0, r0
+; BF16-SOFTFP-NEXT:    @APP
+; BF16-SOFTFP-NEXT:    vmov.f32 s0, s0
+; BF16-SOFTFP-NEXT:    @NO_APP
+; BF16-SOFTFP-NEXT:    vmov r0, s0
+; BF16-SOFTFP-NEXT:    bx lr
+;
+; SIMD-BF16-SOFTFP-LABEL: bf16_t:
+; SIMD-BF16-SOFTFP:       @ %bb.0: @ %entry
+; SIMD-BF16-SOFTFP-NEXT:    vmov.f16 s0, r0
+; SIMD-BF16-SOFTFP-NEXT:    vmov.f16 r0, s0
+; SIMD-BF16-SOFTFP-NEXT:    vmov s0, r0
+; SIMD-BF16-SOFTFP-NEXT:    @APP
+; SIMD-BF16-SOFTFP-NEXT:    vmov.f32 s0, s0
+; SIMD-BF16-SOFTFP-NEXT:    @NO_APP
+; SIMD-BF16-SOFTFP-NEXT:    vmov r0, s0
+; SIMD-BF16-SOFTFP-NEXT:    bx lr
+;
+; BF16-HARD-LABEL: bf16_t:
+; BF16-HARD:       @ %bb.0: @ %entry
+; BF16-HARD-NEXT:    @APP
+; BF16-HARD-NEXT:    vmov.f32 s0, s0
+; BF16-HARD-NEXT:    @NO_APP
+; BF16-HARD-NEXT:    bx lr
+;
+; SIMD-BF16-HARD-LABEL: bf16_t:
+; SIMD-BF16-HARD:       @ %bb.0: @ %entry
+; SIMD-BF16-HARD-NEXT:    vmov.f16 r0, s0
+; SIMD-BF16-HARD-NEXT:    vmov s0, r0
+; SIMD-BF16-HARD-NEXT:    @APP
+; SIMD-BF16-HARD-NEXT:    vmov.f32 s0, s0
+; SIMD-BF16-HARD-NEXT:    @NO_APP
+; SIMD-BF16-HARD-NEXT:    bx lr
 entry:
   %0 = tail call bfloat asm "vmov $0, $1", "=t,t"(bfloat %x)
   ret bfloat %0
@@ -164,10 +382,116 @@ define bfloat @bf16_w(bfloat %x) nounwind {
 ; FP16-HARD-NEXT:    vmov.f32 s0, s0
 ; FP16-HARD-NEXT:    @NO_APP
 ; FP16-HARD-NEXT:    bx lr
+;
+; BF16-SOFTFP-LABEL: bf16_w:
+; BF16-SOFTFP:       @ %bb.0: @ %entry
+; BF16-SOFTFP-NEXT:    vmov s0, r0
+; BF16-SOFTFP-NEXT:    @APP
+; BF16-SOFTFP-NEXT:    vmov.f32 s0, s0
+; BF16-SOFTFP-NEXT:    @NO_APP
+; BF16-SOFTFP-NEXT:    vmov r0, s0
+; BF16-SOFTFP-NEXT:    bx lr
+;
+; SIMD-BF16-SOFTFP-LABEL: bf16_w:
+; SIMD-BF16-SOFTFP:       @ %bb.0: @ %entry
+; SIMD-BF16-SOFTFP-NEXT:    vmov.f16 s0, r0
+; SIMD-BF16-SOFTFP-NEXT:    vmov.f16 r0, s0
+; SIMD-BF16-SOFTFP-NEXT:    vmov s0, r0
+; SIMD-BF16-SOFTFP-NEXT:    @APP
+; SIMD-BF16-SOFTFP-NEXT:    vmov.f32 s0, s0
+; SIMD-BF16-SOFTFP-NEXT:    @NO_APP
+; SIMD-BF16-SOFTFP-NEXT:    vmov r0, s0
+; SIMD-BF16-SOFTFP-NEXT:    bx lr
+;
+; BF16-HARD-LABEL: bf16_w:
+; BF16-HARD:       @ %bb.0: @ %entry
+; BF16-HARD-NEXT:    @APP
+; BF16-HARD-NEXT:    vmov.f32 s0, s0
+; BF16-HARD-NEXT:    @NO_APP
+; BF16-HARD-NEXT:    bx lr
+;
+; SIMD-BF16-HARD-LABEL: bf16_w:
+; SIMD-BF16-HARD:       @ %bb.0: @ %entry
+; SIMD-BF16-HARD-NEXT:    vmov.f16 r0, s0
+; SIMD-BF16-HARD-NEXT:    vmov s0, r0
+; SIMD-BF16-HARD-NEXT:    @APP
+; SIMD-BF16-HARD-NEXT:    vmov.f32 s0, s0
+; SIMD-BF16-HARD-NEXT:    @NO_APP
+; SIMD-BF16-HARD-NEXT:    bx lr
 entry:
   %0 = tail call bfloat asm "vmov $0, $1", "=w,w"(bfloat %x)
   ret bfloat %0
 }
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; BF16-HARD: {{.*}}
-; BF16-SOFTFP: {{.*}}
+
+define bfloat @bf16_x(bfloat %x) nounwind {
+; NO-FP16-SOFTFP-LABEL: bf16_x:
+; NO-FP16-SOFTFP:       @ %bb.0: @ %entry
+; NO-FP16-SOFTFP-NEXT:    vmov s0, r0
+; NO-FP16-SOFTFP-NEXT:    @APP
+; NO-FP16-SOFTFP-NEXT:    vmov.f32 s0, s0
+; NO-FP16-SOFTFP-NEXT:    @NO_APP
+; NO-FP16-SOFTFP-NEXT:    vmov r0, s0
+; NO-FP16-SOFTFP-NEXT:    bx lr
+;
+; NO-FP16-HARD-LABEL: bf16_x:
+; NO-FP16-HARD:       @ %bb.0: @ %entry
+; NO-FP16-HARD-NEXT:    @APP
+; NO-FP16-HARD-NEXT:    vmov.f32 s0, s0
+; NO-FP16-HARD-NEXT:    @NO_APP
+; NO-FP16-HARD-NEXT:    bx lr
+;
+; FP16-SOFTFP-LABEL: bf16_x:
+; FP16-SOFTFP:       @ %bb.0: @ %entry
+; FP16-SOFTFP-NEXT:    vmov s0, r0
+; FP16-SOFTFP-NEXT:    @APP
+; FP16-SOFTFP-NEXT:    vmov.f32 s0, s0
+; FP16-SOFTFP-NEXT:    @NO_APP
+; FP16-SOFTFP-NEXT:    vmov r0, s0
+; FP16-SOFTFP-NEXT:    bx lr
+;
+; FP16-HARD-LABEL: bf16_x:
+; FP16-HARD:       @ %bb.0: @ %entry
+; FP16-HARD-NEXT:    @APP
+; FP16-HARD-NEXT:    vmov.f32 s0, s0
+; FP16-HARD-NEXT:    @NO_APP
+; FP16-HARD-NEXT:    bx lr
+;
+; BF16-SOFTFP-LABEL: bf16_x:
+; BF16-SOFTFP:       @ %bb.0: @ %entry
+; BF16-SOFTFP-NEXT:    vmov s0, r0
+; BF16-SOFTFP-NEXT:    @APP
+; BF16-SOFTFP-NEXT:    vmov.f32 s0, s0
+; BF16-SOFTFP-NEXT:    @NO_APP
+; BF16-SOFTFP-NEXT:    vmov r0, s0
+; BF16-SOFTFP-NEXT:    bx lr
+;
+; SIMD-BF16-SOFTFP-LABEL: bf16_x:
+; SIMD-BF16-SOFTFP:       @ %bb.0: @ %entry
+; SIMD-BF16-SOFTFP-NEXT:    vmov.f16 s0, r0
+; SIMD-BF16-SOFTFP-NEXT:    vmov.f16 r0, s0
+; SIMD-BF16-SOFTFP-NEXT:    vmov s0, r0
+; SIMD-BF16-SOFTFP-NEXT:    @APP
+; SIMD-BF16-SOFTFP-NEXT:    vmov.f32 s0, s0
+; SIMD-BF16-SOFTFP-NEXT:    @NO_APP
+; SIMD-BF16-SOFTFP-NEXT:    vmov r0, s0
+; SIMD-BF16-SOFTFP-NEXT:    bx lr
+;
+; BF16-HARD-LABEL: bf16_x:
+; BF16-HARD:       @ %bb.0: @ %entry
+; BF16-HARD-NEXT:    @APP
+; BF16-HARD-NEXT:    vmov.f32 s0, s0
+; BF16-HARD-NEXT:    @NO_APP
+; BF16-HARD-NEXT:    bx lr
+;
+; SIMD-BF16-HARD-LABEL: bf16_x:
+; SIMD-BF16-HARD:       @ %bb.0: @ %entry
+; SIMD-BF16-HARD-NEXT:    vmov.f16 r0, s0
+; SIMD-BF16-HARD-NEXT:    vmov s0, r0
+; SIMD-BF16-HARD-NEXT:    @APP
+; SIMD-BF16-HARD-NEXT:    vmov.f32 s0, s0
+; SIMD-BF16-HARD-NEXT:    @NO_APP
+; SIMD-BF16-HARD-NEXT:    bx lr
+entry:
+  %0 = tail call bfloat asm "vmov $0, $1", "=x,x"(bfloat %x)
+  ret bfloat %0
+}