From 1c81adf362ec79750850dc5ecb0bf3e60399e54f Mon Sep 17 00:00:00 2001
From: Aart Bik <ajcbik@google.com>
Date: Thu, 12 Dec 2019 14:11:27 -0800
Subject: [PATCH] [VectorOps] Add lowering of vector.shuffle to LLVM IR

For example, a shuffle

%1 = vector.shuffle %arg0, %arg1 [0 : i32, 1 : i32] : vector<2xf32>, vector<2xf32>

becomes a direct LLVM shuffle

%0 = llvm.shufflevector %arg0, %arg1 [0 : i32, 1 : i32] : !llvm<"<2 x float>">, !llvm<"<2 x float>">

but

%1 = vector.shuffle %a, %b [1 : i32, 0 : i32, 2 : i32] : vector<1x4xf32>, vector<2x4xf32>

becomes the more elaborate (note the index permutation that drives
argument selection for the extract operations)

%0 = llvm.mlir.undef : !llvm<"[3 x <4 x float>]">
%1 = llvm.extractvalue %arg1[0] : !llvm<"[2 x <4 x float>]">
%2 = llvm.insertvalue %1, %0[0] : !llvm<"[3 x <4 x float>]">
%3 = llvm.extractvalue %arg0[0] : !llvm<"[1 x <4 x float>]">
%4 = llvm.insertvalue %3, %2[1] : !llvm<"[3 x <4 x float>]">
%5 = llvm.extractvalue %arg1[1] : !llvm<"[2 x <4 x float>]">
%6 = llvm.insertvalue %5, %4[2] : !llvm<"[3 x <4 x float>]">

PiperOrigin-RevId: 285268164
---
 .../VectorToLLVM/ConvertVectorToLLVM.cpp           | 169 ++++++---
 .../Conversion/VectorToLLVM/vector-to-llvm.mlir    | 380 ++++++++++++---------
 2 files changed, 329 insertions(+), 220 deletions(-)
diff --git a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
index 17fb933..d4c27a6 100644
--- a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
+++ b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
@@ -61,6 +61,38 @@ static VectorType reducedVectorTypeBack(VectorType tp) {
   return VectorType::get(tp.getShape().take_back(), tp.getElementType());
 }
 
+// Inserts val2 into val1 at pos: insertelement for rank 1, else insertvalue.
+static Value *insertOne(ConversionPatternRewriter &rewriter,
+                        LLVMTypeConverter &lowering, Location loc, Value *val1,
+                        Value *val2, Type llvmType, int64_t rank, int64_t pos) {
+  // Higher ranks take the position as a static array attribute.
+  if (rank != 1)
+    return rewriter.create<LLVM::InsertValueOp>(loc, llvmType, val1, val2,
+                                                rewriter.getI64ArrayAttr(pos));
+  // Rank 1 takes the position as a constant of the converted index type.
+  auto indexType = rewriter.getIndexType();
+  Value *idx = rewriter.create<LLVM::ConstantOp>(
+      loc, lowering.convertType(indexType),
+      rewriter.getIntegerAttr(indexType, pos));
+  return rewriter.create<LLVM::InsertElementOp>(loc, llvmType, val1, val2, idx);
+}
+
+// Extracts from val at pos: extractelement for rank 1, else extractvalue.
+static Value *extractOne(ConversionPatternRewriter &rewriter,
+                         LLVMTypeConverter &lowering, Location loc, Value *val,
+                         Type llvmType, int64_t rank, int64_t pos) {
+  // Higher ranks take the position as a static array attribute.
+  if (rank != 1)
+    return rewriter.create<LLVM::ExtractValueOp>(loc, llvmType, val,
+                                                 rewriter.getI64ArrayAttr(pos));
+  // Rank 1 takes the position as a constant of the converted index type.
+  auto indexType = rewriter.getIndexType();
+  Value *idx = rewriter.create<LLVM::ConstantOp>(
+      loc, lowering.convertType(indexType),
+      rewriter.getIntegerAttr(indexType, pos));
+  return rewriter.create<LLVM::ExtractElementOp>(loc, llvmType, val, idx);
+}
+
 class VectorBroadcastOpConversion : public LLVMOpLowering {
 public:
   explicit VectorBroadcastOpConversion(MLIRContext *context,
@@ -77,11 +109,12 @@ public:
       return matchFailure();
     // Rewrite when the full vector type can be lowered (which
     // implies all 'reduced' types can be lowered too).
+    auto adaptor = vector::BroadcastOpOperandAdaptor(operands);
     VectorType srcVectorType =
         broadcastOp.getSourceType().dyn_cast<VectorType>();
     rewriter.replaceOp(
-        op, expandRanks(operands[0],  // source value to be expanded
-                        op->getLoc(), // location of original broadcast
+        op, expandRanks(adaptor.source(), // source value to be expanded
+                        op->getLoc(),     // location of original broadcast
                         srcVectorType, dstVectorType, rewriter));
     return matchSuccess();
   }
@@ -142,7 +175,8 @@ private:
     assert((llvmType != nullptr) && "unlowerable vector type");
     if (rank == 1) {
       Value *undef = rewriter.create<LLVM::UndefOp>(loc, llvmType);
-      Value *expand = insertOne(undef, value, loc, llvmType, rank, 0, rewriter);
+      Value *expand =
+          insertOne(rewriter, lowering, loc, undef, value, llvmType, rank, 0);
       SmallVector<int32_t, 4> zeroValues(dim, 0);
       return rewriter.create<LLVM::ShuffleVectorOp>(
           loc, expand, undef, rewriter.getI32ArrayAttr(zeroValues));
@@ -152,7 +186,8 @@ private:
                     reducedVectorTypeFront(dstVectorType), rewriter);
     Value *result = rewriter.create<LLVM::UndefOp>(loc, llvmType);
     for (int64_t d = 0; d < dim; ++d) {
-      result = insertOne(result, expand, loc, llvmType, rank, d, rewriter);
+      result =
+          insertOne(rewriter, lowering, loc, result, expand, llvmType, rank, d);
     }
     return result;
   }
@@ -182,62 +217,86 @@ private:
     Value *result = rewriter.create<LLVM::UndefOp>(loc, llvmType);
     bool atStretch = dim != srcVectorType.getDimSize(0);
     if (rank == 1) {
+      assert(atStretch);
       Type redLlvmType = lowering.convertType(dstVectorType.getElementType());
-      if (atStretch) {
-        Value *one = extractOne(value, loc, redLlvmType, rank, 0, rewriter);
-        Value *expand =
-            insertOne(result, one, loc, llvmType, rank, 0, rewriter);
-        SmallVector<int32_t, 4> zeroValues(dim, 0);
-        return rewriter.create<LLVM::ShuffleVectorOp>(
-            loc, expand, result, rewriter.getI32ArrayAttr(zeroValues));
-      }
-      for (int64_t d = 0; d < dim; ++d) {
-        Value *one = extractOne(value, loc, redLlvmType, rank, d, rewriter);
-        result = insertOne(result, one, loc, llvmType, rank, d, rewriter);
-      }
-    } else {
-      VectorType redSrcType = reducedVectorTypeFront(srcVectorType);
-      VectorType redDstType = reducedVectorTypeFront(dstVectorType);
-      Type redLlvmType = lowering.convertType(redSrcType);
-      for (int64_t d = 0; d < dim; ++d) {
-        int64_t pos = atStretch ? 0 : d;
-        Value *one = extractOne(value, loc, redLlvmType, rank, pos, rewriter);
-        Value *expand = expandRanks(one, loc, redSrcType, redDstType, rewriter);
-        result = insertOne(result, expand, loc, llvmType, rank, d, rewriter);
-      }
+      Value *one =
+          extractOne(rewriter, lowering, loc, value, redLlvmType, rank, 0);
+      Value *expand =
+          insertOne(rewriter, lowering, loc, result, one, llvmType, rank, 0);
+      SmallVector<int32_t, 4> zeroValues(dim, 0);
+      return rewriter.create<LLVM::ShuffleVectorOp>(
+          loc, expand, result, rewriter.getI32ArrayAttr(zeroValues));
+    }
+    VectorType redSrcType = reducedVectorTypeFront(srcVectorType);
+    VectorType redDstType = reducedVectorTypeFront(dstVectorType);
+    Type redLlvmType = lowering.convertType(redSrcType);
+    for (int64_t d = 0; d < dim; ++d) {
+      int64_t pos = atStretch ? 0 : d;
+      Value *one =
+          extractOne(rewriter, lowering, loc, value, redLlvmType, rank, pos);
+      Value *expand = expandRanks(one, loc, redSrcType, redDstType, rewriter);
+      result =
+          insertOne(rewriter, lowering, loc, result, expand, llvmType, rank, d);
     }
     return result;
   }
+};
 
-  // Picks the proper sequence for inserting.
-  Value *insertOne(Value *val1, Value *val2, Location loc, Type llvmType,
-                   int64_t rank, int64_t pos,
-                   ConversionPatternRewriter &rewriter) const {
-    if (rank == 1) {
-      auto idxType = rewriter.getIndexType();
-      auto constant = rewriter.create<LLVM::ConstantOp>(
-          loc, lowering.convertType(idxType),
-          rewriter.getIntegerAttr(idxType, pos));
-      return rewriter.create<LLVM::InsertElementOp>(loc, llvmType, val1, val2,
-                                                    constant);
+// Lowers vector.shuffle to llvm.shufflevector or an extract/insert sequence.
+class VectorShuffleOpConversion : public LLVMOpLowering {
+public:
+  explicit VectorShuffleOpConversion(MLIRContext *context,
+                                     LLVMTypeConverter &typeConverter)
+      : LLVMOpLowering(vector::ShuffleOp::getOperationName(), context,
+                       typeConverter) {}
+
+  PatternMatchResult
+  matchAndRewrite(Operation *op, ArrayRef<Value *> operands,
+                  ConversionPatternRewriter &rewriter) const override {
+    auto loc = op->getLoc();
+    auto adaptor = vector::ShuffleOpOperandAdaptor(operands);
+    auto shuffleOp = cast<vector::ShuffleOp>(op);
+    auto v1Type = shuffleOp.getV1VectorType();
+    auto v2Type = shuffleOp.getV2VectorType();
+    auto vectorType = shuffleOp.getVectorType();
+    Type llvmType = lowering.convertType(vectorType);
+    auto maskArrayAttr = shuffleOp.mask();
+
+    // Bail if result type cannot be lowered.
+    if (!llvmType)
+      return matchFailure();
+
+    // Get rank and dimension sizes.
+    int64_t rank = vectorType.getRank();
+    assert(v1Type.getRank() == rank);
+    assert(v2Type.getRank() == rank);
+    int64_t v1Dim = v1Type.getDimSize(0);
+
+    // Rank-1 shuffles of identically typed operands map directly onto LLVM.
+    if (rank == 1 && v1Type == v2Type) {
+      Value *shuffle = rewriter.create<LLVM::ShuffleVectorOp>(
+          loc, adaptor.v1(), adaptor.v2(), maskArrayAttr);
+      rewriter.replaceOp(op, shuffle);
+      return matchSuccess();
     }
-    return rewriter.create<LLVM::InsertValueOp>(loc, llvmType, val1, val2,
-                                                rewriter.getI64ArrayAttr(pos));
-  }
 
-  // Picks the proper sequence for extracting.
-  Value *extractOne(Value *value, Location loc, Type llvmType, int64_t rank,
-                    int64_t pos, ConversionPatternRewriter &rewriter) const {
-    if (rank == 1) {
-      auto idxType = rewriter.getIndexType();
-      auto constant = rewriter.create<LLVM::ConstantOp>(
-          loc, lowering.convertType(idxType),
-          rewriter.getIntegerAttr(idxType, pos));
-      return rewriter.create<LLVM::ExtractElementOp>(loc, llvmType, value,
-                                                     constant);
+    // Otherwise move the values one by one. An extracted value has the result
+    // type with its leading dimension dropped (the element type for rank 1).
+    Type eltType = lowering.convertType(
+        rank == 1 ? vectorType.getElementType()
+                  : Type(reducedVectorTypeFront(vectorType)));
+    Value *insert = rewriter.create<LLVM::UndefOp>(loc, llvmType);
+    for (auto en : llvm::enumerate(maskArrayAttr)) {
+      int64_t extPos = en.value().cast<IntegerAttr>().getInt();
+      bool fromV2 = extPos >= v1Dim;
+      Value *value = fromV2 ? adaptor.v2() : adaptor.v1();
+      Value *extract = extractOne(rewriter, lowering, loc, value, eltType, rank,
+                                  fromV2 ? extPos - v1Dim : extPos);
+      insert = insertOne(rewriter, lowering, loc, insert, extract, llvmType,
+                         rank, en.index());
     }
-    return rewriter.create<LLVM::ExtractValueOp>(loc, llvmType, value,
-                                                 rewriter.getI64ArrayAttr(pos));
+    rewriter.replaceOp(op, insert);
+    return matchSuccess();
   }
 };
 
@@ -506,9 +565,9 @@ public:
 /// Populate the given list with patterns that convert from Vector to LLVM.
 void mlir::populateVectorToLLVMConversionPatterns(
     LLVMTypeConverter &converter, OwningRewritePatternList &patterns) {
-  patterns.insert<VectorBroadcastOpConversion, VectorExtractOpConversion,
-                  VectorInsertOpConversion, VectorOuterProductOpConversion,
-                  VectorTypeCastOpConversion>(
+  patterns.insert<VectorBroadcastOpConversion, VectorShuffleOpConversion,
+                  VectorExtractOpConversion, VectorInsertOpConversion,
+                  VectorOuterProductOpConversion, VectorTypeCastOpConversion>(
       converter.getDialect()->getContext(), converter);
 }
 
diff --git a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir
index 28c21f6..0c4b23f 100644
--- a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir
+++ b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir
@@ -4,231 +4,281 @@ func @broadcast_vec1d_from_scalar(%arg0: f32) -> vector<2xf32> {
   %0 = vector.broadcast %arg0 : f32 to vector<2xf32>
   return %0 : vector<2xf32>
 }
-//    CHECK-LABEL: broadcast_vec1d_from_scalar
-//          CHECK:   llvm.mlir.undef : !llvm<"<2 x float>">
-//          CHECK:   llvm.mlir.constant(0 : index) : !llvm.i64
-//          CHECK:   llvm.insertelement {{.*}}, {{.*}}[{{.*}} : !llvm.i64] : !llvm<"<2 x float>">
-//          CHECK:   llvm.shufflevector {{.*}}, {{.*}}[0 : i32, 0 : i32] : !llvm<"<2 x float>">, !llvm<"<2 x float>">
-//          CHECK:   llvm.return {{.*}} : !llvm<"<2 x float>">
+// CHECK-LABEL: broadcast_vec1d_from_scalar
+//       CHECK:   llvm.mlir.undef : !llvm<"<2 x float>">
+//       CHECK:   llvm.mlir.constant(0 : index) : !llvm.i64
+//       CHECK:   llvm.insertelement {{.*}}, {{.*}}[{{.*}} : !llvm.i64] : !llvm<"<2 x float>">
+//       CHECK:   llvm.shufflevector {{.*}}, {{.*}}[0 : i32, 0 : i32] : !llvm<"<2 x float>">, !llvm<"<2 x float>">
+//       CHECK:   llvm.return {{.*}} : !llvm<"<2 x float>">
 
 func @broadcast_vec2d_from_scalar(%arg0: f32) -> vector<2x3xf32> {
   %0 = vector.broadcast %arg0 : f32 to vector<2x3xf32>
   return %0 : vector<2x3xf32>
 }
-//    CHECK-LABEL: broadcast_vec2d_from_scalar
-//          CHECK:   llvm.mlir.undef : !llvm<"<3 x float>">
-//          CHECK:   llvm.mlir.constant(0 : index) : !llvm.i64
-//          CHECK:   llvm.insertelement {{.*}}, {{.*}}[{{.*}} : !llvm.i64] : !llvm<"<3 x float>">
-//          CHECK:   llvm.shufflevector {{.*}}, {{.*}}[0 : i32, 0 : i32, 0 : i32] : !llvm<"<3 x float>">, !llvm<"<3 x float>">
-//          CHECK:   llvm.mlir.undef : !llvm<"[2 x <3 x float>]">
-//          CHECK:   llvm.insertvalue {{.*}}, {{.*}}[0] : !llvm<"[2 x <3 x float>]">
-//          CHECK:   llvm.insertvalue {{.*}}, {{.*}}[1] : !llvm<"[2 x <3 x float>]">
-//          CHECK:   llvm.return {{.*}} : !llvm<"[2 x <3 x float>]">
+// CHECK-LABEL: broadcast_vec2d_from_scalar
+//       CHECK:   llvm.mlir.undef : !llvm<"<3 x float>">
+//       CHECK:   llvm.mlir.constant(0 : index) : !llvm.i64
+//       CHECK:   llvm.insertelement {{.*}}, {{.*}}[{{.*}} : !llvm.i64] : !llvm<"<3 x float>">
+//       CHECK:   llvm.shufflevector {{.*}}, {{.*}}[0 : i32, 0 : i32, 0 : i32] : !llvm<"<3 x float>">, !llvm<"<3 x float>">
+//       CHECK:   llvm.mlir.undef : !llvm<"[2 x <3 x float>]">
+//       CHECK:   llvm.insertvalue {{.*}}, {{.*}}[0] : !llvm<"[2 x <3 x float>]">
+//       CHECK:   llvm.insertvalue {{.*}}, {{.*}}[1] : !llvm<"[2 x <3 x float>]">
+//       CHECK:   llvm.return {{.*}} : !llvm<"[2 x <3 x float>]">
 
 func @broadcast_vec3d_from_scalar(%arg0: f32) -> vector<2x3x4xf32> {
   %0 = vector.broadcast %arg0 : f32 to vector<2x3x4xf32>
   return %0 : vector<2x3x4xf32>
 }
-//    CHECK-LABEL: broadcast_vec3d_from_scalar
-//          CHECK:   llvm.mlir.undef : !llvm<"<4 x float>">
-//          CHECK:   llvm.mlir.constant(0 : index) : !llvm.i64
-//          CHECK:   llvm.insertelement {{.*}}, {{.*}}[{{.*}} : !llvm.i64] : !llvm<"<4 x float>">
-//          CHECK:   llvm.shufflevector {{.*}}, {{.*}} [0 : i32, 0 : i32, 0 : i32, 0 : i32] : !llvm<"<4 x float>">, !llvm<"<4 x float>">
-//          CHECK:   llvm.mlir.undef : !llvm<"[3 x <4 x float>]">
-//          CHECK:   llvm.insertvalue {{.*}}, {{.*}}[0] : !llvm<"[3 x <4 x float>]">
-//          CHECK:   llvm.insertvalue {{.*}}, {{.*}}[1] : !llvm<"[3 x <4 x float>]">
-//          CHECK:   llvm.insertvalue {{.*}}, {{.*}}[2] : !llvm<"[3 x <4 x float>]">
-//          CHECK:   llvm.mlir.undef : !llvm<"[2 x [3 x <4 x float>]]">
-//          CHECK:   llvm.insertvalue {{.*}}, {{.*}}[0] : !llvm<"[2 x [3 x <4 x float>]]">
-//          CHECK:   llvm.insertvalue {{.*}}, {{.*}}[1] : !llvm<"[2 x [3 x <4 x float>]]">
-//          CHECK:   llvm.return {{.*}} : !llvm<"[2 x [3 x <4 x float>]]">
+// CHECK-LABEL: broadcast_vec3d_from_scalar
+//       CHECK:   llvm.mlir.undef : !llvm<"<4 x float>">
+//       CHECK:   llvm.mlir.constant(0 : index) : !llvm.i64
+//       CHECK:   llvm.insertelement {{.*}}, {{.*}}[{{.*}} : !llvm.i64] : !llvm<"<4 x float>">
+//       CHECK:   llvm.shufflevector {{.*}}, {{.*}} [0 : i32, 0 : i32, 0 : i32, 0 : i32] : !llvm<"<4 x float>">, !llvm<"<4 x float>">
+//       CHECK:   llvm.mlir.undef : !llvm<"[3 x <4 x float>]">
+//       CHECK:   llvm.insertvalue {{.*}}, {{.*}}[0] : !llvm<"[3 x <4 x float>]">
+//       CHECK:   llvm.insertvalue {{.*}}, {{.*}}[1] : !llvm<"[3 x <4 x float>]">
+//       CHECK:   llvm.insertvalue {{.*}}, {{.*}}[2] : !llvm<"[3 x <4 x float>]">
+//       CHECK:   llvm.mlir.undef : !llvm<"[2 x [3 x <4 x float>]]">
+//       CHECK:   llvm.insertvalue {{.*}}, {{.*}}[0] : !llvm<"[2 x [3 x <4 x float>]]">
+//       CHECK:   llvm.insertvalue {{.*}}, {{.*}}[1] : !llvm<"[2 x [3 x <4 x float>]]">
+//       CHECK:   llvm.return {{.*}} : !llvm<"[2 x [3 x <4 x float>]]">
 
 func @broadcast_vec1d_from_vec1d(%arg0: vector<2xf32>) -> vector<2xf32> {
   %0 = vector.broadcast %arg0 : vector<2xf32> to vector<2xf32>
   return %0 : vector<2xf32>
 }
-//    CHECK-LABEL: broadcast_vec1d_from_vec1d
-//          CHECK:   llvm.return {{.*}} : !llvm<"<2 x float>">
+// CHECK-LABEL: broadcast_vec1d_from_vec1d
+//       CHECK:   llvm.return {{.*}} : !llvm<"<2 x float>">
 
 func @broadcast_vec2d_from_vec1d(%arg0: vector<2xf32>) -> vector<3x2xf32> {
   %0 = vector.broadcast %arg0 : vector<2xf32> to vector<3x2xf32>
   return %0 : vector<3x2xf32>
 }
-//    CHECK-LABEL: broadcast_vec2d_from_vec1d
-//          CHECK:   llvm.mlir.undef : !llvm<"[3 x <2 x float>]">
-//          CHECK:   llvm.insertvalue {{.*}}, {{.*}}[0] : !llvm<"[3 x <2 x float>]">
-//          CHECK:   llvm.insertvalue {{.*}}, {{.*}}[1] : !llvm<"[3 x <2 x float>]">
-//          CHECK:   llvm.insertvalue {{.*}}, {{.*}}[2] : !llvm<"[3 x <2 x float>]">
-//          CHECK:   llvm.return {{.*}} : !llvm<"[3 x <2 x float>]">
+// CHECK-LABEL: broadcast_vec2d_from_vec1d
+//       CHECK:   llvm.mlir.undef : !llvm<"[3 x <2 x float>]">
+//       CHECK:   llvm.insertvalue {{.*}}, {{.*}}[0] : !llvm<"[3 x <2 x float>]">
+//       CHECK:   llvm.insertvalue {{.*}}, {{.*}}[1] : !llvm<"[3 x <2 x float>]">
+//       CHECK:   llvm.insertvalue {{.*}}, {{.*}}[2] : !llvm<"[3 x <2 x float>]">
+//       CHECK:   llvm.return {{.*}} : !llvm<"[3 x <2 x float>]">
 
 func @broadcast_vec3d_from_vec1d(%arg0: vector<2xf32>) -> vector<4x3x2xf32> {
   %0 = vector.broadcast %arg0 : vector<2xf32> to vector<4x3x2xf32>
   return %0 : vector<4x3x2xf32>
 }
-//    CHECK-LABEL: broadcast_vec3d_from_vec1d
-//          CHECK:   llvm.mlir.undef : !llvm<"[3 x <2 x float>]">
-//          CHECK:   llvm.insertvalue {{.*}}, {{.*}}[0] : !llvm<"[3 x <2 x float>]">
-//          CHECK:   llvm.insertvalue {{.*}}, {{.*}}[1] : !llvm<"[3 x <2 x float>]">
-//          CHECK:   llvm.insertvalue {{.*}}, {{.*}}[2] : !llvm<"[3 x <2 x float>]">
-//          CHECK:   llvm.mlir.undef : !llvm<"[4 x [3 x <2 x float>]]">
-//          CHECK:   llvm.insertvalue {{.*}}, {{.*}}[0] : !llvm<"[4 x [3 x <2 x float>]]">
-//          CHECK:   llvm.insertvalue {{.*}}, {{.*}}[1] : !llvm<"[4 x [3 x <2 x float>]]">
-//          CHECK:   llvm.insertvalue {{.*}}, {{.*}}[2] : !llvm<"[4 x [3 x <2 x float>]]">
-//          CHECK:   llvm.insertvalue {{.*}}, {{.*}}[3] : !llvm<"[4 x [3 x <2 x float>]]">
-//          CHECK:   llvm.return {{.*}} : !llvm<"[4 x [3 x <2 x float>]]">
+// CHECK-LABEL: broadcast_vec3d_from_vec1d
+//       CHECK:   llvm.mlir.undef : !llvm<"[3 x <2 x float>]">
+//       CHECK:   llvm.insertvalue {{.*}}, {{.*}}[0] : !llvm<"[3 x <2 x float>]">
+//       CHECK:   llvm.insertvalue {{.*}}, {{.*}}[1] : !llvm<"[3 x <2 x float>]">
+//       CHECK:   llvm.insertvalue {{.*}}, {{.*}}[2] : !llvm<"[3 x <2 x float>]">
+//       CHECK:   llvm.mlir.undef : !llvm<"[4 x [3 x <2 x float>]]">
+//       CHECK:   llvm.insertvalue {{.*}}, {{.*}}[0] : !llvm<"[4 x [3 x <2 x float>]]">
+//       CHECK:   llvm.insertvalue {{.*}}, {{.*}}[1] : !llvm<"[4 x [3 x <2 x float>]]">
+//       CHECK:   llvm.insertvalue {{.*}}, {{.*}}[2] : !llvm<"[4 x [3 x <2 x float>]]">
+//       CHECK:   llvm.insertvalue {{.*}}, {{.*}}[3] : !llvm<"[4 x [3 x <2 x float>]]">
+//       CHECK:   llvm.return {{.*}} : !llvm<"[4 x [3 x <2 x float>]]">
 
 func @broadcast_vec3d_from_vec2d(%arg0: vector<3x2xf32>) -> vector<4x3x2xf32> {
   %0 = vector.broadcast %arg0 : vector<3x2xf32> to vector<4x3x2xf32>
   return %0 : vector<4x3x2xf32>
 }
-//    CHECK-LABEL: broadcast_vec3d_from_vec2d
-//          CHECK:   llvm.mlir.undef : !llvm<"[4 x [3 x <2 x float>]]">
-//          CHECK:   llvm.insertvalue {{.*}}, {{.*}}[0] : !llvm<"[4 x [3 x <2 x float>]]">
-//          CHECK:   llvm.insertvalue {{.*}}, {{.*}}[1] : !llvm<"[4 x [3 x <2 x float>]]">
-//          CHECK:   llvm.insertvalue {{.*}}, {{.*}}[2] : !llvm<"[4 x [3 x <2 x float>]]">
-//          CHECK:   llvm.insertvalue {{.*}}, {{.*}}[3] : !llvm<"[4 x [3 x <2 x float>]]">
-//          CHECK:   llvm.return {{.*}} : !llvm<"[4 x [3 x <2 x float>]]">
+// CHECK-LABEL: broadcast_vec3d_from_vec2d
+//       CHECK:   llvm.mlir.undef : !llvm<"[4 x [3 x <2 x float>]]">
+//       CHECK:   llvm.insertvalue {{.*}}, {{.*}}[0] : !llvm<"[4 x [3 x <2 x float>]]">
+//       CHECK:   llvm.insertvalue {{.*}}, {{.*}}[1] : !llvm<"[4 x [3 x <2 x float>]]">
+//       CHECK:   llvm.insertvalue {{.*}}, {{.*}}[2] : !llvm<"[4 x [3 x <2 x float>]]">
+//       CHECK:   llvm.insertvalue {{.*}}, {{.*}}[3] : !llvm<"[4 x [3 x <2 x float>]]">
+//       CHECK:   llvm.return {{.*}} : !llvm<"[4 x [3 x <2 x float>]]">
 
 func @broadcast_stretch(%arg0: vector<1xf32>) -> vector<4xf32> {
   %0 = vector.broadcast %arg0 : vector<1xf32> to vector<4xf32>
   return %0 : vector<4xf32>
 }
-//    CHECK-LABEL: broadcast_stretch
-//          CHECK:   llvm.mlir.undef : !llvm<"<4 x float>">
-//          CHECK:   llvm.mlir.constant(0 : index) : !llvm.i64
-//          CHECK:   llvm.extractelement {{.*}}[{{.*}} : !llvm.i64] : !llvm<"<1 x float>">
-//          CHECK:   llvm.mlir.constant(0 : index) : !llvm.i64
-//          CHECK:   llvm.insertelement {{.*}}, {{.*}}[{{.*}} : !llvm.i64] : !llvm<"<4 x float>">
-//          CHECK:   llvm.shufflevector {{.*}}, {{.*}} [0 : i32, 0 : i32, 0 : i32, 0 : i32] : !llvm<"<4 x float>">, !llvm<"<4 x float>">
-//          CHECK:   llvm.return {{.*}} : !llvm<"<4 x float>">
+// CHECK-LABEL: broadcast_stretch
+//       CHECK:   llvm.mlir.undef : !llvm<"<4 x float>">
+//       CHECK:   llvm.mlir.constant(0 : index) : !llvm.i64
+//       CHECK:   llvm.extractelement {{.*}}[{{.*}} : !llvm.i64] : !llvm<"<1 x float>">
+//       CHECK:   llvm.mlir.constant(0 : index) : !llvm.i64
+//       CHECK:   llvm.insertelement {{.*}}, {{.*}}[{{.*}} : !llvm.i64] : !llvm<"<4 x float>">
+//       CHECK:   llvm.shufflevector {{.*}}, {{.*}} [0 : i32, 0 : i32, 0 : i32, 0 : i32] : !llvm<"<4 x float>">, !llvm<"<4 x float>">
+//       CHECK:   llvm.return {{.*}} : !llvm<"<4 x float>">
 
 func @broadcast_stretch_at_start(%arg0: vector<1x4xf32>) -> vector<3x4xf32> {
   %0 = vector.broadcast %arg0 : vector<1x4xf32> to vector<3x4xf32>
   return %0 : vector<3x4xf32>
 }
-//    CHECK-LABEL: broadcast_stretch_at_start
-//          CHECK:   llvm.mlir.undef : !llvm<"[3 x <4 x float>]">
-//          CHECK:   llvm.extractvalue {{.*}}[0] : !llvm<"[1 x <4 x float>]">
-//          CHECK:   llvm.insertvalue {{.*}}, {{.*}}[0] : !llvm<"[3 x <4 x float>]">
-//          CHECK:   llvm.extractvalue {{.*}}[0] : !llvm<"[1 x <4 x float>]">
-//          CHECK:   llvm.insertvalue {{.*}}, {{.*}}[1] : !llvm<"[3 x <4 x float>]">
-//          CHECK:   llvm.extractvalue {{.*}}[0] : !llvm<"[1 x <4 x float>]">
-//          CHECK:   llvm.insertvalue {{.*}}, {{.*}}[2] : !llvm<"[3 x <4 x float>]">
-//          CHECK:   llvm.return {{.*}} : !llvm<"[3 x <4 x float>]">
+// CHECK-LABEL: broadcast_stretch_at_start
+//       CHECK:   llvm.mlir.undef : !llvm<"[3 x <4 x float>]">
+//       CHECK:   llvm.extractvalue {{.*}}[0] : !llvm<"[1 x <4 x float>]">
+//       CHECK:   llvm.insertvalue {{.*}}, {{.*}}[0] : !llvm<"[3 x <4 x float>]">
+//       CHECK:   llvm.extractvalue {{.*}}[0] : !llvm<"[1 x <4 x float>]">
+//       CHECK:   llvm.insertvalue {{.*}}, {{.*}}[1] : !llvm<"[3 x <4 x float>]">
+//       CHECK:   llvm.extractvalue {{.*}}[0] : !llvm<"[1 x <4 x float>]">
+//       CHECK:   llvm.insertvalue {{.*}}, {{.*}}[2] : !llvm<"[3 x <4 x float>]">
+//       CHECK:   llvm.return {{.*}} : !llvm<"[3 x <4 x float>]">
 
 func @broadcast_stretch_at_end(%arg0: vector<4x1xf32>) -> vector<4x3xf32> {
   %0 = vector.broadcast %arg0 : vector<4x1xf32> to vector<4x3xf32>
   return %0 : vector<4x3xf32>
 }
-//    CHECK-LABEL: broadcast_stretch_at_end
-//          CHECK:   llvm.mlir.undef : !llvm<"[4 x <3 x float>]">
-//          CHECK:   llvm.extractvalue {{.*}}[0] : !llvm<"[4 x <1 x float>]">
-//          CHECK:   llvm.mlir.undef : !llvm<"<3 x float>">
-//          CHECK:   llvm.mlir.constant(0 : index) : !llvm.i64
-//          CHECK:   llvm.extractelement {{.*}}[{{.*}} : !llvm.i64] : !llvm<"<1 x float>">
-//          CHECK:   llvm.mlir.constant(0 : index) : !llvm.i64
-//          CHECK:   llvm.insertelement {{.*}}, {{.*}}[{{.*}} : !llvm.i64] : !llvm<"<3 x float>">
-//          CHECK:   llvm.shufflevector {{.*}}, {{.*}} [0 : i32, 0 : i32, 0 : i32] : !llvm<"<3 x float>">, !llvm<"<3 x float>">
-//          CHECK:   llvm.insertvalue {{.*}}, {{.*}}[0] : !llvm<"[4 x <3 x float>]">
-//          CHECK:   llvm.extractvalue {{.*}}[1] : !llvm<"[4 x <1 x float>]">
-//          CHECK:   llvm.mlir.undef : !llvm<"<3 x float>">
-//          CHECK:   llvm.mlir.constant(0 : index) : !llvm.i64
-//          CHECK:   llvm.extractelement {{.*}}[{{.*}} : !llvm.i64] : !llvm<"<1 x float>">
-//          CHECK:   llvm.mlir.constant(0 : index) : !llvm.i64
-//          CHECK:   llvm.insertelement {{.*}}, {{.*}}[{{.*}} : !llvm.i64] : !llvm<"<3 x float>">
-//          CHECK:   llvm.shufflevector {{.*}}, {{.*}} [0 : i32, 0 : i32, 0 : i32] : !llvm<"<3 x float>">, !llvm<"<3 x float>">
-//          CHECK:   llvm.insertvalue {{.*}}, {{.*}}[1] : !llvm<"[4 x <3 x float>]">
-//          CHECK:   llvm.extractvalue {{.*}}[2] : !llvm<"[4 x <1 x float>]">
-//          CHECK:   llvm.mlir.undef : !llvm<"<3 x float>">
-//          CHECK:   llvm.mlir.constant(0 : index) : !llvm.i64
-//          CHECK:   llvm.extractelement {{.*}}[{{.*}} : !llvm.i64] : !llvm<"<1 x float>">
-//          CHECK:   llvm.mlir.constant(0 : index) : !llvm.i64
-//          CHECK:   llvm.insertelement {{.*}}, {{.*}}[{{.*}} : !llvm.i64] : !llvm<"<3 x float>">
-//          CHECK:   llvm.shufflevector {{.*}}, {{.*}} [0 : i32, 0 : i32, 0 : i32] : !llvm<"<3 x float>">, !llvm<"<3 x float>">
-//          CHECK:   llvm.insertvalue {{.*}}, {{.*}}[2] : !llvm<"[4 x <3 x float>]">
-//          CHECK:   llvm.extractvalue {{.*}}[3] : !llvm<"[4 x <1 x float>]">
-//          CHECK:   llvm.mlir.undef : !llvm<"<3 x float>">
-//          CHECK:   llvm.mlir.constant(0 : index) : !llvm.i64
-//          CHECK:   llvm.extractelement {{.*}}[{{.*}} : !llvm.i64] : !llvm<"<1 x float>">
-//          CHECK:   llvm.mlir.constant(0 : index) : !llvm.i64
-//          CHECK:   llvm.insertelement {{.*}}, {{.*}}[{{.*}} : !llvm.i64] : !llvm<"<3 x float>">
-//          CHECK:   llvm.shufflevector {{.*}}, {{.*}} [0 : i32, 0 : i32, 0 : i32] : !llvm<"<3 x float>">, !llvm<"<3 x float>">
-//          CHECK:   llvm.insertvalue {{.*}}, {{.*}}[3] : !llvm<"[4 x <3 x float>]">
-//          CHECK:   llvm.return {{.*}} : !llvm<"[4 x <3 x float>]">
+// CHECK-LABEL: broadcast_stretch_at_end
+//       CHECK:   llvm.mlir.undef : !llvm<"[4 x <3 x float>]">
+//       CHECK:   llvm.extractvalue {{.*}}[0] : !llvm<"[4 x <1 x float>]">
+//       CHECK:   llvm.mlir.undef : !llvm<"<3 x float>">
+//       CHECK:   llvm.mlir.constant(0 : index) : !llvm.i64
+//       CHECK:   llvm.extractelement {{.*}}[{{.*}} : !llvm.i64] : !llvm<"<1 x float>">
+//       CHECK:   llvm.mlir.constant(0 : index) : !llvm.i64
+//       CHECK:   llvm.insertelement {{.*}}, {{.*}}[{{.*}} : !llvm.i64] : !llvm<"<3 x float>">
+//       CHECK:   llvm.shufflevector {{.*}}, {{.*}} [0 : i32, 0 : i32, 0 : i32] : !llvm<"<3 x float>">, !llvm<"<3 x float>">
+//       CHECK:   llvm.insertvalue {{.*}}, {{.*}}[0] : !llvm<"[4 x <3 x float>]">
+//       CHECK:   llvm.extractvalue {{.*}}[1] : !llvm<"[4 x <1 x float>]">
+//       CHECK:   llvm.mlir.undef : !llvm<"<3 x float>">
+//       CHECK:   llvm.mlir.constant(0 : index) : !llvm.i64
+//       CHECK:   llvm.extractelement {{.*}}[{{.*}} : !llvm.i64] : !llvm<"<1 x float>">
+//       CHECK:   llvm.mlir.constant(0 : index) : !llvm.i64
+//       CHECK:   llvm.insertelement {{.*}}, {{.*}}[{{.*}} : !llvm.i64] : !llvm<"<3 x float>">
+//       CHECK:   llvm.shufflevector {{.*}}, {{.*}} [0 : i32, 0 : i32, 0 : i32] : !llvm<"<3 x float>">, !llvm<"<3 x float>">
+//       CHECK:   llvm.insertvalue {{.*}}, {{.*}}[1] : !llvm<"[4 x <3 x float>]">
+//       CHECK:   llvm.extractvalue {{.*}}[2] : !llvm<"[4 x <1 x float>]">
+//       CHECK:   llvm.mlir.undef : !llvm<"<3 x float>">
+//       CHECK:   llvm.mlir.constant(0 : index) : !llvm.i64
+//       CHECK:   llvm.extractelement {{.*}}[{{.*}} : !llvm.i64] : !llvm<"<1 x float>">
+//       CHECK:   llvm.mlir.constant(0 : index) : !llvm.i64
+//       CHECK:   llvm.insertelement {{.*}}, {{.*}}[{{.*}} : !llvm.i64] : !llvm<"<3 x float>">
+//       CHECK:   llvm.shufflevector {{.*}}, {{.*}} [0 : i32, 0 : i32, 0 : i32] : !llvm<"<3 x float>">, !llvm<"<3 x float>">
+//       CHECK:   llvm.insertvalue {{.*}}, {{.*}}[2] : !llvm<"[4 x <3 x float>]">
+//       CHECK:   llvm.extractvalue {{.*}}[3] : !llvm<"[4 x <1 x float>]">
+//       CHECK:   llvm.mlir.undef : !llvm<"<3 x float>">
+//       CHECK:   llvm.mlir.constant(0 : index) : !llvm.i64
+//       CHECK:   llvm.extractelement {{.*}}[{{.*}} : !llvm.i64] : !llvm<"<1 x float>">
+//       CHECK:   llvm.mlir.constant(0 : index) : !llvm.i64
+//       CHECK:   llvm.insertelement {{.*}}, {{.*}}[{{.*}} : !llvm.i64] : !llvm<"<3 x float>">
+//       CHECK:   llvm.shufflevector {{.*}}, {{.*}} [0 : i32, 0 : i32, 0 : i32] : !llvm<"<3 x float>">, !llvm<"<3 x float>">
+//       CHECK:   llvm.insertvalue {{.*}}, {{.*}}[3] : !llvm<"[4 x <3 x float>]">
+//       CHECK:   llvm.return {{.*}} : !llvm<"[4 x <3 x float>]">
 
 func @broadcast_stretch_in_middle(%arg0: vector<4x1x2xf32>) -> vector<4x3x2xf32> {
   %0 = vector.broadcast %arg0 : vector<4x1x2xf32> to vector<4x3x2xf32>
   return %0 : vector<4x3x2xf32>
 }
-//    CHECK-LABEL: broadcast_stretch_in_middle
-//          CHECK:   llvm.mlir.undef : !llvm<"[4 x [3 x <2 x float>]]">
-//          CHECK:   llvm.extractvalue {{.*}}[0] : !llvm<"[4 x [1 x <2 x float>]]">
-//          CHECK:   llvm.mlir.undef : !llvm<"[3 x <2 x float>]">
-//          CHECK:   llvm.extractvalue {{.*}}[0] : !llvm<"[1 x <2 x float>]">
-//          CHECK:   llvm.insertvalue {{.*}}, {{.*}}[0] : !llvm<"[3 x <2 x float>]">
-//          CHECK:   llvm.extractvalue {{.*}}[0] : !llvm<"[1 x <2 x float>]">
-//          CHECK:   llvm.insertvalue {{.*}}, {{.*}}[1] : !llvm<"[3 x <2 x float>]">
-//          CHECK:   llvm.extractvalue {{.*}}[0] : !llvm<"[1 x <2 x float>]">
-//          CHECK:   llvm.insertvalue {{.*}}, {{.*}}[2] : !llvm<"[3 x <2 x float>]">
-//          CHECK:   llvm.insertvalue {{.*}}, {{.*}}[0] : !llvm<"[4 x [3 x <2 x float>]]">
-//          CHECK:   llvm.extractvalue {{.*}}[1] : !llvm<"[4 x [1 x <2 x float>]]">
-//          CHECK:   llvm.mlir.undef : !llvm<"[3 x <2 x float>]">
-//          CHECK:   llvm.extractvalue {{.*}}[0] : !llvm<"[1 x <2 x float>]">
-//          CHECK:   llvm.insertvalue {{.*}}, {{.*}}[0] : !llvm<"[3 x <2 x float>]">
-//          CHECK:   llvm.extractvalue {{.*}}[0] : !llvm<"[1 x <2 x float>]">
-//          CHECK:   llvm.insertvalue {{.*}}, {{.*}}[1] : !llvm<"[3 x <2 x float>]">
-//          CHECK:   llvm.extractvalue {{.*}}[0] : !llvm<"[1 x <2 x float>]">
-//          CHECK:   llvm.insertvalue {{.*}}, {{.*}}[2] : !llvm<"[3 x <2 x float>]">
-//          CHECK:   llvm.insertvalue {{.*}}, {{.*}}[1] : !llvm<"[4 x [3 x <2 x float>]]">
-//          CHECK:   llvm.extractvalue {{.*}}[2] : !llvm<"[4 x [1 x <2 x float>]]">
-//          CHECK:   llvm.mlir.undef : !llvm<"[3 x <2 x float>]">
-//          CHECK:   llvm.extractvalue {{.*}}[0] : !llvm<"[1 x <2 x float>]">
-//          CHECK:   llvm.insertvalue {{.*}}, {{.*}}[0] : !llvm<"[3 x <2 x float>]">
-//          CHECK:   llvm.extractvalue {{.*}}[0] : !llvm<"[1 x <2 x float>]">
-//          CHECK:   llvm.insertvalue {{.*}}, {{.*}}[1] : !llvm<"[3 x <2 x float>]">
-//          CHECK:   llvm.extractvalue {{.*}}[0] : !llvm<"[1 x <2 x float>]">
-//          CHECK:   llvm.insertvalue {{.*}}, {{.*}}[2] : !llvm<"[3 x <2 x float>]">
-//          CHECK:   llvm.insertvalue {{.*}}, {{.*}}[2] : !llvm<"[4 x [3 x <2 x float>]]">
-//          CHECK:   llvm.extractvalue {{.*}}[3] : !llvm<"[4 x [1 x <2 x float>]]">
-//          CHECK:   llvm.mlir.undef : !llvm<"[3 x <2 x float>]">
-//          CHECK:   llvm.extractvalue {{.*}}[0] : !llvm<"[1 x <2 x float>]">
-//          CHECK:   llvm.insertvalue {{.*}}, {{.*}}[0] : !llvm<"[3 x <2 x float>]">
-//          CHECK:   llvm.extractvalue {{.*}}[0] : !llvm<"[1 x <2 x float>]">
-//          CHECK:   llvm.insertvalue {{.*}}, {{.*}}[1] : !llvm<"[3 x <2 x float>]">
-//          CHECK:   llvm.extractvalue {{.*}}[0] : !llvm<"[1 x <2 x float>]">
-//          CHECK:   llvm.insertvalue {{.*}}, {{.*}}[2] : !llvm<"[3 x <2 x float>]">
-//          CHECK:   llvm.insertvalue {{.*}}, {{.*}}[3] : !llvm<"[4 x [3 x <2 x float>]]">
-//          CHECK:   llvm.return {{.*}} : !llvm<"[4 x [3 x <2 x float>]]">
+// CHECK-LABEL: broadcast_stretch_in_middle
+//       CHECK:   llvm.mlir.undef : !llvm<"[4 x [3 x <2 x float>]]">
+//       CHECK:   llvm.extractvalue {{.*}}[0] : !llvm<"[4 x [1 x <2 x float>]]">
+//       CHECK:   llvm.mlir.undef : !llvm<"[3 x <2 x float>]">
+//       CHECK:   llvm.extractvalue {{.*}}[0] : !llvm<"[1 x <2 x float>]">
+//       CHECK:   llvm.insertvalue {{.*}}, {{.*}}[0] : !llvm<"[3 x <2 x float>]">
+//       CHECK:   llvm.extractvalue {{.*}}[0] : !llvm<"[1 x <2 x float>]">
+//       CHECK:   llvm.insertvalue {{.*}}, {{.*}}[1] : !llvm<"[3 x <2 x float>]">
+//       CHECK:   llvm.extractvalue {{.*}}[0] : !llvm<"[1 x <2 x float>]">
+//       CHECK:   llvm.insertvalue {{.*}}, {{.*}}[2] : !llvm<"[3 x <2 x float>]">
+//       CHECK:   llvm.insertvalue {{.*}}, {{.*}}[0] : !llvm<"[4 x [3 x <2 x float>]]">
+//       CHECK:   llvm.extractvalue {{.*}}[1] : !llvm<"[4 x [1 x <2 x float>]]">
+//       CHECK:   llvm.mlir.undef : !llvm<"[3 x <2 x float>]">
+//       CHECK:   llvm.extractvalue {{.*}}[0] : !llvm<"[1 x <2 x float>]">
+//       CHECK:   llvm.insertvalue {{.*}}, {{.*}}[0] : !llvm<"[3 x <2 x float>]">
+//       CHECK:   llvm.extractvalue {{.*}}[0] : !llvm<"[1 x <2 x float>]">
+//       CHECK:   llvm.insertvalue {{.*}}, {{.*}}[1] : !llvm<"[3 x <2 x float>]">
+//       CHECK:   llvm.extractvalue {{.*}}[0] : !llvm<"[1 x <2 x float>]">
+//       CHECK:   llvm.insertvalue {{.*}}, {{.*}}[2] : !llvm<"[3 x <2 x float>]">
+//       CHECK:   llvm.insertvalue {{.*}}, {{.*}}[1] : !llvm<"[4 x [3 x <2 x float>]]">
+//       CHECK:   llvm.extractvalue {{.*}}[2] : !llvm<"[4 x [1 x <2 x float>]]">
+//       CHECK:   llvm.mlir.undef : !llvm<"[3 x <2 x float>]">
+//       CHECK:   llvm.extractvalue {{.*}}[0] : !llvm<"[1 x <2 x float>]">
+//       CHECK:   llvm.insertvalue {{.*}}, {{.*}}[0] : !llvm<"[3 x <2 x float>]">
+//       CHECK:   llvm.extractvalue {{.*}}[0] : !llvm<"[1 x <2 x float>]">
+//       CHECK:   llvm.insertvalue {{.*}}, {{.*}}[1] : !llvm<"[3 x <2 x float>]">
+//       CHECK:   llvm.extractvalue {{.*}}[0] : !llvm<"[1 x <2 x float>]">
+//       CHECK:   llvm.insertvalue {{.*}}, {{.*}}[2] : !llvm<"[3 x <2 x float>]">
+//       CHECK:   llvm.insertvalue {{.*}}, {{.*}}[2] : !llvm<"[4 x [3 x <2 x float>]]">
+//       CHECK:   llvm.extractvalue {{.*}}[3] : !llvm<"[4 x [1 x <2 x float>]]">
+//       CHECK:   llvm.mlir.undef : !llvm<"[3 x <2 x float>]">
+//       CHECK:   llvm.extractvalue {{.*}}[0] : !llvm<"[1 x <2 x float>]">
+//       CHECK:   llvm.insertvalue {{.*}}, {{.*}}[0] : !llvm<"[3 x <2 x float>]">
+//       CHECK:   llvm.extractvalue {{.*}}[0] : !llvm<"[1 x <2 x float>]">
+//       CHECK:   llvm.insertvalue {{.*}}, {{.*}}[1] : !llvm<"[3 x <2 x float>]">
+//       CHECK:   llvm.extractvalue {{.*}}[0] : !llvm<"[1 x <2 x float>]">
+//       CHECK:   llvm.insertvalue {{.*}}, {{.*}}[2] : !llvm<"[3 x <2 x float>]">
+//       CHECK:   llvm.insertvalue {{.*}}, {{.*}}[3] : !llvm<"[4 x [3 x <2 x float>]]">
+//       CHECK:   llvm.return {{.*}} : !llvm<"[4 x [3 x <2 x float>]]">
 
 func @outerproduct(%arg0: vector<2xf32>, %arg1: vector<3xf32>) -> vector<2x3xf32> {
   %2 = vector.outerproduct %arg0, %arg1 : vector<2xf32>, vector<3xf32>
   return %2 : vector<2x3xf32>
 }
-//    CHECK-LABEL: outerproduct
-//          CHECK:   llvm.mlir.undef : !llvm<"[2 x <3 x float>]">
-//          CHECK:   llvm.shufflevector {{.*}} [0 : i32, 0 : i32, 0 : i32] : !llvm<"<2 x float>">, !llvm<"<2 x float>">
-//          CHECK:   llvm.fmul {{.*}}, {{.*}} : !llvm<"<3 x float>">
-//          CHECK:   llvm.insertvalue {{.*}}[0] : !llvm<"[2 x <3 x float>]">
-//          CHECK:   llvm.shufflevector {{.*}} [1 : i32, 1 : i32, 1 : i32] : !llvm<"<2 x float>">, !llvm<"<2 x float>">
-//          CHECK:   llvm.fmul {{.*}}, {{.*}} : !llvm<"<3 x float>">
-//          CHECK:   llvm.insertvalue {{.*}}[1] : !llvm<"[2 x <3 x float>]">
-//          CHECK:   llvm.return {{.*}} : !llvm<"[2 x <3 x float>]">
+// CHECK-LABEL: outerproduct
+//       CHECK:   llvm.mlir.undef : !llvm<"[2 x <3 x float>]">
+//       CHECK:   llvm.shufflevector {{.*}} [0 : i32, 0 : i32, 0 : i32] : !llvm<"<2 x float>">, !llvm<"<2 x float>">
+//       CHECK:   llvm.fmul {{.*}}, {{.*}} : !llvm<"<3 x float>">
+//       CHECK:   llvm.insertvalue {{.*}}[0] : !llvm<"[2 x <3 x float>]">
+//       CHECK:   llvm.shufflevector {{.*}} [1 : i32, 1 : i32, 1 : i32] : !llvm<"<2 x float>">, !llvm<"<2 x float>">
+//       CHECK:   llvm.fmul {{.*}}, {{.*}} : !llvm<"<3 x float>">
+//       CHECK:   llvm.insertvalue {{.*}}[1] : !llvm<"[2 x <3 x float>]">
+//       CHECK:   llvm.return {{.*}} : !llvm<"[2 x <3 x float>]">
 
 func @outerproduct_add(%arg0: vector<2xf32>, %arg1: vector<3xf32>, %arg2: vector<2x3xf32>) -> vector<2x3xf32> {
   %2 = vector.outerproduct %arg0, %arg1, %arg2 : vector<2xf32>, vector<3xf32>
   return %2 : vector<2x3xf32>
 }
-//    CHECK-LABEL: outerproduct_add
-//          CHECK:   llvm.mlir.undef : !llvm<"[2 x <3 x float>]">
-//          CHECK:   llvm.shufflevector {{.*}} [0 : i32, 0 : i32, 0 : i32] : !llvm<"<2 x float>">, !llvm<"<2 x float>">
-//          CHECK:   llvm.extractvalue {{.*}}[0] : !llvm<"[2 x <3 x float>]">
-//          CHECK:   "llvm.intr.fmuladd"({{.*}}) : (!llvm<"<3 x float>">, !llvm<"<3 x float>">, !llvm<"<3 x float>">) -> !llvm<"<3 x float>">
-//          CHECK:   llvm.insertvalue {{.*}}[0] : !llvm<"[2 x <3 x float>]">
-//          CHECK:   llvm.shufflevector {{.*}} [1 : i32, 1 : i32, 1 : i32] : !llvm<"<2 x float>">, !llvm<"<2 x float>">
-//          CHECK:   llvm.extractvalue {{.*}}[1] : !llvm<"[2 x <3 x float>]">
-//          CHECK:   "llvm.intr.fmuladd"({{.*}}) : (!llvm<"<3 x float>">, !llvm<"<3 x float>">, !llvm<"<3 x float>">) -> !llvm<"<3 x float>">
-//          CHECK:   llvm.insertvalue {{.*}}[1] : !llvm<"[2 x <3 x float>]">
-//          CHECK:   llvm.return {{.*}} : !llvm<"[2 x <3 x float>]">
+// CHECK-LABEL: outerproduct_add
+//       CHECK:   llvm.mlir.undef : !llvm<"[2 x <3 x float>]">
+//       CHECK:   llvm.shufflevector {{.*}} [0 : i32, 0 : i32, 0 : i32] : !llvm<"<2 x float>">, !llvm<"<2 x float>">
+//       CHECK:   llvm.extractvalue {{.*}}[0] : !llvm<"[2 x <3 x float>]">
+//       CHECK:   "llvm.intr.fmuladd"({{.*}}) : (!llvm<"<3 x float>">, !llvm<"<3 x float>">, !llvm<"<3 x float>">) -> !llvm<"<3 x float>">
+//       CHECK:   llvm.insertvalue {{.*}}[0] : !llvm<"[2 x <3 x float>]">
+//       CHECK:   llvm.shufflevector {{.*}} [1 : i32, 1 : i32, 1 : i32] : !llvm<"<2 x float>">, !llvm<"<2 x float>">
+//       CHECK:   llvm.extractvalue {{.*}}[1] : !llvm<"[2 x <3 x float>]">
+//       CHECK:   "llvm.intr.fmuladd"({{.*}}) : (!llvm<"<3 x float>">, !llvm<"<3 x float>">, !llvm<"<3 x float>">) -> !llvm<"<3 x float>">
+//       CHECK:   llvm.insertvalue {{.*}}[1] : !llvm<"[2 x <3 x float>]">
+//       CHECK:   llvm.return {{.*}} : !llvm<"[2 x <3 x float>]">
+
+func @shuffle_1D_direct(%arg0: vector<2xf32>, %arg1: vector<2xf32>) -> vector<2xf32> {
+  %1 = vector.shuffle %arg0, %arg1 [0 : i32, 1 : i32] : vector<2xf32>, vector<2xf32>  // expected to lower to one llvm.shufflevector carrying the same mask
+  return %1 : vector<2xf32>
+}
+// CHECK-LABEL: shuffle_1D_direct(%arg0: !llvm<"<2 x float>">, %arg1: !llvm<"<2 x float>">)
+//       CHECK:   %[[s:.*]] = llvm.shufflevector %arg0, %arg1 [0 : i32, 1 : i32] : !llvm<"<2 x float>">, !llvm<"<2 x float>">
+//       CHECK:   llvm.return %[[s]] : !llvm<"<2 x float>">
+
+func @shuffle_1D(%arg0: vector<2xf32>, %arg1: vector<3xf32>) -> vector<5xf32> {
+  %1 = vector.shuffle %arg0, %arg1 [4 : i32, 3 : i32, 2 : i32, 1 : i32, 0 : i32] : vector<2xf32>, vector<3xf32>  // mask 4..0 reverses the 5 elements of %arg0 followed by %arg1
+  return %1 : vector<5xf32>
+}
+// CHECK-LABEL: shuffle_1D(%arg0: !llvm<"<2 x float>">, %arg1: !llvm<"<3 x float>">)
+//       CHECK:   %[[u0:.*]] = llvm.mlir.undef : !llvm<"<5 x float>">
+//       CHECK:   %[[c2:.*]] = llvm.mlir.constant(2 : index) : !llvm.i64
+//       CHECK:   %[[e1:.*]] = llvm.extractelement %arg1[%[[c2]] : !llvm.i64] : !llvm<"<3 x float>">
+//       CHECK:   %[[c0:.*]] = llvm.mlir.constant(0 : index) : !llvm.i64
+//       CHECK:   %[[i1:.*]] = llvm.insertelement %[[e1]], %[[u0]][%[[c0]] : !llvm.i64] : !llvm<"<5 x float>">
+//       CHECK:   %[[c1:.*]] = llvm.mlir.constant(1 : index) : !llvm.i64
+//       CHECK:   %[[e2:.*]] = llvm.extractelement %arg1[%[[c1]] : !llvm.i64] : !llvm<"<3 x float>">
+//       CHECK:   %[[c1:.*]] = llvm.mlir.constant(1 : index) : !llvm.i64
+//       CHECK:   %[[i2:.*]] = llvm.insertelement %[[e2]], %[[i1]][%[[c1]] : !llvm.i64] : !llvm<"<5 x float>">
+//       CHECK:   %[[c0:.*]] = llvm.mlir.constant(0 : index) : !llvm.i64
+//       CHECK:   %[[e3:.*]] = llvm.extractelement %arg1[%[[c0]] : !llvm.i64] : !llvm<"<3 x float>">
+//       CHECK:   %[[c2:.*]] = llvm.mlir.constant(2 : index) : !llvm.i64
+//       CHECK:   %[[i3:.*]] = llvm.insertelement %[[e3]], %[[i2]][%[[c2]] : !llvm.i64] : !llvm<"<5 x float>">
+//       CHECK:   %[[c1:.*]] = llvm.mlir.constant(1 : index) : !llvm.i64
+//       CHECK:   %[[e4:.*]] = llvm.extractelement %arg0[%[[c1]] : !llvm.i64] : !llvm<"<2 x float>">
+//       CHECK:   %[[c3:.*]] = llvm.mlir.constant(3 : index) : !llvm.i64
+//       CHECK:   %[[i4:.*]] = llvm.insertelement %[[e4]], %[[i3]][%[[c3]] : !llvm.i64] : !llvm<"<5 x float>">
+//       CHECK:   %[[c0:.*]] = llvm.mlir.constant(0 : index) : !llvm.i64
+//       CHECK:   %[[e5:.*]] = llvm.extractelement %arg0[%[[c0]] : !llvm.i64] : !llvm<"<2 x float>">
+//       CHECK:   %[[c4:.*]] = llvm.mlir.constant(4 : index) : !llvm.i64
+//       CHECK:   %[[i5:.*]] = llvm.insertelement %[[e5]], %[[i4]][%[[c4]] : !llvm.i64] : !llvm<"<5 x float>">
+//       CHECK:   llvm.return %[[i5]] : !llvm<"<5 x float>">
+
+func @shuffle_2D(%a: vector<1x4xf32>, %b: vector<2x4xf32>) -> vector<3x4xf32> {
+  %1 = vector.shuffle %a, %b [1 : i32, 0 : i32, 2 : i32] : vector<1x4xf32>, vector<2x4xf32>  // result rows: %b[0], %a[0], %b[1]
+  return %1 : vector<3x4xf32>
+}
+// CHECK-LABEL: shuffle_2D(%arg0: !llvm<"[1 x <4 x float>]">, %arg1: !llvm<"[2 x <4 x float>]">)
+//       CHECK:   %[[u0:.*]] = llvm.mlir.undef : !llvm<"[3 x <4 x float>]">
+//       CHECK:   %[[e1:.*]] = llvm.extractvalue %arg1[0] : !llvm<"[2 x <4 x float>]">
+//       CHECK:   %[[i1:.*]] = llvm.insertvalue %[[e1]], %[[u0]][0] : !llvm<"[3 x <4 x float>]">
+//       CHECK:   %[[e2:.*]] = llvm.extractvalue %arg0[0] : !llvm<"[1 x <4 x float>]">
+//       CHECK:   %[[i2:.*]] = llvm.insertvalue %[[e2]], %[[i1]][1] : !llvm<"[3 x <4 x float>]">
+//       CHECK:   %[[e3:.*]] = llvm.extractvalue %arg1[1] : !llvm<"[2 x <4 x float>]">
+//       CHECK:   %[[i3:.*]] = llvm.insertvalue %[[e3]], %[[i2]][2] : !llvm<"[3 x <4 x float>]">
+//       CHECK:   llvm.return %[[i3]] : !llvm<"[3 x <4 x float>]">
 
 func @extract_element_from_vec_1d(%arg0: vector<16xf32>) -> f32 {
   %0 = vector.extract %arg0[15 : i32]: vector<16xf32>
-- 
2.7.4