From 307794e156bc21b2f122bf5e7d907299392023c5 Mon Sep 17 00:00:00 2001
From: Bixia Zheng
Date: Mon, 26 Mar 2018 22:44:27 -0700
Subject: [PATCH] [XLA:CPU] Allow the shape partition algorithm to partition
 the most minor dimension.

The current shape partition algorithm does not partition the most minor
dimension, because doing so causes dynamic loop bounds for the inner loop,
which used to prohibit LLVM vectorization. This constraint has been removed
with LLVM revision 328478, and LLVM can now vectorize loops with dynamic
bounds.

Allowing the most minor dimension to be partitioned is also necessary to
support the parallelization of matrix-vector multiplication.

Adjust shape_partition_test to reflect this change in the shape partition
algorithm.

PiperOrigin-RevId: 190574615
---
 .../compiler/xla/service/cpu/shape_partition.cc    |   5 +-
 .../xla/service/cpu/shape_partition_test.cc        | 116 +++++++--------------
 .../tests/exhaustive_f32_elementwise_op_test.cc    |   4 +-
 3 files changed, 43 insertions(+), 82 deletions(-)

diff --git a/tensorflow/compiler/xla/service/cpu/shape_partition.cc b/tensorflow/compiler/xla/service/cpu/shape_partition.cc
index 61b408b..42fe955 100644
--- a/tensorflow/compiler/xla/service/cpu/shape_partition.cc
+++ b/tensorflow/compiler/xla/service/cpu/shape_partition.cc
@@ -20,12 +20,13 @@ namespace cpu {
 
 std::vector<int64> ShapePartitionAssigner::Run(int64 target_partition_count) {
   // Gather outer-most dims where dim_size >= 'target_partition_count'.
-  // Note: always leave inner-dim static for vectorization/optimizations.
+  // This may include the inner-dim as LLVM can vectorize loops with dynamic
+  // bounds.
   std::vector<int64> outer_dims;
   int64 outer_dim_size = 1;
   // TODO(b/27458679) Consider reserving enough minor dimensions (based on
   // target vector register width) to enable vector instructions.
-  for (int i = shape_.layout().minor_to_major_size() - 1; i >= 1; --i) {
+  for (int i = shape_.layout().minor_to_major_size() - 1; i >= 0; --i) {
     const int64 dimension = shape_.layout().minor_to_major(i);
     outer_dims.push_back(dimension);
     outer_dim_size *= shape_.dimensions(dimension);
diff --git a/tensorflow/compiler/xla/service/cpu/shape_partition_test.cc b/tensorflow/compiler/xla/service/cpu/shape_partition_test.cc
index ee0c53f..ae80a6f 100644
--- a/tensorflow/compiler/xla/service/cpu/shape_partition_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/shape_partition_test.cc
@@ -30,105 +30,65 @@ class ShapePartitionAssignerTest : public HloTestBase {
  protected:
   typedef std::vector<int64> Vec;
 
-  void RunR2Test(const Shape& shape, const int64 expected_max_partition_count) {
+  void RunR2Test(const Shape& shape, int64 max_target_partition_count,
+                 const std::vector<int64>* expected_partitions) {
     ShapePartitionAssigner assigner(shape);
-    // Check all partitions of outer dimension.
-    for (int64 i = 1; i <= expected_max_partition_count; ++i) {
-      EXPECT_TRUE(ContainersEqual(Vec({i}),
-                                  assigner.Run(/*target_partition_count=*/i)));
+    // Iterate through 1..max_target_partition_count.
+    for (int64 i = 1; i <= max_target_partition_count; ++i) {
+      std::vector<int64> actual_partitions =
+          assigner.Run(/*target_partition_count=*/i);
+      EXPECT_THAT(actual_partitions, expected_partitions[i - 1]);
     }
-    // Check target_partition_count > outer dimension size.
-    EXPECT_TRUE(ContainersEqual(
-        Vec({expected_max_partition_count}),
-        assigner.Run(
-            /*target_partition_count=*/expected_max_partition_count + 1)));
   }
 };
 
 TEST_F(ShapePartitionAssignerTest, Shape13WithLayout10) {
-  RunR2Test(ShapeUtil::MakeShapeWithLayout(F32, {1, 3}, {1, 0}), 1);
+  std::vector<int64> expected_partitions[] = {{1} /* 1 */, {1, 2} /* 2 */};
+  RunR2Test(ShapeUtil::MakeShapeWithLayout(F32, {1, 3}, {1, 0}), 2,
+            expected_partitions);
 }
 
 TEST_F(ShapePartitionAssignerTest, Shape31WithLayout01) {
-  RunR2Test(ShapeUtil::MakeShapeWithLayout(F32, {3, 1}, {0, 1}), 1);
+  std::vector<int64> expected_partitions[] = {
+      {1} /* 1 */, {1, 2} /* 2 */
+  };
+  RunR2Test(ShapeUtil::MakeShapeWithLayout(F32, {3, 1}, {0, 1}), 2,
+            expected_partitions);
 }
 
 TEST_F(ShapePartitionAssignerTest, Shape53WithLayout10) {
-  RunR2Test(ShapeUtil::MakeShapeWithLayout(F32, {5, 3}, {1, 0}), 5);
+  std::vector<int64> expected_partitions[] = {{1} /* 1 */, {2} /* 2 */,
+                                              {3} /* 3 */, {4} /* 4 */,
+                                              {5} /* 5 */, {3, 2} /* 6 */};
+  RunR2Test(ShapeUtil::MakeShapeWithLayout(F32, {5, 3}, {1, 0}), 6,
+            expected_partitions);
 }
 
 TEST_F(ShapePartitionAssignerTest, Shape53WithLayout01) {
-  RunR2Test(ShapeUtil::MakeShapeWithLayout(F32, {5, 3}, {0, 1}), 3);
+  std::vector<int64> expected_partitions[] = {
+      {1} /* 1 */, {2} /* 2 */, {3} /* 3 */, {2, 2} /* 4 */};
+  RunR2Test(ShapeUtil::MakeShapeWithLayout(F32, {5, 3}, {0, 1}), 4,
+            expected_partitions);
 }
 
 TEST_F(ShapePartitionAssignerTest, Shape532WithLayout210) {
-  Shape shape = ShapeUtil::MakeShapeWithLayout(F32, {5, 3, 2}, {2, 1, 0});
-  ShapePartitionAssigner assigner(shape);
-
-  for (int64 i = 1; i <= 5; ++i) {
-    EXPECT_TRUE(ContainersEqual(Vec({i}), assigner.Run(
-                                              /*target_partition_count=*/i)));
-  }
-
-  EXPECT_TRUE(
-      ContainersEqual(Vec({3, 2}), assigner.Run(/*target_partition_count=*/6)));
-  EXPECT_TRUE(
-      ContainersEqual(Vec({3, 2}), assigner.Run(/*target_partition_count=*/7)));
-  EXPECT_TRUE(
-      ContainersEqual(Vec({4, 2}), assigner.Run(/*target_partition_count=*/8)));
-  EXPECT_TRUE(
-      ContainersEqual(Vec({3, 3}), assigner.Run(/*target_partition_count=*/9)));
-  EXPECT_TRUE(ContainersEqual(Vec({3, 3}),
-                              assigner.Run(/*target_partition_count=*/10)));
-  EXPECT_TRUE(ContainersEqual(Vec({3, 3}),
-                              assigner.Run(/*target_partition_count=*/11)));
-  EXPECT_TRUE(ContainersEqual(Vec({4, 3}),
-                              assigner.Run(/*target_partition_count=*/12)));
-  EXPECT_TRUE(ContainersEqual(Vec({4, 3}),
-                              assigner.Run(/*target_partition_count=*/13)));
-  EXPECT_TRUE(ContainersEqual(Vec({4, 3}),
-                              assigner.Run(/*target_partition_count=*/14)));
-  EXPECT_TRUE(ContainersEqual(Vec({5, 3}),
-                              assigner.Run(/*target_partition_count=*/15)));
-  EXPECT_TRUE(ContainersEqual(Vec({5, 3}),
-                              assigner.Run(/*target_partition_count=*/16)));
+  std::vector<int64> expected_partitions[] = {
+      {1} /* 1 */,     {2} /* 2 */,     {3} /* 3 */,     {4} /* 4 */,
+      {5} /* 5 */,     {3, 2} /* 6 */,  {3, 2} /* 7 */,  {4, 2} /* 8 */,
+      {3, 3} /* 9 */,  {3, 3} /* 10 */, {3, 3} /* 11 */, {4, 3} /* 12 */,
+      {4, 3} /* 13 */, {4, 3} /* 14 */, {5, 3} /* 15 */, {4, 2, 2} /* 16 */};
+  RunR2Test(ShapeUtil::MakeShapeWithLayout(F32, {5, 3, 2}, {2, 1, 0}), 16,
+            expected_partitions);
 }
 
 TEST_F(ShapePartitionAssignerTest, Shape532WithLayout201) {
-  Shape shape = ShapeUtil::MakeShapeWithLayout(F32, {5, 3, 2}, {2, 0, 1});
-  ShapePartitionAssigner assigner(shape);
-
-  for (int64 i = 1; i <= 3; ++i) {
-    EXPECT_TRUE(ContainersEqual(Vec({i}), assigner.Run(
-                                              /*target_partition_count=*/i)));
-  }
-
-  EXPECT_TRUE(
-      ContainersEqual(Vec({2, 2}), assigner.Run(/*target_partition_count=*/4)));
-  EXPECT_TRUE(
-      ContainersEqual(Vec({2, 2}), assigner.Run(/*target_partition_count=*/5)));
-  EXPECT_TRUE(
-      ContainersEqual(Vec({3, 2}), assigner.Run(/*target_partition_count=*/6)));
-  EXPECT_TRUE(
-      ContainersEqual(Vec({3, 2}), assigner.Run(/*target_partition_count=*/7)));
-  EXPECT_TRUE(
-      ContainersEqual(Vec({3, 2}), assigner.Run(/*target_partition_count=*/8)));
-  EXPECT_TRUE(
-      ContainersEqual(Vec({3, 3}), assigner.Run(/*target_partition_count=*/9)));
-  EXPECT_TRUE(ContainersEqual(Vec({3, 3}),
-                              assigner.Run(/*target_partition_count=*/10)));
-  EXPECT_TRUE(ContainersEqual(Vec({3, 3}),
-                              assigner.Run(/*target_partition_count=*/11)));
-  EXPECT_TRUE(ContainersEqual(Vec({3, 4}),
-                              assigner.Run(/*target_partition_count=*/12)));
-  EXPECT_TRUE(ContainersEqual(Vec({3, 4}),
-                              assigner.Run(/*target_partition_count=*/13)));
-  EXPECT_TRUE(ContainersEqual(Vec({3, 4}),
-                              assigner.Run(/*target_partition_count=*/14)));
-  EXPECT_TRUE(ContainersEqual(Vec({3, 5}),
-                              assigner.Run(/*target_partition_count=*/15)));
-  EXPECT_TRUE(ContainersEqual(Vec({3, 5}),
-                              assigner.Run(/*target_partition_count=*/16)));
+  std::vector<int64> expected_partitions[] = {
+      {1} /* 1 */,     {2} /* 2 */,     {3} /* 3 */,     {2, 2} /* 4 */,
+      {2, 2} /* 5 */,  {3, 2} /* 6 */,  {3, 2} /* 7 */,  {3, 2} /* 8 */,
+      {3, 3} /* 9 */,  {3, 3} /* 10 */, {3, 3} /* 11 */, {3, 4} /* 12 */,
+      {3, 4} /* 13 */, {3, 4} /* 14 */, {3, 5} /* 15 */, {3, 2, 2} /* 16 */};
+  RunR2Test(ShapeUtil::MakeShapeWithLayout(F32, {5, 3, 2}, {2, 0, 1}), 16,
+            expected_partitions);
 }
 
 class ShapePartitionIteratorTest : public HloTestBase {
diff --git a/tensorflow/compiler/xla/tests/exhaustive_f32_elementwise_op_test.cc b/tensorflow/compiler/xla/tests/exhaustive_f32_elementwise_op_test.cc
index 6fe7737..b28fe0c 100644
--- a/tensorflow/compiler/xla/tests/exhaustive_f32_elementwise_op_test.cc
+++ b/tensorflow/compiler/xla/tests/exhaustive_f32_elementwise_op_test.cc
@@ -71,8 +71,8 @@ XLA_TEST_P(ExhaustiveF32ElementwiseOpTest, LogF32) {
 #ifdef XLA_TEST_BACKEND_CPU
   // TODO(b/73141998): The vectorized Log implementation gives results outside
   // our error spec in this range (these numbers are bitwise representations of
-  // floats expressed as a zero extended int64):
-  std::pair<int64, int64> known_incorrect_range = {1, 8315654};
+  // floats expressed as a zero extended int64).
+  std::pair<int64, int64> known_incorrect_range = {1, 8388608};
 #else
   std::pair<int64, int64> known_incorrect_range = {0, 0};
 #endif
-- 
2.7.4
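
Illustrative usage (not part of the patch): a minimal sketch of how a caller of
ShapePartitionAssigner observes the new behavior. The helper name
ExamplePartition and the exact include list are assumptions made for this
sketch; the shape, layout, target count, and the {3, 2} result mirror the
Shape53WithLayout10 test case above.

#include <vector>

#include "tensorflow/compiler/xla/service/cpu/shape_partition.h"
#include "tensorflow/compiler/xla/shape_util.h"
#include "tensorflow/compiler/xla/types.h"

namespace xla {
namespace cpu {

// Hypothetical helper (not in the patch) showing the effect of allowing the
// most minor dimension to be partitioned.
std::vector<int64> ExamplePartition() {
  // Row-major 5x3 F32 shape, as in the Shape53WithLayout10 test above.
  const Shape shape = ShapeUtil::MakeShapeWithLayout(F32, {5, 3}, {1, 0});
  ShapePartitionAssigner assigner(shape);
  // Previously only the outer dimension could be split, so any target above 5
  // was capped at {5}; with the minor dimension now eligible, a target of 6
  // partitions can be met as 3 (outer) x 2 (inner), i.e. {3, 2}.
  return assigner.Run(/*target_partition_count=*/6);
}

}  // namespace cpu
}  // namespace xla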