From: Viswanath Sivakumar Date: Wed, 12 Dec 2018 23:48:03 +0000 (-0800) Subject: Optimize CPU GenerateProposals op by lazily generating anchors (3-5x faster) (#15103) X-Git-Tag: accepted/tizen/6.5/unified/20211028.231830~2287 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=f34d82700743e2bf6c0d163fc046d1454ccc4c4c;p=platform%2Fupstream%2Fpytorch.git Optimize CPU GenerateProposals op by lazily generating anchors (3-5x faster) (#15103) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/15103 There are two main optimizations in this diff: 1. Previously, we generated all anchors for every single spatial grid location first, and only then picked the top 2000 anchors by score according to RPN_PRE_NMS_TOP_N. Sorting the scores first, picking the top 2000, and then lazily generating only the corresponding anchors is much faster. 2. Transposing bbox_deltas from (num_anchors * 4, H, W) to (H, W, num_anchors * 4) was also quite slow - taking about 20ms in the RRPN case, where there are lots of anchors, whereas it's negligible in the RPN case (about 0.1 ms). Instead of transposing, performing all operations in the (num_anchors, H, W) format speeds things up. For the regular RPN scenario, this gives a 5x speedup, from 5.84ms to 1.18ms, in a case with 35 anchors over a 600x600 image. For rotated boxes with 245 anchors, the runtime goes down from 80ms to 27ms per iter. 
Reviewed By: newstzpz Differential Revision: D13428688 fbshipit-source-id: 6006b332925e01a7c9433ded2ff5dc9e6d96f7d3 --- diff --git a/caffe2/operators/generate_proposals_op.cc b/caffe2/operators/generate_proposals_op.cc index 8cee7bc..3f50661 100644 --- a/caffe2/operators/generate_proposals_op.cc +++ b/caffe2/operators/generate_proposals_op.cc @@ -104,12 +104,51 @@ ERMatXf ComputeAllAnchors( return all_anchors_vec; } +ERArrXXf ComputeSortedAnchors( + const Eigen::Map& anchors, + int height, + int width, + float feat_stride, + const vector& order) { + const auto box_dim = anchors.cols(); + CAFFE_ENFORCE(box_dim == 4 || box_dim == 5); + + // Order is flattened in (A, H, W) format. Unravel the indices. + const auto& order_AHW = utils::AsEArrXt(order); + const auto& order_AH = order_AHW / width; + const auto& order_W = order_AHW - order_AH * width; + const auto& order_A = order_AH / height; + const auto& order_H = order_AH - order_A * height; + + // Generate shifts for each location in the H * W grid corresponding + // to the sorted scores in (A, H, W) order. + const auto& shift_x = order_W.cast() * feat_stride; + const auto& shift_y = order_H.cast() * feat_stride; + Eigen::MatrixXf shifts(order.size(), box_dim); + if (box_dim == 4) { + // Upright boxes in [x1, y1, x2, y2] format + shifts << shift_x, shift_y, shift_x, shift_y; + } else { + // Rotated boxes in [ctr_x, ctr_y, w, h, angle] format. + // Zero shift for width, height and angle. + const auto& shift_zero = EArrXf::Constant(order.size(), 0.0); + shifts << shift_x, shift_y, shift_zero, shift_zero, shift_zero; + } + + // Apply shifts to the relevant anchors. 
+ // Equivalent to python code `all_anchors = self._anchors[order_A] + shifts` + ERArrXXf anchors_sorted; + utils::GetSubArrayRows(anchors, order_A, &anchors_sorted); + const auto& all_anchors_sorted = anchors_sorted + shifts.array(); + return all_anchors_sorted; +} + } // namespace utils template <> void GenerateProposalsOp::ProposalsForOneImage( const Eigen::Array3f& im_info, - const Eigen::Map& all_anchors, + const Eigen::Map& anchors, const utils::ConstTensorView& bbox_deltas_tensor, const utils::ConstTensorView& scores_tensor, ERArrXXf* out_boxes, @@ -117,39 +156,23 @@ void GenerateProposalsOp::ProposalsForOneImage( const auto& post_nms_topN = rpn_post_nms_topN_; const auto& nms_thresh = rpn_nms_thresh_; const auto& min_size = rpn_min_size_; - const int box_dim = static_cast(all_anchors.cols()); + const int box_dim = static_cast(anchors.cols()); CAFFE_ENFORCE(box_dim == 4 || box_dim == 5); - // Transpose and reshape predicted bbox transformations to get them - // into the same order as the anchors: - // - bbox deltas will be (box_dim * A, H, W) format from conv output - // - transpose to (H, W, box_dim * A) - // - reshape to (H * W * A, box_dim) where rows are ordered by (H, W, A) - // in slowest to fastest order to match the enumerated anchors CAFFE_ENFORCE_EQ(bbox_deltas_tensor.ndim(), 3); CAFFE_ENFORCE_EQ(bbox_deltas_tensor.dim(0) % box_dim, 0); auto A = bbox_deltas_tensor.dim(0) / box_dim; auto H = bbox_deltas_tensor.dim(1); auto W = bbox_deltas_tensor.dim(2); - // equivalent to python code - // bbox_deltas = bbox_deltas.transpose((1, 2, 0)).reshape((-1, box_dim)) - ERArrXXf bbox_deltas(H * W * A, box_dim); - Eigen::Map(bbox_deltas.data(), H * W, box_dim * A) = - Eigen::Map(bbox_deltas_tensor.data(), A * box_dim, H * W) - .transpose(); - CAFFE_ENFORCE_EQ(bbox_deltas.rows(), all_anchors.rows()); - - // - scores are (A, H, W) format from conv output - // - transpose to (H, W, A) - // - reshape to (H * W * A, 1) where rows are ordered by (H, W, A) - // to 
match the order of anchors and bbox_deltas + auto K = H * W; + CAFFE_ENFORCE_EQ(A, anchors.rows()); + + // scores are (A, H, W) format from conv output. + // Maintain the same order without transposing (which is slow) + // and compute anchors accordingly. CAFFE_ENFORCE_EQ(scores_tensor.ndim(), 3); CAFFE_ENFORCE_EQ(scores_tensor.dims(), (vector{A, H, W})); - // equivalent to python code - // scores = scores.transpose((1, 2, 0)).reshape((-1, 1)) - EArrXf scores(scores_tensor.size()); - Eigen::Map(scores.data(), H * W, A) = - Eigen::Map(scores_tensor.data(), A, H * W).transpose(); + Eigen::Map scores(scores_tensor.data(), scores_tensor.size()); std::vector order(scores.size()); std::iota(order.begin(), order.end(), 0); @@ -170,15 +193,34 @@ void GenerateProposalsOp::ProposalsForOneImage( order.resize(rpn_pre_nms_topN_); } - ERArrXXf bbox_deltas_sorted; - ERArrXXf all_anchors_sorted; EArrXf scores_sorted; - utils::GetSubArrayRows( - bbox_deltas, utils::AsEArrXt(order), &bbox_deltas_sorted); - utils::GetSubArrayRows( - all_anchors.array(), utils::AsEArrXt(order), &all_anchors_sorted); utils::GetSubArray(scores, utils::AsEArrXt(order), &scores_sorted); + // bbox_deltas are (A * box_dim, H, W) format from conv output. + // Order them based on scores maintaining the same format without + // expensive transpose. + // Note that order corresponds to (A, H * W) in row-major whereas + // bbox_deltas are in (A, box_dim, H * W) in row-major. Hence, we + // obtain a sub-view of bbox_deltas for each dim (4 for RPN, 5 for RRPN) + // in (A, H * W) with an outer stride of box_dim * H * W. Then we apply + // the ordering and filtering for each dim iteratively. 
+ ERArrXXf bbox_deltas_sorted(order.size(), box_dim); + EArrXf bbox_deltas_per_dim(A * K); + EigenOuterStride stride(box_dim * K); + for (int j = 0; j < box_dim; ++j) { + Eigen::Map(bbox_deltas_per_dim.data(), A, K) = + Eigen::Map( + bbox_deltas_tensor.data() + j * K, A, K, stride); + for (int i = 0; i < order.size(); ++i) { + bbox_deltas_sorted(i, j) = bbox_deltas_per_dim[order[i]]; + } + } + + // Compute anchors specific to the ordered and pre-filtered indices + // in (A, H, W) format. + const auto& all_anchors_sorted = + utils::ComputeSortedAnchors(anchors, H, W, feat_stride_, order); + // Transform anchors into proposals via bbox transformations static const std::vector bbox_weights{1.0, 1.0, 1.0, 1.0}; auto proposals = utils::bbox_transform( @@ -220,7 +262,7 @@ bool GenerateProposalsOp::RunOnDevice() { const auto& scores = Input(0); const auto& bbox_deltas = Input(1); const auto& im_info_tensor = Input(2); - const auto& anchors = Input(3); + const auto& anchors_tensor = Input(3); CAFFE_ENFORCE_EQ(scores.dim(), 4, scores.dim()); CAFFE_ENFORCE(scores.template IsType(), scores.dtype().name()); @@ -228,8 +270,7 @@ bool GenerateProposalsOp::RunOnDevice() { const auto A = scores.size(1); const auto height = scores.size(2); const auto width = scores.size(3); - const auto K = height * width; - const auto box_dim = anchors.size(1); + const auto box_dim = anchors_tensor.size(1); CAFFE_ENFORCE(box_dim == 4 || box_dim == 5); // bbox_deltas: (num_images, A * box_dim, H, W) @@ -243,22 +284,19 @@ bool GenerateProposalsOp::RunOnDevice() { im_info_tensor.template IsType(), im_info_tensor.dtype().name()); // anchors: (A, box_dim) - CAFFE_ENFORCE_EQ(anchors.sizes(), (vector{A, box_dim})); - CAFFE_ENFORCE(anchors.template IsType(), anchors.dtype().name()); - - // Broadcast the anchors to all pixels - auto all_anchors_vec = - utils::ComputeAllAnchors(anchors, height, width, feat_stride_); - Eigen::Map all_anchors(all_anchors_vec.data(), K * A, box_dim); + 
CAFFE_ENFORCE_EQ(anchors_tensor.sizes(), (vector{A, box_dim})); + CAFFE_ENFORCE( + anchors_tensor.template IsType(), anchors_tensor.dtype().name()); Eigen::Map im_info( im_info_tensor.data(), im_info_tensor.size(0), im_info_tensor.size(1)); - const int roi_col_count = box_dim + 1; - auto* out_rois = Output(0, {0, roi_col_count}, at::dtype()); - auto* out_rois_probs = Output(1, {0}, at::dtype()); + Eigen::Map anchors( + anchors_tensor.data(), + anchors_tensor.size(0), + anchors_tensor.size(1)); std::vector im_boxes(num_images); std::vector im_probs(num_images); @@ -271,7 +309,7 @@ bool GenerateProposalsOp::RunOnDevice() { EArrXf& im_i_probs = im_probs[i]; ProposalsForOneImage( cur_im_info, - all_anchors, + anchors, cur_bbox_deltas, cur_scores, &im_i_boxes, @@ -282,8 +320,9 @@ bool GenerateProposalsOp::RunOnDevice() { for (int i = 0; i < num_images; i++) { roi_counts += im_boxes[i].rows(); } - out_rois->Extend(roi_counts, 50); - out_rois_probs->Extend(roi_counts, 50); + const int roi_col_count = box_dim + 1; + auto* out_rois = Output(0, {roi_counts, roi_col_count}, at::dtype()); + auto* out_rois_probs = Output(1, {roi_counts}, at::dtype()); float* out_rois_ptr = out_rois->template mutable_data(); float* out_rois_probs_ptr = out_rois_probs->template mutable_data(); for (int i = 0; i < num_images; i++) { diff --git a/caffe2/operators/generate_proposals_op.h b/caffe2/operators/generate_proposals_op.h index 1d6e28c..fa933e3 100644 --- a/caffe2/operators/generate_proposals_op.h +++ b/caffe2/operators/generate_proposals_op.h @@ -51,6 +51,17 @@ CAFFE2_API ERMatXf ComputeAllAnchors( int width, float feat_stride); +// Like ComputeAllAnchors, but instead of computing anchors for every single +// spatial location, only computes anchors for the already sorted and filtered +// positions after NMS is applied to avoid unnecessary computation. +// `order` is a raveled array of sorted indices in (A, H, W) format. 
+CAFFE2_API ERArrXXf ComputeSortedAnchors( + const Eigen::Map& anchors, + int height, + int width, + float feat_stride, + const vector& order); + } // namespace utils // C++ implementation of GenerateProposalsOp @@ -101,7 +112,7 @@ class GenerateProposalsOp final : public Operator { // out_probs: n void ProposalsForOneImage( const Eigen::Array3f& im_info, - const Eigen::Map& all_anchors, + const Eigen::Map& anchors, const utils::ConstTensorView& bbox_deltas_tensor, const utils::ConstTensorView& scores_tensor, ERArrXXf* out_boxes, diff --git a/caffe2/operators/generate_proposals_op_test.cc b/caffe2/operators/generate_proposals_op_test.cc index bfe26d0..4d76075 100644 --- a/caffe2/operators/generate_proposals_op_test.cc +++ b/caffe2/operators/generate_proposals_op_test.cc @@ -92,6 +92,56 @@ TEST(GenerateProposalsTest, TestComputeAllAnchors) { EXPECT_EQ((all_anchors_result - all_anchors_gt).norm(), 0); } +TEST(GenerateProposalsTest, TestComputeSortedAnchors) { + ERMatXf anchors(3, 4); + anchors << -38, -16, 53, 31, -84, -40, 99, 55, -176, -88, 191, 103; + + int height = 4; + int width = 3; + int A = anchors.rows(); + float feat_stride = 16; + int total = height * width * A; + + // Generate all anchors for ground truth + Tensor anchors_tensor(vector{anchors.rows(), anchors.cols()}, CPU); + Eigen::Map( + anchors_tensor.mutable_data(), anchors.rows(), anchors.cols()) = + anchors; + auto all_anchors = + utils::ComputeAllAnchors(anchors_tensor, height, width, feat_stride); + Eigen::Map all_anchors_result( + all_anchors.data(), height * width * A, 4); + + Eigen::Map anchors_map( + anchors.data(), anchors.rows(), anchors.cols()); + + // Test with random subsets and ordering of indices + vector indices(total); + std::iota(indices.begin(), indices.end(), 0); + std::random_device rd; + std::mt19937 gen(rd()); + std::shuffle(indices.begin(), indices.end(), gen); + for (int count = 0; count <= total; ++count) { + vector order(indices.begin(), indices.begin() + count); + auto 
result = utils::ComputeSortedAnchors( + anchors_map, height, width, feat_stride, order); + + // Compare the result of ComputeSortedAnchors with first generating all + // anchors via ComputeAllAnchors and then applying ordering and filtering. + // Need to convert order from (A, H, W) to (H, W, A) format before for this. + const auto& order_AHW = utils::AsEArrXt(order); + const auto& order_AH = order_AHW / width; + const auto& order_W = order_AHW - order_AH * width; + const auto& order_A = order_AH / height; + const auto& order_H = order_AH - order_A * height; + const auto& order_HWA = (order_H * width + order_W) * A + order_A; + + ERArrXXf gt; + utils::GetSubArrayRows(all_anchors_result.array(), order_HWA, >); + EXPECT_EQ((result.matrix() - gt.matrix()).norm(), 0); + } +} + namespace { template @@ -156,6 +206,65 @@ TEST(GenerateProposalsTest, TestComputeAllAnchorsRotated) { EXPECT_EQ((all_anchors_result - all_anchors_gt).norm(), 0); } +TEST(GenerateProposalsTest, TestComputeSortedAnchorsRotated) { + // Similar to TestComputeSortedAnchors but for rotated boxes with angle info. 
+ ERMatXf anchors_xyxy(3, 4); + anchors_xyxy << -38, -16, 53, 31, -84, -40, 99, 55, -176, -88, 191, 103; + + // Convert to RRPN format and add angles + ERMatXf anchors(3, 5); + anchors.block(0, 0, 3, 4) = boxes_xyxy_to_xywh(anchors_xyxy); + std::vector angles{0.0, 45.0, -120.0}; + for (int i = 0; i < anchors.rows(); ++i) { + anchors(i, 4) = angles[i % angles.size()]; + } + + int height = 4; + int width = 3; + int A = anchors.rows(); + float feat_stride = 16; + int total = height * width * A; + + // Generate all anchors for ground truth + Tensor anchors_tensor(vector{anchors.rows(), anchors.cols()}, CPU); + Eigen::Map( + anchors_tensor.mutable_data(), anchors.rows(), anchors.cols()) = + anchors; + auto all_anchors = + utils::ComputeAllAnchors(anchors_tensor, height, width, feat_stride); + Eigen::Map all_anchors_result( + all_anchors.data(), height * width * A, 5); + + Eigen::Map anchors_map( + anchors.data(), anchors.rows(), anchors.cols()); + + // Test with random subsets and ordering of indices + vector indices(total); + std::iota(indices.begin(), indices.end(), 0); + std::random_device rd; + std::mt19937 gen(rd()); + std::shuffle(indices.begin(), indices.end(), gen); + for (int count = 0; count <= total; ++count) { + vector order(indices.begin(), indices.begin() + count); + auto result = utils::ComputeSortedAnchors( + anchors_map, height, width, feat_stride, order); + + // Compare the result of ComputeSortedAnchors with first generating all + // anchors via ComputeAllAnchors and then applying ordering and filtering. + // Need to convert order from (A, H, W) to (H, W, A) format before for this. 
+ const auto& order_AHW = utils::AsEArrXt(order); + const auto& order_AH = order_AHW / width; + const auto& order_W = order_AHW - order_AH * width; + const auto& order_A = order_AH / height; + const auto& order_H = order_AH - order_A * height; + const auto& order_HWA = (order_H * width + order_W) * A + order_A; + + ERArrXXf gt; + utils::GetSubArrayRows(all_anchors_result.array(), order_HWA, >); + EXPECT_EQ((result.matrix() - gt.matrix()).norm(), 0); + } +} + TEST(GenerateProposalsTest, TestEmpty) { Workspace ws; OperatorDef def; @@ -610,11 +719,17 @@ TEST(GenerateProposalsTest, TestRealDownSampledRotated) { EXPECT_NE(nullptr, op.get()); EXPECT_TRUE(op->Run()); - // Verify that the resulting angles are correct Blob* rois_blob = ws.GetBlob("rois"); EXPECT_NE(nullptr, rois_blob); auto& rois = rois_blob->Get(); - EXPECT_GT(rois.size(0), 0); + EXPECT_EQ(rois.sizes(), (vector{13, 6})); + + Blob* rois_probs_blob = ws.GetBlob("rois_probs"); + EXPECT_NE(nullptr, rois_probs_blob); + auto& rois_probs = rois_probs_blob->Get(); + EXPECT_EQ(rois_probs.sizes(), (vector{13})); + + // Verify that the resulting angles are correct auto rois_data = Eigen::Map(rois.data(), rois.size(0), rois.size(1)); for (int i = 0; i < rois.size(0); ++i) {