Implement more accurate penalty & trade-offs while breaking protruding tokens.
authorManuel Klimek <klimek@google.com>
Fri, 17 Nov 2017 11:17:15 +0000 (11:17 +0000)
committerManuel Klimek <klimek@google.com>
Fri, 17 Nov 2017 11:17:15 +0000 (11:17 +0000)
For each line that we break in a protruding token, compute whether the
penalty of breaking is actually larger than the penalty of the excess
characters. Only break if that is the case.

llvm-svn: 318515

clang/lib/Format/BreakableToken.cpp
clang/lib/Format/BreakableToken.h
clang/lib/Format/ContinuationIndenter.cpp
clang/unittests/Format/FormatTest.cpp
clang/unittests/Format/FormatTestComments.cpp

index 67d75fd..d2064ac 100644 (file)
@@ -605,9 +605,9 @@ unsigned BreakableBlockComment::getLineLengthAfterSplitBefore(
   }
 }
 
-bool BreakableBlockComment::introducesBreakBefore(unsigned LineIndex) const {
+bool BreakableBlockComment::introducesBreakBeforeToken() const {
   // A break is introduced when we want delimiters on newline.
-  return LineIndex == 0 && DelimitersOnNewline &&
+  return DelimitersOnNewline &&
          Lines[0].substr(1).find_first_not_of(Blanks) != StringRef::npos;
 }
 
index 8c2dc74..0bfc5ed 100644 (file)
@@ -137,9 +137,9 @@ public:
     return Split(StringRef::npos, 0);
   }
 
-  /// \brief Returns if a break before the content at \p LineIndex will be
-  /// inserted after the whitespace preceding the content has been reformatted.
-  virtual bool introducesBreakBefore(unsigned LineIndex) const {
+  /// \brief Returns whether there will be a line break at the start of the
+  /// token.
+  virtual bool introducesBreakBeforeToken() const {
     return false;
   }
 
@@ -347,7 +347,7 @@ public:
   Split getSplitBefore(unsigned LineIndex, unsigned PreviousEndColumn,
                        unsigned ColumnLimit,
                        llvm::Regex &CommentPragmasRegex) const override;
-  bool introducesBreakBefore(unsigned LineIndex) const override;
+  bool introducesBreakBeforeToken() const override;
   unsigned getLineLengthAfterSplitBefore(unsigned LineIndex,
                                          unsigned TailOffset,
                                          unsigned PreviousEndColumn,
index f171bb5..3fed32f 100644 (file)
@@ -1487,8 +1487,14 @@ unsigned ContinuationIndenter::breakProtrudingToken(const FormatToken &Current,
   if (Current.UnbreakableTailLength >= ColumnLimit)
     return 0;
 
+  unsigned NewBreakPenalty = Current.isStringLiteral()
+                                 ? Style.PenaltyBreakString
+                                 : Style.PenaltyBreakComment;
   unsigned RemainingSpace = ColumnLimit - Current.UnbreakableTailLength;
-  bool BreakInserted = false;
+  bool BreakInserted = Token->introducesBreakBeforeToken();
+  // Store whether we inserted a new line break at the end of the previous
+  // logical line.
+  bool NewBreakBefore = false;
   // We use a conservative reflowing strategy. Reflow starts after a line is
   // broken or the corresponding whitespace compressed. Reflow ends as soon as a
   // line that doesn't get reflown with the previous line is reached.
@@ -1496,23 +1502,50 @@ unsigned ContinuationIndenter::breakProtrudingToken(const FormatToken &Current,
   unsigned Penalty = 0;
   unsigned RemainingTokenColumns = 0;
   unsigned TailOffset = 0;
+  DEBUG(llvm::dbgs() << "Breaking protruding token at column " << StartColumn
+                     << ".\n");
   for (unsigned LineIndex = 0, EndIndex = Token->getLineCount();
        LineIndex != EndIndex; ++LineIndex) {
+    DEBUG(llvm::dbgs() << "  Line: " << LineIndex
+                       << " (Reflow: " << ReflowInProgress << ")\n");
     BreakableToken::Split SplitBefore(StringRef::npos, 0);
     if (ReflowInProgress) {
       SplitBefore = Token->getSplitBefore(LineIndex, RemainingTokenColumns,
                                           RemainingSpace, CommentPragmasRegex);
     }
     ReflowInProgress = SplitBefore.first != StringRef::npos;
+    DEBUG({
+      if (ReflowInProgress)
+        llvm::dbgs() << "  Reflowing.\n";
+    });
     TailOffset =
         ReflowInProgress ? (SplitBefore.first + SplitBefore.second) : 0;
-    BreakInserted = BreakInserted || Token->introducesBreakBefore(LineIndex);
+    // If we found a reflow split and have added a new break before this line,
+    // we are going to remove the line break at the start of the next logical
+    // line.
+    // For example, here we'll add a new line break after 'text', and
+    // subsequently delete the line break between 'that' and 'reflows'.
+    //   // some text that
+    //   // reflows
+    // ->
+    //   // some text
+    //   // that reflows
+    // When adding the line break, we also added the penalty for it, so we need
+    // to subtract that penalty again when we remove the line break due to
+    // reflowing.
+    if (ReflowInProgress && NewBreakBefore) {
+      assert(Penalty >= NewBreakPenalty);
+      Penalty -= NewBreakPenalty;
+    }
+    NewBreakBefore = false;
     if (!DryRun)
       Token->replaceWhitespaceBefore(LineIndex, RemainingTokenColumns,
                                      RemainingSpace, SplitBefore, Whitespaces);
     RemainingTokenColumns = Token->getLineLengthAfterSplitBefore(
         LineIndex, TailOffset, RemainingTokenColumns, ColumnLimit, SplitBefore);
     while (RemainingTokenColumns > RemainingSpace) {
+      DEBUG(llvm::dbgs() << "    Over limit, need: " << RemainingTokenColumns
+                         << ", space: " << RemainingSpace << "\n");
       BreakableToken::Split Split = Token->getSplit(
           LineIndex, TailOffset, ColumnLimit, CommentPragmasRegex);
       if (Split.first == StringRef::npos) {
@@ -1520,6 +1553,7 @@ unsigned ContinuationIndenter::breakProtrudingToken(const FormatToken &Current,
         if (LineIndex < EndIndex - 1)
           Penalty += Style.PenaltyExcessCharacter *
                      (RemainingTokenColumns - RemainingSpace);
+        DEBUG(llvm::dbgs() << "    No break opportunity.\n");
         break;
       }
       assert(Split.first != 0);
@@ -1534,6 +1568,37 @@ unsigned ContinuationIndenter::breakProtrudingToken(const FormatToken &Current,
         ReflowInProgress = true;
         if (!DryRun)
           Token->compressWhitespace(LineIndex, TailOffset, Split, Whitespaces);
+        DEBUG(llvm::dbgs() << "    Compressing below limit.\n");
+        break;
+      }
+
+      // Compute both the penalties for:
+      // - not breaking, and leaving excess characters
+      // - adding a new line break
+      assert(RemainingTokenColumnsAfterCompression > RemainingSpace);
+      unsigned ExcessCharactersPenalty =
+          (RemainingTokenColumnsAfterCompression - RemainingSpace) *
+          Style.PenaltyExcessCharacter;
+
+      unsigned BreakPenalty = NewBreakPenalty;
+      unsigned ColumnsUsed =
+          Token->getLineLengthAfterSplit(LineIndex, TailOffset, Split.first);
+      if (ColumnsUsed > ColumnLimit)
+        BreakPenalty +=
+            Style.PenaltyExcessCharacter * (ColumnsUsed - ColumnLimit);
+
+      DEBUG(llvm::dbgs() << "    Penalty excess: " << ExcessCharactersPenalty
+                         << "\n            break : " << BreakPenalty << "\n");
+      // Only continue to add the line break if the penalty of the excess
+      // characters is larger than the penalty of the line break.
+      // FIXME: This does not take into account when we can later remove the
+      // line break again due to a reflow.
+      if (ExcessCharactersPenalty < BreakPenalty) {
+        if (!DryRun)
+          Token->compressWhitespace(LineIndex, TailOffset, Split, Whitespaces);
+        // Do not set ReflowInProgress: we do not have any space left to
+        // reflow into.
+        Penalty += ExcessCharactersPenalty;
         break;
       }
 
@@ -1544,27 +1609,26 @@ unsigned ContinuationIndenter::breakProtrudingToken(const FormatToken &Current,
       // but will still be expanded to the next tab stop, so we don't save any
       // columns.
       if (NewRemainingTokenColumns == RemainingTokenColumns)
+        // FIXME: Do we need to adjust the penalty?
         break;
-
       assert(NewRemainingTokenColumns < RemainingTokenColumns);
+
       if (!DryRun)
         Token->insertBreak(LineIndex, TailOffset, Split, Whitespaces);
-      Penalty += Current.SplitPenalty;
-      unsigned ColumnsUsed =
-          Token->getLineLengthAfterSplit(LineIndex, TailOffset, Split.first);
-      if (ColumnsUsed > ColumnLimit) {
-        Penalty += Style.PenaltyExcessCharacter * (ColumnsUsed - ColumnLimit);
-      }
+
+      Penalty += BreakPenalty;
       TailOffset += Split.first + Split.second;
       RemainingTokenColumns = NewRemainingTokenColumns;
       ReflowInProgress = true;
       BreakInserted = true;
+      NewBreakBefore = true;
     }
   }
 
   BreakableToken::Split SplitAfterLastLine =
       Token->getSplitAfterLastLine(TailOffset, ColumnLimit);
   if (SplitAfterLastLine.first != StringRef::npos) {
+    DEBUG(llvm::dbgs() << "Replacing whitespace after last line.\n");
     if (!DryRun)
       Token->replaceWhitespaceAfterLastLine(TailOffset, SplitAfterLastLine,
                                             Whitespaces);
@@ -1586,9 +1650,6 @@ unsigned ContinuationIndenter::breakProtrudingToken(const FormatToken &Current,
     if (Current.is(TT_BlockComment))
       State.NoContinuation = true;
 
-    Penalty += Current.isStringLiteral() ? Style.PenaltyBreakString
-                                         : Style.PenaltyBreakComment;
-
     State.Stack.back().LastSpace = StartColumn;
   }
 
index d9b4094..7fbf01d 100644 (file)
@@ -8002,9 +8002,9 @@ TEST_F(FormatTest, BreakStringLiteralsBeforeUnbreakableTokenSequence) {
             "    \"f\");",
             format("someFunction1234567890(\"aaabbbcccdddeeefff\");",
                    getLLVMStyleWithColumns(24)));
-  EXPECT_EQ("someFunction(\"aaabbbcc \"\n"
-            "             \"ddde \"\n"
-            "             \"efff\");",
+  EXPECT_EQ("someFunction(\n"
+            "    \"aaabbbcc ddde \"\n"
+            "    \"efff\");",
             format("someFunction(\"aaabbbcc ddde efff\");",
                    getLLVMStyleWithColumns(25)));
   EXPECT_EQ("someFunction(\"aaabbbccc \"\n"
@@ -8023,10 +8023,9 @@ TEST_F(FormatTest, BreakStringLiteralsBeforeUnbreakableTokenSequence) {
             "  int i;",
             format("#define A string s = \"1234567890\"; int i;",
                    getLLVMStyleWithColumns(20)));
-  // FIXME: Put additional penalties on breaking at non-whitespace locations.
-  EXPECT_EQ("someFunction(\"aaabbbcc \"\n"
-            "             \"dddeeeff\"\n"
-            "             \"f\");",
+  EXPECT_EQ("someFunction(\n"
+            "    \"aaabbbcc \"\n"
+            "    \"dddeeefff\");",
             format("someFunction(\"aaabbbcc dddeeefff\");",
                    getLLVMStyleWithColumns(25)));
 }
@@ -9895,6 +9894,108 @@ TEST_F(FormatTest, UnderstandPragmaOption) {
   EXPECT_EQ("#pragma option -C -A", format("#pragma    option   -C   -A"));
 }
 
+TEST_F(FormatTest, OptimizeBreakPenaltyVsExcess) {
+  FormatStyle Style = getLLVMStyle();
+  Style.ColumnLimit = 20;
+
+  verifyFormat("int a; // the\n"
+               "       // comment", Style);
+  EXPECT_EQ("int a; /* first line\n"
+            "        * second\n"
+            "        * line third\n"
+            "        * line\n"
+            "        */",
+            format("int a; /* first line\n"
+                   "        * second\n"
+                   "        * line third\n"
+                   "        * line\n"
+                   "        */",
+                   Style));
+  EXPECT_EQ("int a; // first line\n"
+            "       // second\n"
+            "       // line third\n"
+            "       // line",
+            format("int a; // first line\n"
+                   "       // second line\n"
+                   "       // third line",
+                   Style));
+
+  Style.PenaltyExcessCharacter = 90;
+  verifyFormat("int a; // the comment", Style);
+  EXPECT_EQ("int a; // the\n"
+            "       // comment aa",
+            format("int a; // the comment aa", Style));
+  EXPECT_EQ("int a; // first line\n"
+            "       // second line\n"
+            "       // third line",
+            format("int a; // first line\n"
+                   "       // second line\n"
+                   "       // third line",
+                   Style));
+  EXPECT_EQ("int a; /* first line\n"
+            "        * second line\n"
+            "        * third line\n"
+            "        */",
+            format("int a; /* first line\n"
+                   "        * second line\n"
+                   "        * third line\n"
+                   "        */",
+                   Style));
+  // FIXME: Investigate why this is not getting the same layout as the test
+  // above.
+  EXPECT_EQ("int a; /* first line\n"
+            "        * second\n"
+            "        * line third\n"
+            "        * line\n"
+            "        */",
+            format("int a; /* first line second line third line"
+                   "\n*/",
+                   Style));
+
+  EXPECT_EQ("// foo bar baz bazfoo\n"
+            "// foo bar\n",
+            format("// foo bar baz bazfoo\n"
+                   "// foo            bar\n",
+                   Style));
+  EXPECT_EQ("// foo bar baz bazfoo\n"
+            "// foo bar\n",
+            format("// foo bar baz      bazfoo\n"
+                   "// foo            bar\n",
+                   Style));
+
+  // FIXME: Optimally, we'd keep bazfoo on the first line and reflow bar to the
+  // next one.
+  EXPECT_EQ("// foo bar baz\n"
+            "// bazfoo bar foo\n"
+            "// bar\n",
+            format("// foo bar baz      bazfoo bar\n"
+                   "// foo            bar\n",
+                   Style));
+
+  EXPECT_EQ("// foo bar baz bazfoo\n"
+            "// foo bar baz\n"
+            "// bazfoo bar foo\n"
+            "// bar\n",
+            format("// foo bar baz      bazfoo\n"
+                   "// foo bar baz      bazfoo bar\n"
+                   "// foo bar\n",
+                   Style));
+
+  // FIXME: Optimally, we'd keep 'bar' in the last line at the end of the line,
+  // as it does not actually protrude far enough to make breaking pay off.
+  // Unfortunately, due to how reflowing is currently implemented, we already
+  // check the column limit after the reflowing decision and extend the reflow
+  // range, so we will not take whitespace compression into account.
+  EXPECT_EQ("// foo bar baz bazfoo\n"
+            "// foo bar baz\n"
+            "// bazfoo bar foo\n"
+            "// bar\n",
+            format("// foo bar baz      bazfoo\n"
+                   "// foo bar baz      bazfoo bar\n"
+                   "// foo           bar\n",
+                   Style));
+}
+
 #define EXPECT_ALL_STYLES_EQUAL(Styles)                                        \
   for (size_t i = 1; i < Styles.size(); ++i)                                   \
   EXPECT_EQ(Styles[0], Styles[i]) << "Style #" << i << " of " << Styles.size() \
index 5467e73..cde3334 100644 (file)
@@ -2078,6 +2078,22 @@ TEST_F(FormatTestComments, ReflowsComments) {
                    "       // longsec\n",
                    getLLVMStyleWithColumns(20)));
 
+  // Simple case that correctly handles reflow in parameter lists.
+  EXPECT_EQ("a = f(/* looooooooong\n"
+            "       * long long\n"
+            "       */\n"
+            "      a);",
+            format("a = f(/* looooooooong long\n* long\n*/ a);",
+                   getLLVMStyleWithColumns(22)));
+  // Tricky case that has fewer lines if we reflow the comment, ending up with
+  // fewer lines.
+  EXPECT_EQ("a = f(/* loooooong\n"
+            "       * long long\n"
+            "       */\n"
+            "      a);",
+            format("a = f(/* loooooong long\n* long\n*/ a);",
+                   getLLVMStyleWithColumns(22)));
+
   // Keep empty comment lines.
   EXPECT_EQ("/**/", format(" /**/", getLLVMStyleWithColumns(20)));
   EXPECT_EQ("/* */", format(" /* */", getLLVMStyleWithColumns(20)));
@@ -2426,9 +2442,13 @@ TEST_F(FormatTestComments, BlockCommentsAtEndOfLine) {
 
 TEST_F(FormatTestComments, BreaksAfterMultilineBlockCommentsInParamLists) {
   EXPECT_EQ("a = f(/* long\n"
-            "         long\n"
-            "       */\n"
+            "         long */\n"
             "      a);",
+            format("a = f(/* long long */ a);", getLLVMStyleWithColumns(16)));
+  EXPECT_EQ("a = f(\n"
+            "    /* long\n"
+            "       long */\n"
+            "    a);",
             format("a = f(/* long long */ a);", getLLVMStyleWithColumns(15)));
 
   EXPECT_EQ("a = f(/* long\n"
@@ -2438,7 +2458,7 @@ TEST_F(FormatTestComments, BreaksAfterMultilineBlockCommentsInParamLists) {
             format("a = f(/* long\n"
                    "         long\n"
                    "       */a);",
-                   getLLVMStyleWithColumns(15)));
+                   getLLVMStyleWithColumns(16)));
 
   EXPECT_EQ("a = f(/* long\n"
             "         long\n"
@@ -2447,7 +2467,7 @@ TEST_F(FormatTestComments, BreaksAfterMultilineBlockCommentsInParamLists) {
             format("a = f(/* long\n"
                    "         long\n"
                    "       */ a);",
-                   getLLVMStyleWithColumns(15)));
+                   getLLVMStyleWithColumns(16)));
 
   EXPECT_EQ("a = f(/* long\n"
             "         long\n"
@@ -2456,23 +2476,35 @@ TEST_F(FormatTestComments, BreaksAfterMultilineBlockCommentsInParamLists) {
             format("a = f(/* long\n"
                    "         long\n"
                    "       */ (1 + 1));",
-                   getLLVMStyleWithColumns(15)));
+                   getLLVMStyleWithColumns(16)));
 
   EXPECT_EQ(
       "a = f(a,\n"
       "      /* long\n"
-      "         long\n"
-      "       */\n"
+      "         long */\n"
       "      b);",
+      format("a = f(a, /* long long */ b);", getLLVMStyleWithColumns(16)));
+  EXPECT_EQ(
+      "a = f(\n"
+      "    a,\n"
+      "    /* long\n"
+      "       long */\n"
+      "    b);",
       format("a = f(a, /* long long */ b);", getLLVMStyleWithColumns(15)));
 
-  EXPECT_EQ(
-      "a = f(a,\n"
-      "      /* long\n"
-      "         long\n"
-      "       */\n"
-      "      (1 + 1));",
-      format("a = f(a, /* long long */ (1 + 1));", getLLVMStyleWithColumns(15)));
+  EXPECT_EQ("a = f(a,\n"
+            "      /* long\n"
+            "         long */\n"
+            "      (1 + 1));",
+            format("a = f(a, /* long long */ (1 + 1));",
+                   getLLVMStyleWithColumns(16)));
+  EXPECT_EQ("a = f(\n"
+            "    a,\n"
+            "    /* long\n"
+            "       long */\n"
+            "    (1 + 1));",
+            format("a = f(a, /* long long */ (1 + 1));",
+                   getLLVMStyleWithColumns(15)));
 }
 
 TEST_F(FormatTestComments, IndentLineCommentsInStartOfBlockAtEndOfFile) {