@@ -840,6 +840,56 @@ FormatToken *FormatTokenLexer::getStashedToken() {
840840 return FormatTok;
841841}
842842
843+ // / Truncate the current token to the new length and make the lexer continue
844+ // / from the end of the truncated token. Used for other languages that have
845+ // / different token boundaries, like JavaScript in which a comment ends at a
846+ // / line break regardless of whether the line break follows a backslash. Also
847+ // / used to set the lexer to the end of whitespace if the lexer regards
848+ // / whitespace and an unrecognized symbol as one token.
849+ void FormatTokenLexer::truncateToken (size_t NewLen) {
850+ assert (NewLen <= FormatTok->TokenText .size ());
851+ resetLexer (SourceMgr.getFileOffset (Lex->getSourceLocation (
852+ Lex->getBufferLocation () - FormatTok->TokenText .size () + NewLen)));
853+ FormatTok->TokenText = FormatTok->TokenText .substr (0 , NewLen);
854+ FormatTok->ColumnWidth = encoding::columnWidthWithTabs (
855+ FormatTok->TokenText , FormatTok->OriginalColumn , Style.TabWidth ,
856+ Encoding);
857+ FormatTok->Tok .setLength (NewLen);
858+ }
859+
860+ // / Count the length of leading whitespace in a token.
861+ static size_t countLeadingWhitespace (StringRef Text) {
862+ // Basically counting the length matched by this regex.
863+ // "^([\n\r\f\v \t]|(\\\\|\\?\\?/)[\n\r])+"
864+ // Directly using the regex turned out to be slow. With the regex
865+ // version formatting all files in this directory took about 1.25
866+ // seconds. This version took about 0.5 seconds.
867+ const char *Cur = Text.begin ();
868+ while (Cur < Text.end ()) {
869+ if (isspace (Cur[0 ])) {
870+ ++Cur;
871+ } else if (Cur[0 ] == ' \\ ' && (Cur[1 ] == ' \n ' || Cur[1 ] == ' \r ' )) {
872+ // A '\' followed by a newline always escapes the newline, regardless
873+ // of whether there is another '\' before it.
874+ // The source has a null byte at the end. So the end of the entire input
875+ // isn't reached yet. Also the lexer doesn't break apart an escaped
876+ // newline.
877+ assert (Text.end () - Cur >= 2 );
878+ Cur += 2 ;
879+ } else if (Cur[0 ] == ' ?' && Cur[1 ] == ' ?' && Cur[2 ] == ' /' &&
880+ (Cur[3 ] == ' \n ' || Cur[3 ] == ' \r ' )) {
881+ // Newlines can also be escaped by a '?' '?' '/' trigraph. By the way, the
882+ // characters are quoted individually in this comment because if we write
883+ // them together some compilers warn that we have a trigraph in the code.
884+ assert (Text.end () - Cur >= 4 );
885+ Cur += 4 ;
886+ } else {
887+ break ;
888+ }
889+ }
890+ return Cur - Text.begin ();
891+ }
892+
843893FormatToken *FormatTokenLexer::getNextToken () {
844894 if (StateStack.top () == LexerState::TOKEN_STASHED) {
845895 StateStack.pop ();
@@ -854,34 +904,33 @@ FormatToken *FormatTokenLexer::getNextToken() {
854904 IsFirstToken = false ;
855905
856906 // Consume and record whitespace until we find a significant token.
907+ // Some tok::unknown tokens are not just whitespace, e.g. whitespace
908+ // followed by a symbol such as backtick. Those symbols may be
909+ // significant in other languages.
857910 unsigned WhitespaceLength = TrailingWhitespace;
858- while (FormatTok->is (tok::unknown)) {
911+ while (FormatTok->isNot (tok::eof)) {
912+ auto LeadingWhitespace = countLeadingWhitespace (FormatTok->TokenText );
913+ if (LeadingWhitespace == 0 )
914+ break ;
915+ if (LeadingWhitespace < FormatTok->TokenText .size ())
916+ truncateToken (LeadingWhitespace);
859917 StringRef Text = FormatTok->TokenText ;
860- auto EscapesNewline = [&](int pos) {
861- // A '\r' here is just part of '\r\n'. Skip it.
862- if (pos >= 0 && Text[pos] == ' \r ' )
863- --pos;
864- // See whether there is an odd number of '\' before this.
865- // FIXME: This is wrong. A '\' followed by a newline is always removed,
866- // regardless of whether there is another '\' before it.
867- // FIXME: Newlines can also be escaped by a '?' '?' '/' trigraph.
868- unsigned count = 0 ;
869- for (; pos >= 0 ; --pos, ++count)
870- if (Text[pos] != ' \\ ' )
871- break ;
872- return count & 1 ;
873- };
874- // FIXME: This miscounts tok:unknown tokens that are not just
875- // whitespace, e.g. a '`' character.
918+ bool InEscape = false ;
876919 for (int i = 0 , e = Text.size (); i != e; ++i) {
877920 switch (Text[i]) {
921+ case ' \r ' :
922+ // If this is a CRLF sequence, break here and the LF will be handled on
923+ // the next loop iteration. Otherwise, this is a single Mac CR, treat it
924+ // the same as a single LF.
925+ if (i + 1 < e && Text[i + 1 ] == ' \n ' )
926+ break ;
927+ LLVM_FALLTHROUGH;
878928 case ' \n ' :
879929 ++FormatTok->NewlinesBefore ;
880- FormatTok->HasUnescapedNewline = !EscapesNewline (i - 1 );
881- FormatTok->LastNewlineOffset = WhitespaceLength + i + 1 ;
882- Column = 0 ;
883- break ;
884- case ' \r ' :
930+ if (!InEscape)
931+ FormatTok->HasUnescapedNewline = true ;
932+ else
933+ InEscape = false ;
885934 FormatTok->LastNewlineOffset = WhitespaceLength + i + 1 ;
886935 Column = 0 ;
887936 break ;
@@ -897,24 +946,32 @@ FormatToken *FormatTokenLexer::getNextToken() {
897946 Style.TabWidth - (Style.TabWidth ? Column % Style.TabWidth : 0 );
898947 break ;
899948 case ' \\ ' :
900- if (i + 1 == e || (Text[i + 1 ] != ' \r ' && Text[i + 1 ] != ' \n ' ))
901- FormatTok->setType (TT_ImplicitStringLiteral);
949+ case ' ?' :
950+ case ' /' :
951+ // The text was entirely whitespace when this loop was entered. Thus
952+ // this has to be an escape sequence.
953+ assert (Text.substr (i, 2 ) == " \\\r " || Text.substr (i, 2 ) == " \\\n " ||
954+ Text.substr (i, 4 ) == " \?\? /\r " ||
955+ Text.substr (i, 4 ) == " \?\? /\n " ||
956+ (i >= 1 && (Text.substr (i - 1 , 4 ) == " \?\? /\r " ||
957+ Text.substr (i - 1 , 4 ) == " \?\? /\n " )) ||
958+ (i >= 2 && (Text.substr (i - 2 , 4 ) == " \?\? /\r " ||
959+ Text.substr (i - 2 , 4 ) == " \?\? /\n " )));
960+ InEscape = true ;
902961 break ;
903962 default :
904- FormatTok->setType (TT_ImplicitStringLiteral);
963+ // This shouldn't happen.
964+ assert (false );
905965 break ;
906966 }
907- if (FormatTok->getType () == TT_ImplicitStringLiteral)
908- break ;
909967 }
910-
911- if (FormatTok->is (TT_ImplicitStringLiteral))
912- break ;
913- WhitespaceLength += FormatTok->Tok .getLength ();
914-
968+ WhitespaceLength += Text.size ();
915969 readRawToken (*FormatTok);
916970 }
917971
972+ if (FormatTok->is (tok::unknown))
973+ FormatTok->setType (TT_ImplicitStringLiteral);
974+
918975 // JavaScript and Java do not allow to escape the end of the line with a
919976 // backslash. Backslashes are syntax errors in plain source, but can occur in
920977 // comments. When a single line comment ends with a \, it'll cause the next
@@ -928,42 +985,13 @@ FormatToken *FormatTokenLexer::getNextToken() {
928985 while (BackslashPos != StringRef::npos) {
929986 if (BackslashPos + 1 < FormatTok->TokenText .size () &&
930987 FormatTok->TokenText [BackslashPos + 1 ] == ' \n ' ) {
931- const char *Offset = Lex->getBufferLocation ();
932- Offset -= FormatTok->TokenText .size ();
933- Offset += BackslashPos + 1 ;
934- resetLexer (SourceMgr.getFileOffset (Lex->getSourceLocation (Offset)));
935- FormatTok->TokenText = FormatTok->TokenText .substr (0 , BackslashPos + 1 );
936- FormatTok->ColumnWidth = encoding::columnWidthWithTabs (
937- FormatTok->TokenText , FormatTok->OriginalColumn , Style.TabWidth ,
938- Encoding);
988+ truncateToken (BackslashPos + 1 );
939989 break ;
940990 }
941991 BackslashPos = FormatTok->TokenText .find (' \\ ' , BackslashPos + 1 );
942992 }
943993 }
944994
945- // In case the token starts with escaped newlines, we want to
946- // take them into account as whitespace - this pattern is quite frequent
947- // in macro definitions.
948- // FIXME: Add a more explicit test.
949- while (FormatTok->TokenText .size () > 1 && FormatTok->TokenText [0 ] == ' \\ ' ) {
950- unsigned SkippedWhitespace = 0 ;
951- if (FormatTok->TokenText .size () > 2 &&
952- (FormatTok->TokenText [1 ] == ' \r ' && FormatTok->TokenText [2 ] == ' \n ' )) {
953- SkippedWhitespace = 3 ;
954- } else if (FormatTok->TokenText [1 ] == ' \n ' ) {
955- SkippedWhitespace = 2 ;
956- } else {
957- break ;
958- }
959-
960- ++FormatTok->NewlinesBefore ;
961- WhitespaceLength += SkippedWhitespace;
962- FormatTok->LastNewlineOffset = SkippedWhitespace;
963- Column = 0 ;
964- FormatTok->TokenText = FormatTok->TokenText .substr (SkippedWhitespace);
965- }
966-
967995 FormatTok->WhitespaceRange = SourceRange (
968996 WhitespaceStart, WhitespaceStart.getLocWithOffset (WhitespaceLength));
969997
0 commit comments