Skip to content

Commit 3e3e4ea

Browse files
committed
[llvm][mustache] Use single pass when tokenizing
The old implementation used many string searches over the same portions of the strings. This version sacrifices some API niceness for perf wins.

| Metric        | Baseline | Single-Pass | Change |
| ------------- | -------- | ----------- | ------ |
| Time (ms)     | 36.09    | 35.78       | -0.86% |
| Cycles        | 35.3M    | 35.0M       | -0.79% |
| Instructions  | 86.7M    | 85.8M       | -1.03% |
| Branch Misses | 116K     | 114K        | -1.91% |
| Cache Misses  | 244K     | 232K        | -4.98% |
1 parent 11fe600 commit 3e3e4ea

File tree

1 file changed

+73
-113
lines changed

1 file changed

+73
-113
lines changed

llvm/lib/Support/Mustache.cpp

Lines changed: 73 additions & 113 deletions
Original file line numberDiff line numberDiff line change
@@ -368,141 +368,101 @@ static const char *jsonKindToString(json::Value::Kind K) {
368368
llvm_unreachable("Unknown json::Value::Kind");
369369
}
370370

371-
/// Locate the first mustache tag in \p Template at or after \p StartPos.
///
/// Both the caller-supplied \p Open delimiter and the fixed triple-mustache
/// opener "{{{" are searched for; whichever occurs first is chosen, with the
/// triple form winning when both start at the same offset.
///
/// \param Template  Text to scan.
/// \param StartPos  Offset at which the search begins.
/// \param Open      Current opening delimiter for normal tags.
/// \param Close     Current closing delimiter for normal tags.
/// \returns A Tag describing the match (kind, start offset, inner content and
///          full delimited text). A default-constructed Tag is returned when
///          no opener is found or the chosen opener has no matching closer.
static Tag findNextTag(StringRef Template, size_t StartPos, StringRef Open,
                       StringRef Close) {
  const StringLiteral TripleOpen("{{{");
  const StringLiteral TripleClose("}}}");

  const size_t NormalAt = Template.find(Open, StartPos);
  const size_t TripleAt = Template.find(TripleOpen, StartPos);

  Tag Found;

  const bool HaveNormal = NormalAt != StringRef::npos;
  const bool HaveTriple = TripleAt != StringRef::npos;

  // A triple opener is preferred when it is the only candidate or when it
  // starts no later than the normal opener.
  const bool PickTriple = HaveTriple && (!HaveNormal || TripleAt <= NormalAt);

  // Neither kind of opener exists: report "no tag".
  if (!PickTriple && !HaveNormal)
    return Found;

  const size_t TagBegin = PickTriple ? TripleAt : NormalAt;
  const StringRef Opener = PickTriple ? StringRef(TripleOpen) : Open;
  const StringRef Closer = PickTriple ? StringRef(TripleClose) : Close;

  // The closer is only searched for after the opener itself.
  const size_t BodyBegin = TagBegin + Opener.size();
  const size_t CloserAt = Template.find(Closer, BodyBegin);
  if (CloserAt == StringRef::npos)
    return Found; // Unterminated tag.

  Found.TagKind = PickTriple ? Tag::Kind::Triple : Tag::Kind::Normal;
  Found.StartPosition = TagBegin;
  Found.Content = Template.substr(BodyBegin, CloserAt - BodyBegin);
  Found.FullMatch =
      Template.substr(TagBegin, (CloserAt + Closer.size()) - TagBegin);
  return Found;
}
412-
413-
static std::optional<std::pair<StringRef, StringRef>>
414-
processTag(const Tag &T, SmallVectorImpl<Token> &Tokens, MustacheContext &Ctx) {
415-
LLVM_DEBUG(dbgs() << "[Tag] " << T.FullMatch << ", Content: " << T.Content
416-
<< ", Kind: " << tagKindToString(T.TagKind) << "\n");
417-
if (T.TagKind == Tag::Kind::Triple) {
418-
Tokens.emplace_back(T.FullMatch, Ctx.Saver.save("&" + T.Content), '&', Ctx);
419-
return std::nullopt;
420-
}
421-
StringRef Interpolated = T.Content;
422-
if (!Interpolated.trim().starts_with("=")) {
423-
char Front = Interpolated.empty() ? ' ' : Interpolated.trim().front();
424-
Tokens.emplace_back(T.FullMatch, Interpolated, Front, Ctx);
425-
return std::nullopt;
426-
}
427-
Tokens.emplace_back(T.FullMatch, Interpolated, '=', Ctx);
428-
StringRef DelimSpec = Interpolated.trim();
429-
DelimSpec = DelimSpec.drop_front(1);
430-
DelimSpec = DelimSpec.take_until([](char C) { return C == '='; });
431-
DelimSpec = DelimSpec.trim();
432-
433-
std::pair<StringRef, StringRef> Ret = DelimSpec.split(' ');
434-
LLVM_DEBUG(dbgs() << "[Set Delimiter] NewOpen: " << Ret.first
435-
<< ", NewClose: " << Ret.second << "\n");
436-
return Ret;
437-
}
438-
439371
// Simple tokenizer that splits the template into tokens.
440-
// The mustache spec allows {{{ }}} to unescape variables,
441-
// but we don't support that here. An unescape variable
442-
// is represented only by {{& variable}}.
443372
static SmallVector<Token> tokenize(StringRef Template, MustacheContext &Ctx) {
444373
LLVM_DEBUG(dbgs() << "[Tokenize Template] \"" << Template << "\"\n");
445374
SmallVector<Token> Tokens;
446375
SmallString<8> Open("{{");
447376
SmallString<8> Close("}}");
448-
size_t Start = 0;
377+
size_t Cursor = 0;
378+
size_t TextStart = 0;
379+
380+
const StringLiteral TripleOpen("{{{");
381+
const StringLiteral TripleClose("}}}");
449382

450-
while (Start < Template.size()) {
451-
LLVM_DEBUG(dbgs() << "[Tokenize Loop] Start=" << Start << ", Open='" << Open
452-
<< "', Close='" << Close << "'\n");
453-
Tag T = findNextTag(Template, Start, Open, Close);
383+
while (Cursor < Template.size()) {
384+
StringRef TemplateSuffix = Template.substr(Cursor);
385+
StringRef TagOpen, TagClose;
386+
Tag::Kind Kind;
387+
388+
// Determine which tag we've encountered.
389+
if (TemplateSuffix.starts_with(TripleOpen)) {
390+
Kind = Tag::Kind::Triple;
391+
TagOpen = TripleOpen;
392+
TagClose = TripleClose;
393+
} else if (TemplateSuffix.starts_with(Open)) {
394+
Kind = Tag::Kind::Normal;
395+
TagOpen = Open;
396+
TagClose = Close;
397+
} else {
398+
// Not at a tag, continue scanning.
399+
++Cursor;
400+
continue;
401+
}
454402

455-
if (T.TagKind == Tag::Kind::None) {
456-
// No more tags, the rest is text.
457-
Tokens.emplace_back(Template.substr(Start));
458-
break;
403+
// Found a tag, first add the preceding text.
404+
if (Cursor > TextStart) {
405+
Tokens.emplace_back(Template.slice(TextStart, Cursor));
459406
}
460407

461-
// Add the text before the tag.
462-
if (T.StartPosition > Start) {
463-
StringRef Text = Template.substr(Start, T.StartPosition - Start);
464-
Tokens.emplace_back(Text);
408+
// Find the closing tag.
409+
size_t EndPos = Template.find(TagClose, Cursor + TagOpen.size());
410+
if (EndPos == StringRef::npos) {
411+
// No closing tag, the rest is text.
412+
Tokens.emplace_back(Template.substr(Cursor));
413+
TextStart = Cursor = Template.size();
414+
break;
465415
}
466416

467-
if (auto NewDelims = processTag(T, Tokens, Ctx)) {
468-
std::tie(Open, Close) = *NewDelims;
417+
// Extract tag content and full match.
418+
size_t ContentStart = Cursor + TagOpen.size();
419+
StringRef Content = Template.substr(ContentStart, EndPos - ContentStart);
420+
StringRef FullMatch =
421+
Template.substr(Cursor, (EndPos + TagClose.size()) - Cursor);
422+
423+
// Process the tag (inlined logic from processTag).
424+
LLVM_DEBUG(dbgs() << "[Tag] " << FullMatch << ", Content: " << Content
425+
<< ", Kind: " << tagKindToString(Kind) << "\n");
426+
if (Kind == Tag::Kind::Triple) {
427+
Tokens.emplace_back(FullMatch, Ctx.Saver.save("&" + Content), '&', Ctx);
428+
} else { // Normal Tag
429+
StringRef Interpolated = Content;
430+
if (!Interpolated.trim().starts_with("=")) {
431+
char Front = Interpolated.empty() ? ' ' : Interpolated.trim().front();
432+
Tokens.emplace_back(FullMatch, Interpolated, Front, Ctx);
433+
} else { // Set Delimiter
434+
Tokens.emplace_back(FullMatch, Interpolated, '=', Ctx);
435+
StringRef DelimSpec = Interpolated.trim();
436+
DelimSpec = DelimSpec.drop_front(1);
437+
DelimSpec = DelimSpec.take_until([](char C) { return C == '='; });
438+
DelimSpec = DelimSpec.trim();
439+
440+
auto [NewOpen, NewClose] = DelimSpec.split(' ');
441+
LLVM_DEBUG(dbgs() << "[Set Delimiter] NewOpen: " << NewOpen
442+
<< ", NewClose: " << NewClose << "\n");
443+
Open = NewOpen;
444+
Close = NewClose;
445+
}
469446
}
470447

471-
// Move past the tag.
472-
Start = T.StartPosition + T.FullMatch.size();
448+
// Move past the tag for the next iteration.
449+
Cursor += FullMatch.size();
450+
TextStart = Cursor;
473451
}
474452

475-
// Fix up white spaces for:
476-
// - open sections
477-
// - inverted sections
478-
// - close sections
479-
// - comments
480-
//
481-
// This loop attempts to find standalone tokens and tries to trim out
482-
// the surrounding whitespace.
483-
// For example:
484-
// if you have the template string
485-
// {{#section}} \n Example \n{{/section}}
486-
// The output should be
487-
// For example:
488-
// \n Example \n
453+
// Add any remaining text after the last tag.
454+
if (TextStart < Template.size()) {
455+
Tokens.emplace_back(Template.substr(TextStart));
456+
}
457+
458+
// Fix up white spaces for standalone tags.
489459
size_t LastIdx = Tokens.size() - 1;
490460
for (size_t Idx = 0, End = Tokens.size(); Idx < End; ++Idx) {
491461
Token &CurrentToken = Tokens[Idx];
492462
Token::Type CurrentType = CurrentToken.getType();
493-
// Check if token type requires cleanup.
494-
bool RequiresCleanUp = requiresCleanUp(CurrentType);
495-
496-
if (!RequiresCleanUp)
463+
if (!requiresCleanUp(CurrentType))
497464
continue;
498465

499-
// We adjust the token body if there's no text behind or ahead.
500-
// A token is considered to have no text ahead if the right of the previous
501-
// token is a newline followed by spaces.
502-
// A token is considered to have no text behind if the left of the next
503-
// token is spaces followed by a newline.
504-
// eg.
505-
// "Line 1\n {{#section}} \n Line 2 \n {{/section}} \n Line 3"
506466
bool HasTextBehind = hasTextBehind(Idx, Tokens);
507467
bool HasTextAhead = hasTextAhead(Idx, Tokens);
508468

0 commit comments

Comments
 (0)