[llvm][mustache] Use single pass when tokenizing #159196

ilovepi · 2025-09-16T22:08:41Z

The old implementation used many string searches over the same portions
of the strings. This version sacrifices some API niceness for perf wins.

Metric	Baseline	Single-Pass	Change
Time (ms)	36.09	35.78	-0.86%
Cycles	35.3M	35.0M	-0.79%
Instructions	86.7M	85.8M	-1.03%
Branch Misses	116K	114K	-1.91%
Cache Misses	244K	232K	-4.98%

ilovepi · 2025-09-16T22:08:56Z

This stack of pull requests is managed by Graphite. Learn more about stacking.

llvmbot · 2025-09-16T22:09:20Z

@llvm/pr-subscribers-llvm-support

Author: Paul Kirth (ilovepi)

Changes

The old implementation used many string searches over the same portions
of the strings. This version sacrifices some API niceness for perf wins.

Metric	Baseline	Single-Pass	Change
Time (ms)	36.09	35.78	-0.86%
Cycles	35.3M	35.0M	-0.79%
Instructions	86.7M	85.8M	-1.03%
Branch Misses	116K	114K	-1.91%
Cache Misses	244K	232K	-4.98%

Full diff: https://github.com/llvm/llvm-project/pull/159196.diff

1 Files Affected:

(modified) llvm/lib/Support/Mustache.cpp (+73-113)

diff --git a/llvm/lib/Support/Mustache.cpp b/llvm/lib/Support/Mustache.cpp
index f8095a4eb1acc..63798c50f57ee 100644
--- a/llvm/lib/Support/Mustache.cpp
+++ b/llvm/lib/Support/Mustache.cpp
@@ -371,141 +371,101 @@ static const char *jsonKindToString(json::Value::Kind K) {
   llvm_unreachable("Unknown json::Value::Kind");
 }
 
-static Tag findNextTag(StringRef Template, size_t StartPos, StringRef Open,
-                       StringRef Close) {
-  const StringLiteral TripleOpen("{{{");
-  const StringLiteral TripleClose("}}}");
-
-  size_t NormalOpenPos = Template.find(Open, StartPos);
-  size_t TripleOpenPos = Template.find(TripleOpen, StartPos);
-
-  Tag Result;
-
-  // Determine which tag comes first.
-  if (TripleOpenPos != StringRef::npos &&
-      (NormalOpenPos == StringRef::npos || TripleOpenPos <= NormalOpenPos)) {
-    // Found a triple mustache tag.
-    size_t EndPos =
-        Template.find(TripleClose, TripleOpenPos + TripleOpen.size());
-    if (EndPos == StringRef::npos)
-      return Result; // No closing tag found.
-
-    Result.TagKind = Tag::Kind::Triple;
-    Result.StartPosition = TripleOpenPos;
-    size_t ContentStart = TripleOpenPos + TripleOpen.size();
-    Result.Content = Template.substr(ContentStart, EndPos - ContentStart);
-    Result.FullMatch = Template.substr(
-        TripleOpenPos, (EndPos + TripleClose.size()) - TripleOpenPos);
-  } else if (NormalOpenPos != StringRef::npos) {
-    // Found a normal mustache tag.
-    size_t EndPos = Template.find(Close, NormalOpenPos + Open.size());
-    if (EndPos == StringRef::npos)
-      return Result; // No closing tag found.
-
-    Result.TagKind = Tag::Kind::Normal;
-    Result.StartPosition = NormalOpenPos;
-    size_t ContentStart = NormalOpenPos + Open.size();
-    Result.Content = Template.substr(ContentStart, EndPos - ContentStart);
-    Result.FullMatch =
-        Template.substr(NormalOpenPos, (EndPos + Close.size()) - NormalOpenPos);
-  }
-
-  return Result;
-}
-
-static std::optional<std::pair<StringRef, StringRef>>
-processTag(const Tag &T, SmallVectorImpl<Token> &Tokens, MustacheContext &Ctx) {
-  LLVM_DEBUG(dbgs() << "[Tag] " << T.FullMatch << ", Content: " << T.Content
-                    << ", Kind: " << tagKindToString(T.TagKind) << "\n");
-  if (T.TagKind == Tag::Kind::Triple) {
-    Tokens.emplace_back(T.FullMatch, Ctx.Saver.save("&" + T.Content), '&', Ctx);
-    return std::nullopt;
-  }
-  StringRef Interpolated = T.Content;
-  if (!Interpolated.trim().starts_with("=")) {
-    char Front = Interpolated.empty() ? ' ' : Interpolated.trim().front();
-    Tokens.emplace_back(T.FullMatch, Interpolated, Front, Ctx);
-    return std::nullopt;
-  }
-  Tokens.emplace_back(T.FullMatch, Interpolated, '=', Ctx);
-  StringRef DelimSpec = Interpolated.trim();
-  DelimSpec = DelimSpec.drop_front(1);
-  DelimSpec = DelimSpec.take_until([](char C) { return C == '='; });
-  DelimSpec = DelimSpec.trim();
-
-  auto [NewOpen, NewClose] = DelimSpec.split(' ');
-  LLVM_DEBUG(dbgs() << "[Set Delimiter] NewOpen: " << NewOpen
-                    << ", NewClose: " << NewClose << "\n");
-  return std::make_pair(NewOpen, NewClose);
-}
-
 // Simple tokenizer that splits the template into tokens.
-// The mustache spec allows {{{ }}} to unescape variables,
-// but we don't support that here. An unescape variable
-// is represented only by {{& variable}}.
 static SmallVector<Token> tokenize(StringRef Template, MustacheContext &Ctx) {
   LLVM_DEBUG(dbgs() << "[Tokenize Template] \"" << Template << "\"\n");
   SmallVector<Token> Tokens;
   SmallString<8> Open("{{");
   SmallString<8> Close("}}");
-  size_t Start = 0;
+  size_t Cursor = 0;
+  size_t TextStart = 0;
+
+  const StringLiteral TripleOpen("{{{");
+  const StringLiteral TripleClose("}}}");
 
-  while (Start < Template.size()) {
-    LLVM_DEBUG(dbgs() << "[Tokenize Loop] Start=" << Start << ", Open='" << Open
-                      << "', Close='" << Close << "'\n");
-    Tag T = findNextTag(Template, Start, Open, Close);
+  while (Cursor < Template.size()) {
+    StringRef TemplateSuffix = Template.substr(Cursor);
+    StringRef TagOpen, TagClose;
+    Tag::Kind Kind;
+
+    // Determine which tag we've encountered.
+    if (TemplateSuffix.starts_with(TripleOpen)) {
+      Kind = Tag::Kind::Triple;
+      TagOpen = TripleOpen;
+      TagClose = TripleClose;
+    } else if (TemplateSuffix.starts_with(Open)) {
+      Kind = Tag::Kind::Normal;
+      TagOpen = Open;
+      TagClose = Close;
+    } else {
+      // Not at a tag, continue scanning.
+      ++Cursor;
+      continue;
+    }
 
-    if (T.TagKind == Tag::Kind::None) {
-      // No more tags, the rest is text.
-      Tokens.emplace_back(Template.substr(Start));
-      break;
+    // Found a tag, first add the preceding text.
+    if (Cursor > TextStart) {
+      Tokens.emplace_back(Template.slice(TextStart, Cursor));
     }
 
-    // Add the text before the tag.
-    if (T.StartPosition > Start) {
-      StringRef Text = Template.substr(Start, T.StartPosition - Start);
-      Tokens.emplace_back(Text);
+    // Find the closing tag.
+    size_t EndPos = Template.find(TagClose, Cursor + TagOpen.size());
+    if (EndPos == StringRef::npos) {
+      // No closing tag, the rest is text.
+      Tokens.emplace_back(Template.substr(Cursor));
+      TextStart = Cursor = Template.size();
+      break;
     }
 
-    if (auto NewDelims = processTag(T, Tokens, Ctx)) {
-      std::tie(Open, Close) = *NewDelims;
+    // Extract tag content and full match.
+    size_t ContentStart = Cursor + TagOpen.size();
+    StringRef Content = Template.substr(ContentStart, EndPos - ContentStart);
+    StringRef FullMatch =
+        Template.substr(Cursor, (EndPos + TagClose.size()) - Cursor);
+
+    // Process the tag (inlined logic from processTag).
+    LLVM_DEBUG(dbgs() << "[Tag] " << FullMatch << ", Content: " << Content
+                      << ", Kind: " << tagKindToString(Kind) << "\n");
+    if (Kind == Tag::Kind::Triple) {
+      Tokens.emplace_back(FullMatch, Ctx.Saver.save("&" + Content), '&', Ctx);
+    } else { // Normal Tag
+      StringRef Interpolated = Content;
+      if (!Interpolated.trim().starts_with("=")) {
+        char Front = Interpolated.empty() ? ' ' : Interpolated.trim().front();
+        Tokens.emplace_back(FullMatch, Interpolated, Front, Ctx);
+      } else { // Set Delimiter
+        Tokens.emplace_back(FullMatch, Interpolated, '=', Ctx);
+        StringRef DelimSpec = Interpolated.trim();
+        DelimSpec = DelimSpec.drop_front(1);
+        DelimSpec = DelimSpec.take_until([](char C) { return C == '='; });
+        DelimSpec = DelimSpec.trim();
+
+        auto [NewOpen, NewClose] = DelimSpec.split(' ');
+        LLVM_DEBUG(dbgs() << "[Set Delimiter] NewOpen: " << NewOpen
+                          << ", NewClose: " << NewClose << "\n");
+        Open = NewOpen;
+        Close = NewClose;
+      }
     }
 
-    // Move past the tag.
-    Start = T.StartPosition + T.FullMatch.size();
+    // Move past the tag for the next iteration.
+    Cursor += FullMatch.size();
+    TextStart = Cursor;
   }
 
-  // Fix up white spaces for:
-  //   - open sections
-  //   - inverted sections
-  //   - close sections
-  //   - comments
-  //
-  // This loop attempts to find standalone tokens and tries to trim out
-  // the surrounding whitespace.
-  // For example:
-  // if you have the template string
-  //  {{#section}} \n Example \n{{/section}}
-  // The output should would be
-  // For example:
-  //  \n Example \n
+  // Add any remaining text after the last tag.
+  if (TextStart < Template.size()) {
+    Tokens.emplace_back(Template.substr(TextStart));
+  }
+
+  // Fix up white spaces for standalone tags.
   size_t LastIdx = Tokens.size() - 1;
   for (size_t Idx = 0, End = Tokens.size(); Idx < End; ++Idx) {
     Token &CurrentToken = Tokens[Idx];
     Token::Type CurrentType = CurrentToken.getType();
-    // Check if token type requires cleanup.
-    bool RequiresCleanUp = requiresCleanUp(CurrentType);
-
-    if (!RequiresCleanUp)
+    if (!requiresCleanUp(CurrentType))
       continue;
 
-    // We adjust the token body if there's no text behind or ahead.
-    // A token is considered to have no text ahead if the right of the previous
-    // token is a newline followed by spaces.
-    // A token is considered to have no text behind if the left of the next
-    // token is spaces followed by a newline.
-    // eg.
-    //  "Line 1\n {{#section}} \n Line 2 \n {{/section}} \n Line 3"
     bool HasTextBehind = hasTextBehind(Idx, Tokens);
     bool HasTextAhead = hasTextAhead(Idx, Tokens);

The old implementation used many string searches over the same portions of the strings. This version sacrifices some API niceness for perf wins.

| Metric         | Baseline | Single-Pass | Change  |
| -------------- | -------- | ----------- | ------- |
| Time (ms)      | 36.09    | 35.78       | -0.86%  |
| Cycles         | 35.3M    | 35.0M       | -0.79%  |
| Instructions   | 86.7M    | 85.8M       | -1.03%  |
| Branch Misses  | 116K     | 114K        | -1.91%  |
| Cache Misses   | 244K     | 232K        | -4.98%  |

evelez7

LGTM with a couple styling nits

evelez7 · 2025-10-20T20:49:50Z

llvm/lib/Support/Mustache.cpp

-      Tokens.emplace_back(Template.substr(Start));
-      break;
+    // Found a tag, first add the preceding text.
+    if (Cursor > TextStart) {


nit: I don't think this `if` needs braces.

evelez7 · 2025-10-20T20:50:56Z

llvm/lib/Support/Mustache.cpp

-  // For example:
-  //  \n Example \n
+  // Add any remaining text after the last tag.
+  if (TextStart < Template.size()) {


Same nit: I don't think this `if` needs braces.

ilovepi requested review from evelez7 and petrhosek September 16, 2025 22:08

llvmbot added the llvm:support label Sep 16, 2025

ilovepi force-pushed the users/ilovepi/mustache-json-copy-opt branch from b65b639 to 749306f Compare September 22, 2025 17:07

ilovepi force-pushed the users/ilovepi/mustache-tokeniser-opt branch 2 times, most recently from 3ac408e to b929e27 Compare September 22, 2025 17:56

ilovepi force-pushed the users/ilovepi/mustache-json-copy-opt branch from 1389ea3 to 8ee94b4 Compare September 25, 2025 22:12

ilovepi force-pushed the users/ilovepi/mustache-tokeniser-opt branch 2 times, most recently from 1a9b251 to dba238e Compare September 26, 2025 01:55

ilovepi force-pushed the users/ilovepi/mustache-json-copy-opt branch 2 times, most recently from 5b2c23c to 8b1efc3 Compare September 29, 2025 17:39

ilovepi force-pushed the users/ilovepi/mustache-tokeniser-opt branch from dba238e to 038f7b2 Compare September 29, 2025 17:39

ilovepi force-pushed the users/ilovepi/mustache-json-copy-opt branch from 8b1efc3 to 964edee Compare September 29, 2025 22:28

ilovepi force-pushed the users/ilovepi/mustache-tokeniser-opt branch 2 times, most recently from 5c872a1 to cbc1f38 Compare September 30, 2025 01:54

ilovepi force-pushed the users/ilovepi/mustache-json-copy-opt branch 2 times, most recently from a920b52 to b9da266 Compare September 30, 2025 03:47

ilovepi force-pushed the users/ilovepi/mustache-tokeniser-opt branch from cbc1f38 to 3d520d8 Compare September 30, 2025 03:47

ilovepi force-pushed the users/ilovepi/mustache-json-copy-opt branch from b9da266 to 11fe600 Compare September 30, 2025 03:48

ilovepi force-pushed the users/ilovepi/mustache-tokeniser-opt branch from 3d520d8 to 3e3e4ea Compare September 30, 2025 03:48

ilovepi force-pushed the users/ilovepi/mustache-json-copy-opt branch from 11fe600 to 59b5eda Compare October 1, 2025 00:04

ilovepi force-pushed the users/ilovepi/mustache-tokeniser-opt branch 2 times, most recently from 709be29 to 537d1d7 Compare October 1, 2025 00:15

ilovepi force-pushed the users/ilovepi/mustache-json-copy-opt branch from 59b5eda to 4b5d45b Compare October 1, 2025 00:15

ilovepi force-pushed the users/ilovepi/mustache-tokeniser-opt branch from 537d1d7 to fbaaa82 Compare October 6, 2025 17:21

ilovepi force-pushed the users/ilovepi/mustache-json-copy-opt branch 2 times, most recently from 6020deb to 8474924 Compare October 6, 2025 20:26

ilovepi force-pushed the users/ilovepi/mustache-tokeniser-opt branch from fbaaa82 to a69791c Compare October 6, 2025 20:26

ilovepi force-pushed the users/ilovepi/mustache-json-copy-opt branch from 8474924 to 3289df4 Compare October 9, 2025 17:51

ilovepi force-pushed the users/ilovepi/mustache-tokeniser-opt branch from a69791c to 4aa6943 Compare October 9, 2025 17:51

Base automatically changed from users/ilovepi/mustache-json-copy-opt to main October 9, 2025 18:27

ilovepi force-pushed the users/ilovepi/mustache-tokeniser-opt branch from 4aa6943 to 3338a34 Compare October 10, 2025 16:21

evelez7 approved these changes Oct 20, 2025

View reviewed changes

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

[llvm][mustache] Use single pass when tokenizing #159196

[llvm][mustache] Use single pass when tokenizing #159196

Uh oh!

ilovepi commented Sep 16, 2025 •

edited

Loading

Uh oh!

ilovepi commented Sep 16, 2025 •

edited

Loading

Uh oh!

llvmbot commented Sep 16, 2025

Uh oh!

evelez7 left a comment

Uh oh!

evelez7 Oct 20, 2025

Uh oh!

evelez7 Oct 20, 2025

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

4 participants

[llvm][mustache] Use single pass when tokenizing #159196

Are you sure you want to change the base?

[llvm][mustache] Use single pass when tokenizing #159196

Uh oh!

Conversation

ilovepi commented Sep 16, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

ilovepi commented Sep 16, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

llvmbot commented Sep 16, 2025

Uh oh!

evelez7 left a comment

Choose a reason for hiding this comment

Uh oh!

evelez7 Oct 20, 2025

Choose a reason for hiding this comment

Uh oh!

evelez7 Oct 20, 2025

Choose a reason for hiding this comment

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

4 participants

ilovepi commented Sep 16, 2025 •

edited

Loading

ilovepi commented Sep 16, 2025 •

edited

Loading