Raise syntax error from the lexer with the full UTF-8 character

Shopify · ggmichaelgo · Oct 30, 2024 · Oct 30, 2024 · Oct 30, 2024 · 8a9f33a060eafca7080cea4ae084194877e50ed5
commit 8a9f33a060eafca7080cea4ae084194877e50ed5
diff --git a/lib/liquid/lexer.rb b/lib/liquid/lexer.rb
@@ -171,6 +171,7 @@ def tokenize
 
         break if @ss.eos?
 
+        start_pos = @ss.pos
         peeked = @ss.peek_byte
 
         if (special = SPECIAL_TABLE[peeked])
@@ -196,7 +197,7 @@ def tokenize
             @output << found
             @ss.scan_byte
           else
-            raise SyntaxError, "Unexpected character #{peeked.chr}"
+            raise_syntax_error(start_pos)
           end
         elsif (sub_table = COMPARISON_JUMP_TABLE[peeked])
           @ss.scan_byte
@@ -217,14 +218,20 @@ def tokenize
               [type, t]
             end
           else
-            raise SyntaxError, "Unexpected character #{peeked.chr}"
+            raise_syntax_error(start_pos)
           end
         end
       end
       # rubocop:enable Metrics/BlockNesting
 
       @output << EOS
     end
+
+    def raise_syntax_error(start_pos)
+      @ss.pos = start_pos
+      # the character could be a UTF-8 character, use getch to get all the bytes
+      raise SyntaxError, "Unexpected character #{@ss.getch}"
+    end
   end
 
   Lexer = StringScanner.instance_methods.include?(:scan_byte) ? Lexer2 : Lexer1

diff --git a/test/unit/lexer_unit_test.rb b/test/unit/lexer_unit_test.rb
@@ -84,4 +84,15 @@ def test_greater_than_two_digits
     tokens = Lexer.new("foo > 12").tokenize
     assert_equal([[:id, 'foo'], [:comparison, '>'], [:number, '12'], [:end_of_string]], tokens)
   end
+
+  def test_error_with_utf8_character
+    error = assert_raises(SyntaxError) do
+      Lexer.new("1 < 1Ø").tokenize
+    end
+
+    assert_equal(
+      'Liquid syntax error: Unexpected character Ø',
+      error.message,
+    )
+  end
 end