Add Ripper :on_sp events for Prism.lex_compat and Prism::Translation::Ripper

eregon · eregon · commit 97fa55fc0169 · 2026-01-19T13:44:02.000+01:00
diff --git a/lib/prism.rb b/lib/prism.rb
@@ -61,8 +61,7 @@ def initialize(version)
   #   Prism::lex_compat(source, **options) -> LexCompat::Result
   #
   # Returns a parse result whose value is an array of tokens that closely
-  # resembles the return value of Ripper::lex. The main difference is that the
-  # `:on_sp` token is not emitted.
+  # resembles the return value of Ripper::lex.
   #
   # For supported options, see Prism::parse.
   def self.lex_compat(source, **options)
diff --git a/lib/prism/lex_compat.rb b/lib/prism/lex_compat.rb
@@ -611,10 +611,10 @@ def self.build(opening)
     BOM_FLUSHED = RUBY_VERSION >= "3.3.0"
     private_constant :BOM_FLUSHED
 
-    attr_reader :source, :options
+    attr_reader :code, :options
 
-    def initialize(source, **options)
-      @source = source
+    def initialize(code, **options)
+      @code = code
       @options = options
     end
 
@@ -624,12 +624,13 @@ def result
       state = :default
       heredoc_stack = [[]] #: Array[Array[Heredoc::PlainHeredoc | Heredoc::DashHeredoc | Heredoc::DedentingHeredoc]]
 
-      result = Prism.lex(source, **options)
+      result = Prism.lex(code, **options)
+      @source = result.source
       result_value = result.value
       previous_state = nil #: State?
       last_heredoc_end = nil #: Integer?
 
-      bom = source.byteslice(0..2) == "\xEF\xBB\xBF"
+      bom = code.byteslice(0..2) == "\xEF\xBB\xBF"
 
       result_value.each_with_index do |(token, lex_state), index|
         lineno = token.location.start_line
@@ -763,7 +764,7 @@ def result
                   end_offset += 3
                 end
 
-                tokens << Token.new([[lineno, 0], :on_nl, source.byteslice(start_offset...end_offset), lex_state])
+                tokens << Token.new([[lineno, 0], :on_nl, code.byteslice(start_offset...end_offset), lex_state])
               end
             end
 
@@ -857,7 +858,39 @@ def result
       # We sort by location to compare against Ripper's output
       tokens.sort_by!(&:location)
 
-      Result.new(tokens, result.comments, result.magic_comments, result.data_loc, result.errors, result.warnings, Source.for(source))
+      # Add :on_sp tokens
+      tokens = add_on_sp_tokens(tokens)
+
+      Result.new(tokens, result.comments, result.magic_comments, result.data_loc, result.errors, result.warnings, @source)
+    end
+
+    def add_on_sp_tokens(tokens)
+      new_tokens = []
+
+      last_token_state = Translation::Ripper::Lexer::State.new(Translation::Ripper::EXPR_BEG)
+      last_token_end = 0
+
+      tokens.each do |token|
+        line, column = token.location
+        start_offset = @source.line_to_byte_offset(line) + column
+        if start_offset > last_token_end
+          new_tokens << Token.new([
+            [
+              @source.line(last_token_end),
+              @source.column(last_token_end),
+            ],
+            :on_sp,
+            @source.slice(last_token_end, start_offset - last_token_end),
+            last_token_state
+          ])
+        end
+        new_tokens << token
+
+        last_token_state = token.state
+        last_token_end = start_offset + token.value.bytesize
+      end
+
+      new_tokens
     end
   end
 
diff --git a/lib/prism/lex_ripper.rb b/lib/prism/lex_ripper.rb
@@ -19,8 +19,6 @@ def result
 
       lex(source).each do |token|
         case token[1]
-        when :on_sp
-          # skip
         when :on_tstring_content
           if previous[1] == :on_tstring_content && (token[2].start_with?("\#$") || token[2].start_with?("\#@"))
             previous[2] << token[2]
diff --git a/test/prism/ruby/ripper_test.rb b/test/prism/ruby/ripper_test.rb
@@ -74,6 +74,10 @@ class RipperTest < TestCase
       "strings.txt",
       "whitequark/dedenting_heredoc.txt",
       "whitequark/procarg0.txt",
+      "dos_endings.txt",
+      "seattlerb/str_lit_concat_bad_encodings.txt",
+      "seattlerb/utf8_bom.txt",
+      "unparser/corpus/semantic/dstr.txt",
     ]
 
     Fixture.each_for_current_ruby(except: incorrect | omitted_sexp_raw) do |fixture|
@@ -92,7 +96,7 @@ def test_lexer
       assert_equal(expected, lexer.parse[0].to_a)
       assert_equal(lexer.parse[0].to_a, lexer.scan[0].to_a)
 
-      assert_equal(%i[on_int on_op], Translation::Ripper::Lexer.new("1 +").lex.map(&:event))
+      assert_equal(%i[on_int on_sp on_op], Translation::Ripper::Lexer.new("1 +").lex.map(&:event))
       assert_raise(SyntaxError) { Translation::Ripper::Lexer.new("1 +").lex(raise_errors: true) }
     end
 
@@ -121,12 +125,18 @@ def assert_ripper_sexp_raw(source)
     def assert_ripper_lex(source)
       prism = Translation::Ripper.lex(source)
       ripper = Ripper.lex(source)
-      ripper.reject! { |elem| elem[1] == :on_sp } # Prism doesn't emit on_sp
+
       ripper.sort_by! { |elem| elem[0] } # Prism emits tokens by their order in the code, not in parse order
 
       [prism.size, ripper.size].max.times do |i|
         expected = ripper[i]
         actual = prism[i]
+
+        # Since :on_sp tokens are synthesized on Prism, their state doesn't always line up.
+        if expected[1] == :on_sp && actual[1] == :on_sp
+          expected[3] = actual[3] = nil
+        end
+
         # Since tokens related to heredocs are not emitted in the same order,
         # the state also doesn't line up.
         if expected[1] == :on_heredoc_end && actual[1] == :on_heredoc_end

Original file line number	Diff line number	Diff line change
`@@ -61,8 +61,7 @@ def initialize(version)`
`61`	`61`	`# Prism::lex_compat(source, **options) -> LexCompat::Result`
`62`	`62`	`#`
`63`	`63`	`# Returns a parse result whose value is an array of tokens that closely`
`64`		`- # resembles the return value of Ripper::lex. The main difference is that the`
`65`		- # `:on_sp` token is not emitted.
	`64`	`+ # resembles the return value of Ripper::lex.`
`66`	`65`	`#`
`67`	`66`	`# For supported options, see Prism::parse.`
`68`	`67`	`def self.lex_compat(source, **options)`