Skip to content

Commit 97fa55f

Browse files
committed
Add Ripper :on_sp events for Prism.lex_compat and Prism::Translation::Ripper
1 parent 8a43af1 commit 97fa55f

File tree

4 files changed

+53
-13
lines changed

4 files changed

+53
-13
lines changed

lib/prism.rb

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -61,8 +61,7 @@ def initialize(version)
6161
# Prism::lex_compat(source, **options) -> LexCompat::Result
6262
#
6363
# Returns a parse result whose value is an array of tokens that closely
64-
# resembles the return value of Ripper::lex. The main difference is that the
65-
# `:on_sp` token is not emitted.
64+
# resembles the return value of Ripper::lex.
6665
#
6766
# For supported options, see Prism::parse.
6867
def self.lex_compat(source, **options)

lib/prism/lex_compat.rb

Lines changed: 40 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -611,10 +611,10 @@ def self.build(opening)
611611
BOM_FLUSHED = RUBY_VERSION >= "3.3.0"
612612
private_constant :BOM_FLUSHED
613613

614-
attr_reader :source, :options
614+
attr_reader :code, :options
615615

616-
def initialize(source, **options)
617-
@source = source
616+
def initialize(code, **options)
617+
@code = code
618618
@options = options
619619
end
620620

@@ -624,12 +624,13 @@ def result
624624
state = :default
625625
heredoc_stack = [[]] #: Array[Array[Heredoc::PlainHeredoc | Heredoc::DashHeredoc | Heredoc::DedentingHeredoc]]
626626

627-
result = Prism.lex(source, **options)
627+
result = Prism.lex(code, **options)
628+
@source = result.source
628629
result_value = result.value
629630
previous_state = nil #: State?
630631
last_heredoc_end = nil #: Integer?
631632

632-
bom = source.byteslice(0..2) == "\xEF\xBB\xBF"
633+
bom = code.byteslice(0..2) == "\xEF\xBB\xBF"
633634

634635
result_value.each_with_index do |(token, lex_state), index|
635636
lineno = token.location.start_line
@@ -763,7 +764,7 @@ def result
763764
end_offset += 3
764765
end
765766

766-
tokens << Token.new([[lineno, 0], :on_nl, source.byteslice(start_offset...end_offset), lex_state])
767+
tokens << Token.new([[lineno, 0], :on_nl, code.byteslice(start_offset...end_offset), lex_state])
767768
end
768769
end
769770

@@ -857,7 +858,39 @@ def result
857858
# We sort by location to compare against Ripper's output
858859
tokens.sort_by!(&:location)
859860

860-
Result.new(tokens, result.comments, result.magic_comments, result.data_loc, result.errors, result.warnings, Source.for(source))
861+
# Add :on_sp tokens
862+
tokens = add_on_sp_tokens(tokens)
863+
864+
Result.new(tokens, result.comments, result.magic_comments, result.data_loc, result.errors, result.warnings, @source)
865+
end
866+
867+
def add_on_sp_tokens(tokens)
868+
new_tokens = []
869+
870+
last_token_state = Translation::Ripper::Lexer::State.new(Translation::Ripper::EXPR_BEG)
871+
last_token_end = 0
872+
873+
tokens.each do |token|
874+
line, column = token.location
875+
start_offset = @source.line_to_byte_offset(line) + column
876+
if start_offset > last_token_end
877+
new_tokens << Token.new([
878+
[
879+
@source.line(last_token_end),
880+
@source.column(last_token_end),
881+
],
882+
:on_sp,
883+
@source.slice(last_token_end, start_offset - last_token_end),
884+
last_token_state
885+
])
886+
end
887+
new_tokens << token
888+
889+
last_token_state = token.state
890+
last_token_end = start_offset + token.value.bytesize
891+
end
892+
893+
new_tokens
861894
end
862895
end
863896

lib/prism/lex_ripper.rb

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,6 @@ def result
1919

2020
lex(source).each do |token|
2121
case token[1]
22-
when :on_sp
23-
# skip
2422
when :on_tstring_content
2523
if previous[1] == :on_tstring_content && (token[2].start_with?("\#$") || token[2].start_with?("\#@"))
2624
previous[2] << token[2]

test/prism/ruby/ripper_test.rb

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,10 @@ class RipperTest < TestCase
7474
"strings.txt",
7575
"whitequark/dedenting_heredoc.txt",
7676
"whitequark/procarg0.txt",
77+
"dos_endings.txt",
78+
"seattlerb/str_lit_concat_bad_encodings.txt",
79+
"seattlerb/utf8_bom.txt",
80+
"unparser/corpus/semantic/dstr.txt",
7781
]
7882

7983
Fixture.each_for_current_ruby(except: incorrect | omitted_sexp_raw) do |fixture|
@@ -92,7 +96,7 @@ def test_lexer
9296
assert_equal(expected, lexer.parse[0].to_a)
9397
assert_equal(lexer.parse[0].to_a, lexer.scan[0].to_a)
9498

95-
assert_equal(%i[on_int on_op], Translation::Ripper::Lexer.new("1 +").lex.map(&:event))
99+
assert_equal(%i[on_int on_sp on_op], Translation::Ripper::Lexer.new("1 +").lex.map(&:event))
96100
assert_raise(SyntaxError) { Translation::Ripper::Lexer.new("1 +").lex(raise_errors: true) }
97101
end
98102

@@ -121,12 +125,18 @@ def assert_ripper_sexp_raw(source)
121125
def assert_ripper_lex(source)
122126
prism = Translation::Ripper.lex(source)
123127
ripper = Ripper.lex(source)
124-
ripper.reject! { |elem| elem[1] == :on_sp } # Prism doesn't emit on_sp
128+
125129
ripper.sort_by! { |elem| elem[0] } # Prism emits tokens by their order in the code, not in parse order
126130

127131
[prism.size, ripper.size].max.times do |i|
128132
expected = ripper[i]
129133
actual = prism[i]
134+
135+
# Since :on_sp tokens are synthesized on Prism, their state doesn't always line up.
136+
if expected[1] == :on_sp && actual[1] == :on_sp
137+
expected[3] = actual[3] = nil
138+
end
139+
130140
# Since tokens related to heredocs are not emitted in the same order,
131141
# the state also doesn't line up.
132142
if expected[1] == :on_heredoc_end && actual[1] == :on_heredoc_end

0 commit comments

Comments
 (0)