Skip to content

Commit 1ab35c4

Browse files
Copilotsapphi-red
andcommitted
Address PR review feedback: improve URI constant optimization and fix invalid input handling
Co-authored-by: sapphi-red <49056869+sapphi-red@users.noreply.github.com>
1 parent 5b3ae2a commit 1ab35c4

File tree

1 file changed

+127
-68
lines changed

1 file changed

+127
-68
lines changed

crates/oxc_ecmascript/src/constant_evaluation/call_expr.rs

Lines changed: 127 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,8 @@ use crate::{
2121

2222
use super::{ConstantEvaluation, ConstantEvaluationCtx, ConstantValue};
2323

24-
// Characters that are always unescaped in URI encoding: A-Z a-z 0-9 - _ . ! ~ * ' ( )
25-
const URI_ALWAYS_UNESCAPED: &str = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_.!~*'()";
24+
// Characters that are always unescaped in URI encoding: A-Z a-z 0-9 _ - . ! ~ * ' ( )
25+
const URI_ALWAYS_UNESCAPED: &str = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_-.!~*'()";
2626

2727
fn is_uri_always_unescaped(c: char) -> bool {
2828
URI_ALWAYS_UNESCAPED.contains(c)
@@ -50,10 +50,11 @@ pub fn try_fold_known_global_methods<'a>(
5050
arguments: &Vec<'a, Argument<'a>>,
5151
ctx: &impl ConstantEvaluationCtx<'a>,
5252
) -> Option<ConstantValue<'a>> {
53-
// Handle global function calls (e.g., encodeURI, decodeURI)
5453
if let Expression::Identifier(ident) = callee {
55-
if let Some(result) = try_fold_url_related_function(ident, arguments, ctx) {
56-
return Some(result);
54+
if matches!(ident.name.as_str(), "encodeURI" | "encodeURIComponent" | "decodeURI" | "decodeURIComponent") {
55+
if let Some(result) = try_fold_url_related_function(ident, arguments, ctx) {
56+
return Some(result);
57+
}
5758
}
5859
}
5960

@@ -550,13 +551,13 @@ fn try_fold_encode_uri<'a>(
550551
fn should_encode_uri(c: char) -> bool {
551552
match c {
552553
c if is_uri_always_unescaped(c) => false,
553-
';' | ',' | '/' | '?' | ':' | '@' | '&' | '=' | '+' | '$' | '#' => false,
554+
';' | '/' | '?' | ':' | '@' | '&' | '=' | '+' | '$' | ',' | '#' => false,
554555
_ => true,
555556
}
556557
}
557558

558559
let encoded = encode_uri_chars(&string_value, should_encode_uri);
559-
Some(ConstantValue::String(Cow::Owned(encoded)))
560+
Some(ConstantValue::String(encoded.into_owned().into()))
560561
}
561562

562563
fn try_fold_encode_uri_component<'a>(
@@ -575,7 +576,7 @@ fn try_fold_encode_uri_component<'a>(
575576
}
576577

577578
let encoded = encode_uri_chars(&string_value, should_encode_uri_component);
578-
Some(ConstantValue::String(Cow::Owned(encoded)))
579+
Some(ConstantValue::String(encoded.into_owned().into()))
579580
}
580581

581582
fn try_fold_decode_uri<'a>(
@@ -597,7 +598,7 @@ fn try_fold_decode_uri<'a>(
597598
}
598599

599600
let decoded = decode_uri_chars(&string_value, should_not_decode_uri)?;
600-
Some(ConstantValue::String(Cow::Owned(decoded)))
601+
Some(ConstantValue::String(decoded.into_owned().into()))
601602
}
602603

603604
fn try_fold_decode_uri_component<'a>(
@@ -613,23 +614,31 @@ fn try_fold_decode_uri_component<'a>(
613614

614615
// decodeURIComponent decodes all percent-encoded sequences
615616
let decoded = decode_uri_chars(&string_value, |_| false)?;
616-
Some(ConstantValue::String(Cow::Owned(decoded)))
617+
Some(ConstantValue::String(decoded.into_owned().into()))
617618
}
618619

619620
// this function is based on https://github.com/kornelski/rust_urlencoding/blob/a617c89d16f390e3ab4281ea68c514660b111301/src/enc.rs#L67
620621
// MIT license: https://github.com/kornelski/rust_urlencoding/blob/a617c89d16f390e3ab4281ea68c514660b111301/LICENSE
621-
fn encode_uri_chars(input: &str, should_encode: fn(char) -> bool) -> String {
622-
let mut result = String::new();
623-
let mut bytes = input.bytes();
624-
while let Some(byte) = bytes.next() {
625-
let ch = byte as char;
626-
if byte < 128 && !should_encode(ch) {
627-
result.push(ch);
628-
} else {
629-
result.push_str(&format!("%{:02X}", byte));
622+
fn encode_uri_chars<'a>(input: &'a str, should_encode: fn(char) -> bool) -> Cow<'a, str> {
623+
let mut out = std::vec::Vec::with_capacity(input.len());
624+
let bytes = input.as_bytes();
625+
let mut start = 0;
626+
627+
for (i, &byte) in bytes.iter().enumerate() {
628+
if byte >= 128 || should_encode(byte as char) {
629+
out.extend_from_slice(&bytes[start..i]);
630+
out.extend_from_slice(format!("%{:02X}", byte).as_bytes());
631+
start = i + 1;
630632
}
631633
}
632-
result
634+
635+
if start == 0 {
636+
Cow::Borrowed(input)
637+
} else {
638+
out.extend_from_slice(&bytes[start..]);
639+
// SAFETY: we started with a valid UTF-8 string and only changed ASCII characters
640+
Cow::Owned(unsafe { String::from_utf8_unchecked(out) })
641+
}
633642
}
634643

635644
// this function is based on https://github.com/kornelski/rust_urlencoding/blob/a617c89d16f390e3ab4281ea68c514660b111301/src/dec.rs#L21
@@ -643,64 +652,114 @@ fn from_hex_digit(digit: u8) -> Option<u8> {
643652
}
644653
}
645654

646-
fn decode_uri_chars(input: &str, should_not_decode: fn(char) -> bool) -> Option<String> {
647-
let mut out = std::vec::Vec::with_capacity(input.len());
648-
let mut bytes = input.bytes();
649-
while let Some(byte) = bytes.next() {
655+
// this function is based on https://github.com/kornelski/rust_urlencoding/blob/a617c89d16f390e3ab4281ea68c514660b111301/src/dec.rs#L21
656+
// MIT license: https://github.com/kornelski/rust_urlencoding/blob/a617c89d16f390e3ab4281ea68c514660b111301/LICENSE
657+
fn decode_uri_chars<'a>(input: &'a str, should_not_decode: fn(char) -> bool) -> Option<Cow<'a, str>> {
658+
let mut out = std::vec::Vec::new();
659+
let mut has_changes = false;
660+
let input_bytes = input.as_bytes();
661+
let mut i = 0;
662+
663+
while i < input_bytes.len() {
664+
let byte = input_bytes[i];
650665
if byte == b'%' {
651-
let h1 = bytes.next()?;
652-
let h2 = bytes.next()?;
653-
let decoded_byte = from_hex_digit(h1)? * 16 + from_hex_digit(h2)?;
666+
// Check if we have enough characters for a percent-encoded sequence
667+
if i + 2 >= input_bytes.len() {
668+
// Invalid: % at end of string or incomplete sequence
669+
return None;
670+
}
654671

655-
// For ASCII characters, check if we should decode them
656-
if decoded_byte < 0x80 {
657-
let decoded_char = decoded_byte as char;
658-
if should_not_decode(decoded_char) {
659-
// Keep the original percent-encoded form
660-
out.push(b'%');
661-
out.push(h1);
662-
out.push(h2);
663-
} else {
664-
out.push(decoded_byte);
665-
}
666-
} else {
667-
// For multi-byte UTF-8 sequences, we need to collect all bytes
668-
let mut utf8_bytes = vec![decoded_byte];
669-
let expected_bytes = if decoded_byte >= 0xF0 {
670-
4
671-
} else if decoded_byte >= 0xE0 {
672-
3
672+
let h1 = input_bytes[i + 1];
673+
let h2 = input_bytes[i + 2];
674+
675+
if let (Some(d1), Some(d2)) = (from_hex_digit(h1), from_hex_digit(h2)) {
676+
let decoded_byte = d1 * 16 + d2;
677+
678+
if decoded_byte < 0x80 {
679+
let decoded_char = decoded_byte as char;
680+
if should_not_decode(decoded_char) {
681+
// Keep the original percent-encoded form
682+
if !has_changes {
683+
has_changes = true;
684+
out.extend_from_slice(&input_bytes[0..i + 3]);
685+
} else {
686+
out.push(b'%');
687+
out.push(h1);
688+
out.push(h2);
689+
}
690+
i += 3;
691+
continue;
692+
} else {
693+
// Decode the character
694+
if !has_changes {
695+
has_changes = true;
696+
out.extend_from_slice(&input_bytes[0..i]);
697+
}
698+
out.push(decoded_byte);
699+
i += 3;
700+
continue;
701+
}
673702
} else {
674-
2
675-
};
676-
677-
for _ in 1..expected_bytes {
678-
if bytes.next() != Some(b'%') {
703+
// Multi-byte UTF-8 handling
704+
let expected_bytes = if decoded_byte >= 0xF0 {
705+
4
706+
} else if decoded_byte >= 0xE0 {
707+
3
708+
} else {
709+
2
710+
};
711+
712+
if i + expected_bytes * 3 > input_bytes.len() {
679713
return None;
680714
}
681-
let h1 = bytes.next()?;
682-
let h2 = bytes.next()?;
683-
let byte = from_hex_digit(h1)? * 16 + from_hex_digit(h2)?;
684-
utf8_bytes.push(byte);
685-
}
686-
687-
// Convert bytes to string and check if we should decode
688-
let decoded_str = String::from_utf8(utf8_bytes).ok()?;
689-
if let Some(decoded_char) = decoded_str.chars().next() {
690-
if should_not_decode(decoded_char) {
691-
// For multi-byte sequences that shouldn't be decoded,
692-
// we conservatively don't fold to avoid complexity
693-
return None;
715+
716+
let mut utf8_bytes = vec![decoded_byte];
717+
let mut byte_index = i + 3;
718+
719+
for _ in 1..expected_bytes {
720+
if input_bytes[byte_index] != b'%' {
721+
return None;
722+
}
723+
let h1 = input_bytes[byte_index + 1];
724+
let h2 = input_bytes[byte_index + 2];
725+
let byte = from_hex_digit(h1)? * 16 + from_hex_digit(h2)?;
726+
utf8_bytes.push(byte);
727+
byte_index += 3;
728+
}
729+
730+
let decoded_str = String::from_utf8(utf8_bytes).ok()?;
731+
if let Some(decoded_char) = decoded_str.chars().next() {
732+
if should_not_decode(decoded_char) {
733+
return None;
734+
} else {
735+
if !has_changes {
736+
has_changes = true;
737+
out.extend_from_slice(&input_bytes[0..i]);
738+
}
739+
out.extend_from_slice(decoded_str.as_bytes());
740+
i = byte_index;
741+
continue;
742+
}
694743
} else {
695-
out.extend_from_slice(decoded_str.as_bytes());
744+
return None;
696745
}
697-
} else {
698-
return None;
699746
}
747+
} else {
748+
// Invalid hex digits - this should cause a URIError at runtime
749+
return None;
700750
}
701751
} else {
702-
out.push(byte);
752+
// Regular character, copy if we're tracking changes
753+
if has_changes {
754+
out.push(byte);
755+
}
756+
i += 1;
703757
}
704758
}
705-
String::from_utf8(out).ok()
759+
760+
if has_changes {
761+
Some(Cow::Owned(String::from_utf8(out).ok()?))
762+
} else {
763+
Some(Cow::Borrowed(input))
764+
}
706765
}

0 commit comments

Comments
 (0)