Skip to content

Commit

Permalink
Handle surrogate-pairs in LFNs.
Browse files Browse the repository at this point in the history
  • Loading branch information
thejpster committed Oct 27, 2024
1 parent c8cc20f commit b77b9ca
Showing 1 changed file with 88 additions and 9 deletions.
97 changes: 88 additions & 9 deletions src/filesystem/filename.rs
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,7 @@ impl core::fmt::Debug for ShortFileName {
}

/// Used to store a Long File Name
#[derive(Debug)]
pub struct LfnBuffer<'a> {
/// We fill this buffer in from the back
inner: &'a mut [u8],
Expand All @@ -230,6 +231,8 @@ pub struct LfnBuffer<'a> {
free: usize,
/// Did we overflow?
overflow: bool,
/// If a surrogate-pair is split over two directory entries, remember half of it here.
unpaired_surrogate: Option<u16>,
}

impl<'a> LfnBuffer<'a> {
Expand All @@ -240,19 +243,34 @@ impl<'a> LfnBuffer<'a> {
inner: storage,
free: len,
overflow: false,
unpaired_surrogate: None,
}
}

/// Reset this buffer so that it holds no characters.
///
/// All of the storage is marked as free again, the overflow flag is
/// dropped, and any surrogate half remembered from an earlier chunk is
/// discarded.
pub fn clear(&mut self) {
    let capacity = self.inner.len();
    self.unpaired_surrogate = None;
    self.overflow = false;
    self.free = capacity;
}

/// Push the 13 UCS-2 characters into this string
/// Push the 13 UTF-16 code units into this string.
///
/// We assume they are pushed last-chunk-first, as you would find
/// them on disk.
///
/// If a chunk starts with an unpaired half of a surrogate pair, that half is
/// saved and appended to the end of the chunk given in the next call.
///
/// ```text
/// [de00, 002e, 0074, 0078, 0074, 0000, ffff, ffff, ffff, ffff, ffff, ffff, ffff]
/// [0041, 0042, 0030, 0031, 0032, 0033, 0034, 0035, 0036, 0037, 0038, 0039, d83d]
///
/// Would map to
///
/// 0041 0042 0030 0031 0032 0033 0034 0035 0036 0037 0038 0039 1f600 002e 0074 0078 0074, or
///
/// "AB0123456789😀.txt"
/// ```
pub fn push(&mut self, buffer: &[u16; 13]) {
// find the first null, if any
let null_idx = buffer
Expand All @@ -261,25 +279,70 @@ impl<'a> LfnBuffer<'a> {
.unwrap_or(buffer.len());
// take all the wide chars, up to the null (or go to the end)
let buffer = &buffer[0..null_idx];
for ch in buffer.iter().rev() {
let ch = char::from_u32(*ch as u32).unwrap_or('?');

// This next part will convert the 16-bit values into chars, noting that
// chars outside the Basic Multilingual Plane will require two 16-bit
// values to encode (see UTF-16 Surrogate Pairs).
//
// We cache the decoded chars into this array so we can iterate them
// backwards. It's 60 bytes, but it'll have to do.
let mut char_vec: heapless::Vec<char, 13> = heapless::Vec::new();
// Now do the decode, including the unpaired surrogate (if any) from
// last time (maybe it has a pair now!)
let mut is_first = true;
for ch in char::decode_utf16(
buffer
.iter()
.cloned()
.chain(self.unpaired_surrogate.take().iter().cloned()),
) {
match ch {
Ok(ch) => {
char_vec.push(ch).expect("Vec was full!?");
}
Err(e) => {
// OK, so we found half a surrogate pair and nothing to go
// with it. Was this the first codepoint in the chunk?
if is_first {
// it was - the other half is probably in the next chunk
// so save this for next time
trace!("LFN saved {:?}", e.unpaired_surrogate());
self.unpaired_surrogate = Some(e.unpaired_surrogate());
} else {
// it wasn't - we can't deal with these mid-sequence, so
// replace it
trace!("LFN replaced {:?}", e.unpaired_surrogate());
char_vec.push('\u{fffd}').expect("Vec was full?!");
}
}
}
is_first = false;
}

for ch in char_vec.iter().rev() {
trace!("LFN push {:?}", ch);
let mut ch_bytes = [0u8; 4];
// a buffer of length 4 is always enough
let ch_str = ch.encode_utf8(&mut ch_bytes);
if self.free < ch_str.len() {
// a buffer of length 4 is enough to encode any char
let mut encoded_ch = [0u8; 4];
let encoded_ch = ch.encode_utf8(&mut encoded_ch);
if self.free < encoded_ch.len() {
// the LFN buffer they gave us was not long enough. Note for
// later, so we don't show them garbage.
self.overflow = true;
return;
}
// store the encoded character in the buffer, working backwards
for b in ch_str.bytes().rev() {
// Store the encoded char in the buffer, working backwards. We
// already checked there was enough space.
for b in encoded_ch.bytes().rev() {
self.free -= 1;
self.inner[self.free] = b;
}
}
}

/// View this LFN buffer as a string-slice
///
/// If the buffer overflowed while parsing the LFN, or if this buffer is
/// empty, you get an empty string.
pub fn as_str(&self) -> &str {
if self.overflow {
""
Expand Down Expand Up @@ -418,6 +481,22 @@ mod test {
]);
assert_eq!(buf.as_str(), "ABCDEFGHIJKLM0123∂");
}

#[test]
fn two_piece_split_surrogate() {
    // Chunks are pushed last-piece-first. This one holds the end of the
    // name and begins with the low half (0xde00) of a surrogate pair.
    let tail_chunk: [u16; 13] = [
        0xde00, 0x002e, 0x0074, 0x0078, 0x0074, 0x0000, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
        0xffff, 0xffff,
    ];
    // This one holds the start of the name: a complete pair up front, and
    // the matching high half (0xd83d) for the previous chunk at the very end.
    let head_chunk: [u16; 13] = [
        0xd83d, 0xde00, 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 0x0038,
        0x0039, 0xd83d,
    ];

    let mut storage = [0u8; 64];
    let mut lfn: LfnBuffer = LfnBuffer::new(&mut storage);
    lfn.push(&tail_chunk);
    lfn.push(&head_chunk);

    assert_eq!(lfn.as_str(), "😀0123456789😀.txt");
}
}

// ****************************************************************************
Expand Down

0 comments on commit b77b9ca

Please sign in to comment.