Skip to content
This repository has been archived by the owner on Jun 24, 2024. It is now read-only.

fix(llama): buffer tokens until valid UTF-8 #122

Merged
merged 2 commits into from
Apr 13, 2023
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Address review feedback
  • Loading branch information
philpax committed Apr 13, 2023
commit 6b1488f655a65c7d5be244efc456ba011e017072
2 changes: 1 addition & 1 deletion ggml/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -406,7 +406,7 @@ impl Tensor {
}
}

fn with_alive_ctx<U>(&self, f: impl Fn() -> U) -> U {
fn with_alive_ctx<U>(&self, mut f: impl FnMut() -> U) -> U {
if let Some(_ctx) = self.ctx.upgrade() {
f()
} else {
Expand Down
3 changes: 0 additions & 3 deletions llama-rs/src/convert.rs
Original file line number Diff line number Diff line change
Expand Up @@ -129,9 +129,6 @@ fn write_header(fout: &mut File, hparams: &Hyperparameters) -> Result<(), String
fn write_tokens(file: &mut File, vocab: &Vocabulary) -> Result<(), String> {
let mut values: Vec<u8> = vec![];
for (i, token) in vocab.id_to_token.iter().enumerate() {
// TODO: Not sure what the behaviour should be if the token is not valid UTF-8.
//
// Switching to the HF tokenizer should fix this.
let text = if let Ok(token) = std::str::from_utf8(token) {
match token {
_ if token.contains("<unk>") => " \u{2047} ".as_bytes().to_vec(),
Expand Down
45 changes: 43 additions & 2 deletions llama-rs/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -570,7 +570,7 @@ impl Model {
pub fn load(
path: impl AsRef<Path>,
n_context_tokens: usize,
load_progress_callback: impl Fn(LoadProgress),
mut load_progress_callback: impl FnMut(LoadProgress),
) -> Result<(Model, Vocabulary), LoadError> {
use std::fs::File;
use std::io::BufReader;
Expand Down Expand Up @@ -1768,7 +1768,21 @@ impl TokenUtf8Buffer {
self.0 = vec![];
Some(out)
}
Err(..) => None,
Err(..) => {
for i in 1..self.0.len() {
let slice = &self.0[i..];
if slice.is_empty() {
break;
}

if let Ok(s) = std::str::from_utf8(slice) {
let out = s.to_owned();
self.0 = vec![];
return Some(out);
}
}
None
}
}
}

Expand All @@ -1783,3 +1797,30 @@ impl TokenUtf8Buffer {
}
}
}

#[cfg(test)]
mod tests {
    use super::*;

    /// UTF-8 encoding of the euro sign (U+20AC); the multi-byte sample used below.
    const EURO: &[u8] = &[0xE2, 0x82, 0xAC];

    /// A push that supplies a complete, valid sequence yields its text immediately.
    #[test]
    fn test_valid_utf8() {
        let mut buf = TokenUtf8Buffer::new();

        let ascii = buf.push(b"hello");
        assert_eq!(ascii, Some(String::from("hello")));

        let euro = buf.push(EURO);
        assert_eq!(euro, Some(String::from("€")));
    }

    /// A multi-byte character split across two pushes yields nothing until the
    /// final byte arrives, at which point the whole character is returned.
    #[test]
    fn test_partial_utf8() {
        let mut buf = TokenUtf8Buffer::new();
        let (head, tail) = EURO.split_at(2);

        assert_eq!(buf.push(head), None);
        assert_eq!(buf.push(tail), Some(String::from("€")));
    }

    /// A stray byte pushed ahead of a valid character must not block it: once the
    /// character is complete, it is returned without the leading garbage.
    #[test]
    fn test_invalid_prelude_for_valid_utf8() {
        let mut buf = TokenUtf8Buffer::new();
        let (head, tail) = EURO.split_at(2);

        // 0xD8 is never followed by a continuation byte here, so it stays invalid.
        assert_eq!(buf.push(&[0xD8]), None);
        assert_eq!(buf.push(head), None);
        assert_eq!(buf.push(tail), Some(String::from("€")));
    }
}