Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 10 additions & 1 deletion llama-cpp-2/src/model.rs
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,10 @@ pub enum AddBos {
pub enum Special {
/// Allow tokenizing special and/or control tokens which otherwise are not exposed and treated as plaintext. Does not insert a leading space.
Tokenize,
/// Allow tokenizing special and/or control tokens, but exclude the `bos` and `eos` tokens from the output.
///
/// This variant was introduced as a compatibility flag to address
/// <https://github.com/utilityai/llama-cpp-rs/issues/826>.
ExcludeBosAndEos,
/// Treat special and/or control tokens as plaintext.
Plaintext,
}
Expand Down Expand Up @@ -395,14 +399,19 @@ impl LlamaModel {
if attrs.is_empty()
|| attrs
.intersects(LlamaTokenAttr::Unknown | LlamaTokenAttr::Byte | LlamaTokenAttr::Unused)
// The following exclusion of control characters stems from a requirement of the original purpose of this project; see
// https://github.com/utilityai/llama-cpp-rs/issues/826#issuecomment-3478624072. However, it should not be the default behavior.
// Given that the llama.cpp [documentation](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.llama_cpp.llama_token_to_piece)
// states that `special` controls whether special tokens are rendered, we can use it as a gate for this feature as well.
|| attrs.contains(LlamaTokenAttr::Control)
&& (token == self.token_bos() || token == self.token_eos())
&& (token == self.token_bos() || token == self.token_eos()) && special == Special::ExcludeBosAndEos
{
return Ok(Vec::new());
}

let special = match special {
Special::Tokenize => true,
Special::ExcludeBosAndEos => true,
Special::Plaintext => false,
};

Expand Down
Loading