Skip to content

Commit

Permalink
Fixing the progressbar. (huggingface#1353)
Browse files Browse the repository at this point in the history
* Fixing the progressbar.

* Upgrade deps.

* Update cargo audit

* Ssh this action.

* Fixing esaxx by using slower rust version.

* Trying the new esaxx version.

* Publish.

* Get cache again.
  • Loading branch information
Narsil authored Oct 5, 2023
1 parent 7e8e69a commit aed491d
Show file tree
Hide file tree
Showing 9 changed files with 50 additions and 57 deletions.
6 changes: 2 additions & 4 deletions .github/workflows/python.yml
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ jobs:
os: [ubuntu-latest, macos-latest]
steps:
- name: Checkout repository
uses: actions/checkout@v1
uses: actions/checkout@v2


- name: Install Rust
Expand Down Expand Up @@ -99,9 +99,7 @@ jobs:
uses: actions-rs/cargo@v1
with:
command: audit
# ignoring specific CVE which probably isn't affecting this crate
# https://github.com/chronotope/chrono/issues/602
args: -D warnings -f ./bindings/python/Cargo.lock --ignore RUSTSEC-2020-0071 --ignore RUSTSEC-2021-0145
args: -D warnings -f ./bindings/python/Cargo.lock

- name: Install
working-directory: ./bindings/python
Expand Down
4 changes: 1 addition & 3 deletions .github/workflows/rust.yml
Original file line number Diff line number Diff line change
Expand Up @@ -85,9 +85,7 @@ jobs:
uses: actions-rs/cargo@v1
with:
command: audit
# ignoring specific CVE which probably isn't affecting this crate
# https://github.com/chronotope/chrono/issues/602
args: -D warnings -f ./tokenizers/Cargo.lock --ignore RUSTSEC-2020-0071 --ignore RUSTSEC-2021-0145
args: -D warnings -f ./tokenizers/Cargo.lock

# Verify that Readme.md is up to date.
- name: Make sure, Readme generated from lib.rs matches actual Readme
Expand Down
14 changes: 6 additions & 8 deletions bindings/python/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,25 +9,23 @@ name = "tokenizers"
crate-type = ["cdylib"]

[dependencies]
rayon = "1.3"
rayon = "1.8"
serde = { version = "1.0", features = [ "rc", "derive" ]}
serde_json = "1.0"
libc = "0.2"
env_logger = "0.7.1"
env_logger = "0.10.0"
pyo3 = { version = "0.19" }
numpy = "0.19.0"
ndarray = "0.13"
onig = { version = "6.0", default-features = false }
itertools = "0.9"
ndarray = "0.15"
onig = { version = "6.4", default-features = false }
itertools = "0.11"

[dependencies.tokenizers]
version = "0.14.1-dev.0"
path = "../../tokenizers"
default-features = false
features = ["onig"]

[dev-dependencies]
tempfile = "3.1"
tempfile = "3.8"
pyo3 = { version = "0.19", features = ["auto-initialize"] }

[features]
Expand Down
36 changes: 18 additions & 18 deletions tokenizers/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -45,31 +45,31 @@ harness = false
[dependencies]
lazy_static = "1.4"
rand = "0.8"
onig = { version = "6.0", default-features = false, optional = true }
regex = "1.8"
onig = { version = "6.4", default-features = false, optional = true }
regex = "1.9"
regex-syntax = "0.7"
rayon = "1.7"
rayon-cond = "0.1"
rayon = "1.8"
rayon-cond = "0.3"
serde = { version = "1.0", features = [ "derive" ] }
serde_json = "1.0"
clap = { version = "4.0", features=["derive"], optional = true }
clap = { version = "4.4", features=["derive"], optional = true }
unicode-normalization-alignments = "0.1"
unicode_categories = "0.1"
unicode-segmentation = "1.10"
indicatif = {version = "0.15", optional = true}
itertools = "0.9"
indicatif = {version = "0.17", optional = true}
itertools = "0.11"
log = "0.4"
derive_builder = "0.12"
spm_precompiled = "0.1"
hf-hub = { version = "0.2.0", optional = true }
aho-corasick = "0.7"
paste = "1.0.6"
macro_rules_attribute = "0.1.2"
thiserror = "1.0.30"
fancy-regex = { version = "0.10", optional = true}
getrandom = { version = "0.2.6" }
esaxx-rs = { version = "0.1", default-features = false, features=[]}
monostate = "0.1.5"
hf-hub = { version = "0.3.2", optional = true }
aho-corasick = "1.1"
paste = "1.0.14"
macro_rules_attribute = "0.2.0"
thiserror = "1.0.49"
fancy-regex = { version = "0.11", optional = true}
getrandom = { version = "0.2.10" }
esaxx-rs = { version = "0.1.10", default-features = false, features=[]}
monostate = "0.1.9"

[features]
default = ["progressbar", "cli", "onig", "esaxx_fast"]
Expand All @@ -80,8 +80,8 @@ cli = ["clap"]
unstable_wasm = ["fancy-regex", "getrandom/js"]

[dev-dependencies]
criterion = "0.4"
tempfile = "3.1"
criterion = "0.5"
tempfile = "3.8"
assert_approx_eq = "1.1"

[profile.release]
Expand Down
6 changes: 3 additions & 3 deletions tokenizers/src/models/bpe/trainer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,8 @@ impl BpeTrainer {
let p = ProgressBar::new(0);
p.set_style(
ProgressStyle::default_bar()
.template("[{elapsed_precise}] {msg:<40!} {wide_bar} {pos:<9!}/{len:>9!}"),
.template("[{elapsed_precise}] {msg:<30!} {wide_bar} {pos:<9!}/{len:>9!}")
.expect("Invalid progress template"),
);
Some(p)
} else {
Expand All @@ -241,11 +242,10 @@ impl BpeTrainer {
}

/// Update the progress bar with the new provided length and message
fn update_progress(&self, p: &Option<ProgressBar>, len: usize, message: &str) {
fn update_progress(&self, p: &Option<ProgressBar>, len: usize, message: &'static str) {
if let Some(p) = p {
p.set_message(message);
p.set_length(len as u64);
p.set_draw_delta(len as u64 / 100);
p.reset();
}
}
Expand Down
6 changes: 3 additions & 3 deletions tokenizers/src/models/unigram/trainer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,8 @@ impl UnigramTrainer {
let p = ProgressBar::new(0);
p.set_style(
ProgressStyle::default_bar()
.template("[{elapsed_precise}] {msg:<40!} {wide_bar} {pos:<9!}/{len:>9!}"),
.template("[{elapsed_precise}] {msg:<30!} {wide_bar} {pos:<9!}/{len:>9!}")
.expect("Invalid progress template"),
);
Some(p)
} else {
Expand Down Expand Up @@ -431,11 +432,10 @@ impl UnigramTrainer {
}

/// Update the progress bar with the new provided length and message
fn update_progress(&self, p: &Option<ProgressBar>, len: usize, message: &str) {
fn update_progress(&self, p: &Option<ProgressBar>, len: usize, message: &'static str) {
if let Some(p) = p {
p.set_message(message);
p.set_length(len as u64);
p.set_draw_delta(len as u64 / 100);
p.reset();
}
}
Expand Down
12 changes: 8 additions & 4 deletions tokenizers/src/tokenizer/added_vocabulary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -166,10 +166,12 @@ impl AddedVocabulary {
pub fn new() -> Self {
let trie = AhoCorasickBuilder::new()
.match_kind(MatchKind::LeftmostLongest)
.build::<_, &&[u8]>([]);
.build::<_, &&[u8]>([])
.expect("The trie should build correctly");
let normalized_trie = AhoCorasickBuilder::new()
.match_kind(MatchKind::LeftmostLongest)
.build::<_, &&[u8]>([]);
.build::<_, &&[u8]>([])
.expect("The normalized trie should build correctly");
Self {
added_tokens_map: HashMap::new(),
added_tokens_map_r: HashMap::new(),
Expand Down Expand Up @@ -314,7 +316,8 @@ impl AddedVocabulary {
let (tokens, ids): (Vec<&AddedToken>, Vec<u32>) = non_normalized.into_iter().unzip();
let trie = AhoCorasickBuilder::new()
.match_kind(MatchKind::LeftmostLongest)
.build(tokens.iter().map(|token| &token.content));
.build(tokens.iter().map(|token| &token.content))
.expect("Failed to build tried when refreshing tokens");
self.split_trie = (trie, ids);

let (ntokens, nids): (Vec<&AddedToken>, Vec<u32>) = normalized.into_iter().unzip();
Expand All @@ -330,7 +333,8 @@ impl AddedVocabulary {
.collect();
let normalized_trie = AhoCorasickBuilder::new()
.match_kind(MatchKind::LeftmostLongest)
.build(patterns.iter().map(|content| content.get()));
.build(patterns.iter().map(|content| content.get()))
.expect("Failed to build tried when refreshing tokens (normalized)");
self.split_normalized_trie = (normalized_trie, nids);
}

Expand Down
15 changes: 5 additions & 10 deletions tokenizers/src/tokenizer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1078,11 +1078,11 @@ where
let progress = ProgressBar::new(len);
progress.set_style(
ProgressStyle::default_bar()
.template("[{elapsed_precise}] {msg:<40!} {wide_bar} {percent:>18!}%"),
.template("[{elapsed_precise}] {msg:<30!} {wide_bar} {percent:>18!}%")
.expect("Invalid progress template"),
);
progress
.set_message(&format!("Pre-processing files ({:.2} Mo)", len / 1_000_000));
progress.set_draw_delta(len / 100); // Redraw only every 1%
.set_message(format!("Pre-processing files ({:.2} Mo)", len / 1_000_000));
Some(progress)
} else {
None
Expand Down Expand Up @@ -1131,15 +1131,10 @@ where
let progress = ProgressBar::new(len);
progress.set_style(
ProgressStyle::default_bar()
.template("[{elapsed_precise}] {msg:<40!} {wide_bar} {pos:<9!}/{len:>9!}"),
.template("[{elapsed_precise}] {msg:<30!} {wide_bar} {pos:<9!}/{len:>9!}")
.expect("Invalid progress template"),
);
progress.set_message("Pre-processing sequences");
if len > 0 {
progress.set_draw_delta(len / 100); // Redraw only every 1%
} else {
// Trying to have a good default to avoid progress tracking being the bottleneck
progress.set_draw_delta(1000);
}
Some(progress)
} else {
None
Expand Down
8 changes: 4 additions & 4 deletions tokenizers/src/utils/progress.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,15 @@ pub(crate) use indicatif::{ProgressBar, ProgressStyle};

#[cfg(not(feature = "progressbar"))]
mod progressbar {
use std::borrow::Cow;
pub struct ProgressBar;
impl ProgressBar {
pub fn new(_length: u64) -> Self {
Self {}
}

pub fn set_length(&self, _length: u64) {}
pub fn set_draw_delta(&self, _draw_delta: u64) {}
pub fn set_message(&self, _message: &str) {}
pub fn set_message(&self, _message: impl Into<Cow<'static, str>>) {}
pub fn finish(&self) {}
pub fn reset(&self) {}
pub fn inc(&self, _inc: u64) {}
Expand All @@ -23,8 +23,8 @@ mod progressbar {
pub fn default_bar() -> Self {
Self {}
}
pub fn template(self, _template: &str) -> Self {
self
pub fn template(self, _template: &str) -> Result<Self, String> {
Ok(self)
}
}
}
Expand Down

0 comments on commit aed491d

Please sign in to comment.