A Rust wrapper for Google Tesseract
Add the following line to your Cargo.toml file:
rusty-tesseract = "1.1.10"
- Brings all relevant command-line tesseract functionality to Rust
- Partly based on the Python wrapper for tesseract (i.e. https://github.com/madmaze/pytesseract)
- Enables testing a pre-trained tesseract model and outputting the results in different formats such as strings, bounding boxes, dicts, or dataframes.
Tesseract: https://github.com/tesseract-ocr/tesseract
Create an `Image` object either from a file path or from a `DynamicImage` provided by the image crate (https://docs.rs/image/latest/image/).
// you can use the from_path function
let _ = Image::from_path("img/string.png");
// or instantiate Image from a DynamicImage
let dynamic_image = ImageReader::open("img/string.png")
.unwrap()
.decode()
.unwrap();
let img = Image::from_dynamic_image(&dynamic_image).unwrap();
Set tesseract parameters using the Args struct.
let default_args = Args::default();
// the default parameters are
/*
Args {
lang: "eng",
dpi: Some(150),
psm: Some(3),
oem: Some(3),
}
*/
// fill your own argument struct if needed
// Optional arguments are ignored if set to `None`
let mut my_args = Args {
//model language (tesseract default = 'eng')
//available languages can be found by running 'rusty_tesseract::get_tesseract_langs()'
lang: "eng",
//map of config variables
//this example shows a whitelist for the normal alphabet. Multiple arguments are allowed.
//available arguments can be found by running 'rusty_tesseract::get_tesseract_config_parameters()'
config_variables: HashMap::from([(
"tessedit_char_whitelist".into(),
"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ".into(),
)]),
dpi: Some(150), // specify DPI for input image
psm: Some(6), // define page segmentation mode 6 (i.e. "Assume a single uniform block of text")
oem: Some(3), // define OCR engine mode 3 (i.e. "Default, based on what is available")
};
Choose either string, bounding box or data output:
// define parameters
let mut my_args = Args {
lang: "eng",
config_variables: HashMap::from([(
"tessedit_char_whitelist".into(),
"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ".into(),
)]),
dpi: Some(150),
psm: Some(6),
oem: Some(3)
};
// string output
let output = rusty_tesseract::image_to_string(&img, &my_args).unwrap();
println!("The String output is: {:?}", output);
// image_to_boxes returns a BoxOutput containing the parsed output Tesseract produces with the "makebox" parameter
let box_output = rusty_tesseract::image_to_boxes(&img, &my_args).unwrap();
println!(
"The first boxfile symbol is: {}",
box_output.boxes[0].symbol
);
println!("The full boxfile output is:\n{}", box_output.output);
// image_to_data returns a DataOutput containing the parsed output Tesseract produces with the "TSV" parameter
let data_output = rusty_tesseract::image_to_data(&img, &my_args).unwrap();
let first_text_line = &data_output.data[4];
println!(
"The first text is '{}' with confidence {}",
first_text_line.text, first_text_line.conf
);
println!("The full data output is:\n{}", data_output.output);
//tesseract version
let tesseract_version = rusty_tesseract::get_tesseract_version().unwrap();
println!("The tesseract version is: {:?}", tesseract_version);
//available languages
let tesseract_langs = rusty_tesseract::get_tesseract_langs().unwrap();
println!("The available languages are: {:?}", tesseract_langs);
//available config parameters
let parameters = rusty_tesseract::get_tesseract_config_parameters().unwrap();
println!("Example config parameter: {}", parameters.config_parameters.first().unwrap());
- Fork the repository
- Create a new feature branch (`git checkout -b my-feature-branch-name`)
- Commit your new changes (`git commit -m 'commit message' <changed-file>`)
- Push changes to the branch (`git push origin my-feature-branch-name`)
- Create a Pull Request