-
Notifications
You must be signed in to change notification settings - Fork 6
/
remove_stop_words_with_regex.rs
33 lines (27 loc) · 1.24 KB
/
remove_stop_words_with_regex.rs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
use human_regex::{exactly, one_or_more, or, punctuation, whitespace, word_boundary};
use stop_words::{get, LANGUAGE};
/// Reads `examples/foreword.txt`, prints it, removes English stop words with
/// regular expressions, and prints the cleaned text.
///
/// The body is compiled only when the `nltk` or `iso` feature is enabled and the
/// `constructed` feature is not; with any other feature set `main` does nothing.
///
/// # Panics
///
/// Panics with "Cannot read file" if `examples/foreword.txt` cannot be read.
fn main() {
    #[cfg(all(any(feature = "nltk", feature = "iso"), not(feature = "constructed")))]
    {
        // Read in a file
        let document = std::fs::read_to_string("examples/foreword.txt").expect("Cannot read file");

        // Print the contents
        println!("Original text:\n{}", document);

        // Get the stopwords
        let words = get(LANGUAGE::English);

        // Lowercase the text and delete every run of punctuation so that a stop
        // word is only ever followed by whitespace or the end of the text.
        let lowercase_doc = document.to_ascii_lowercase();
        let regex_for_punctuation = one_or_more(punctuation());
        let text_without_punctuation = regex_for_punctuation
            .to_regex()
            .replace_all(&lowercase_doc, "");

        // Match a whole stop word together with any whitespace that follows it.
        // The trailing whitespace is optional (zero or more) so that a stop word
        // sitting at the very end of the text is removed as well; the word
        // boundaries on both sides still prevent matches inside longer words.
        let regex_for_stop_words = word_boundary()
            + exactly(1, or(&words))
            + word_boundary()
            + human_regex::zero_or_more(whitespace());

        // Remove stop words
        let clean_text = regex_for_stop_words
            .to_regex()
            .replace_all(&text_without_punctuation, "");
        println!("\nClean text:\n{}", clean_text);
    }
}