Skip to content

Commit 3b43861

Browse files
committed
feat(core): improved performance by enhancing cache locality
1 parent 4b6f7e5 commit 3b43861

File tree

4 files changed

+168
-88
lines changed

4 files changed

+168
-88
lines changed

harper-core/src/linting/oxford_comma.rs

+2-5
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
use crate::{
2-
patterns::{EitherPattern, Pattern, SequencePattern},
2+
patterns::{Pattern, SequencePattern, WordSet},
33
Document, Token, TokenStringExt,
44
};
55

@@ -21,10 +21,7 @@ impl OxfordComma {
2121
))
2222
.then_noun_phrase()
2323
.then_whitespace()
24-
.then(Box::new(EitherPattern::new(vec![
25-
Box::new(SequencePattern::aco("and")),
26-
Box::new(SequencePattern::aco("or")),
27-
])))
24+
.then(Box::new(WordSet::all(&["and", "or"])))
2825
.then_whitespace()
2926
.then_noun_phrase(),
3027
}

harper-core/src/linting/proper_noun_capitalization_linters.rs

+71-83
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
11
use super::PatternLinter;
22
use super::{Lint, LintKind, Suggestion};
33
use crate::make_title_case;
4-
use crate::patterns::{EitherPattern, IsNotTitleCase, Pattern, SequencePattern};
4+
use crate::patterns::{EitherPattern, IsNotTitleCase, Pattern, SequencePattern, WordSet};
55
use crate::FstDictionary;
66
use crate::{Token, TokenStringExt};
77
use std::sync::Arc;
@@ -62,10 +62,7 @@ macro_rules! create_linter_for {
6262
create_linter_for!(
6363
Americas,
6464
SequencePattern::default()
65-
.then(Box::new(EitherPattern::new(vec![
66-
Box::new(SequencePattern::aco("South")),
67-
Box::new(SequencePattern::aco("North"))
68-
])))
65+
.then(Box::new(WordSet::all(&["South", "North",])))
6966
.then_whitespace()
7067
.t_aco("America"),
7168
"When referring to the continents, make sure to treat them as a proper noun."
@@ -74,10 +71,7 @@ create_linter_for!(
7471
create_linter_for!(
7572
Koreas,
7673
SequencePattern::default()
77-
.then(Box::new(EitherPattern::new(vec![
78-
Box::new(SequencePattern::aco("South")),
79-
Box::new(SequencePattern::aco("North"))
80-
])))
74+
.then(Box::new(WordSet::all(&["South", "North",])))
8175
.then_whitespace()
8276
.t_aco("Korea"),
8377
"When referring to the nations, make sure to treat them as a proper noun."
@@ -119,25 +113,27 @@ create_linter_for!(
119113
Box::new(
120114
SequencePattern::default()
121115
.then(Box::new(EitherPattern::new(vec![
122-
Box::new(SequencePattern::aco("Presidents'")),
123-
Box::new(SequencePattern::aco("Valentines")),
124-
Box::new(SequencePattern::aco("Christmas")),
125-
Box::new(SequencePattern::aco("Easter")),
126-
Box::new(SequencePattern::aco("Flag")),
127-
Box::new(SequencePattern::aco("Independence")),
128-
Box::new(SequencePattern::aco("Mothers'")),
129-
Box::new(SequencePattern::aco("New").t_aco("Years")),
130-
Box::new(SequencePattern::aco("Fathers'")),
131-
Box::new(SequencePattern::aco("Columbus")),
132-
Box::new(SequencePattern::aco("Thanksgiving")),
133-
Box::new(SequencePattern::aco("Memorial")),
134-
Box::new(SequencePattern::aco("May")),
135-
Box::new(SequencePattern::aco("Halloween")),
136-
Box::new(SequencePattern::aco("Tax")),
137-
Box::new(SequencePattern::aco("Parents")),
138-
Box::new(SequencePattern::aco("Veterans")),
139-
Box::new(SequencePattern::aco("Armistice")),
140-
Box::new(SequencePattern::aco("Groundhog")),
116+
Box::new(WordSet::all(&[
117+
"Presidents'",
118+
"Valentines",
119+
"Christmas",
120+
"Easter",
121+
"Flag",
122+
"Independence",
123+
"Mothers'",
124+
"Years",
125+
"Fathers'",
126+
"Columbus",
127+
"Thanksgiving",
128+
"Memorial",
129+
"May",
130+
"Halloween",
131+
"Tax",
132+
"Parents",
133+
"Veterans",
134+
"Armistice",
135+
"Groundhog"
136+
])),
141137
Box::new(
142138
SequencePattern::default()
143139
.t_aco("National")
@@ -249,30 +245,30 @@ create_linter_for!(
249245
SequencePattern::default()
250246
.t_aco("Google")
251247
.then_whitespace()
252-
.then(Box::new(EitherPattern::new(vec![
253-
Box::new(SequencePattern::aco("Search")),
254-
Box::new(SequencePattern::aco("Cloud")),
255-
Box::new(SequencePattern::aco("Maps")),
256-
Box::new(SequencePattern::aco("Docs")),
257-
Box::new(SequencePattern::aco("Sheets")),
258-
Box::new(SequencePattern::aco("Slides")),
259-
Box::new(SequencePattern::aco("Drive")),
260-
Box::new(SequencePattern::aco("Meet")),
261-
Box::new(SequencePattern::aco("Gmail")),
262-
Box::new(SequencePattern::aco("Calendar")),
263-
Box::new(SequencePattern::aco("Chrome")),
264-
Box::new(SequencePattern::aco("ChromeOS")),
265-
Box::new(SequencePattern::aco("Android")),
266-
Box::new(SequencePattern::aco("Play")),
267-
Box::new(SequencePattern::aco("Bard")),
268-
Box::new(SequencePattern::aco("Gemini")),
269-
Box::new(SequencePattern::aco("YouTube")),
270-
Box::new(SequencePattern::aco("Photos")),
271-
Box::new(SequencePattern::aco("Analytics")),
272-
Box::new(SequencePattern::aco("AdSense")),
273-
Box::new(SequencePattern::aco("Pixel")),
274-
Box::new(SequencePattern::aco("Nest")),
275-
Box::new(SequencePattern::aco("Workspace"))
248+
.then(Box::new(WordSet::all(&[
249+
"Search",
250+
"Cloud",
251+
"Maps",
252+
"Docs",
253+
"Sheets",
254+
"Slides",
255+
"Drive",
256+
"Meet",
257+
"Gmail",
258+
"Calendar",
259+
"Chrome",
260+
"ChromeOS",
261+
"Android",
262+
"Play",
263+
"Bard",
264+
"Gemini",
265+
"YouTube",
266+
"Photos",
267+
"Analytics",
268+
"AdSense",
269+
"Pixel",
270+
"Nest",
271+
"Workspace",
276272
]))),
277273
"When referring to Google products and services, make sure to treat them as proper nouns."
278274
);
@@ -357,20 +353,22 @@ create_linter_for!(
357353
.t_aco("Microsoft")
358354
.then_whitespace()
359355
.then(Box::new(EitherPattern::new(vec![
360-
Box::new(SequencePattern::aco("Windows")),
361-
Box::new(SequencePattern::aco("Office")),
362-
Box::new(SequencePattern::aco("Teams")),
363-
Box::new(SequencePattern::aco("Excel")),
364-
Box::new(SequencePattern::aco("PowerPoint")),
365-
Box::new(SequencePattern::aco("Word")),
366-
Box::new(SequencePattern::aco("Outlook")),
367-
Box::new(SequencePattern::aco("OneDrive")),
368-
Box::new(SequencePattern::aco("SharePoint")),
369-
Box::new(SequencePattern::aco("Xbox")),
370-
Box::new(SequencePattern::aco("Surface")),
371-
Box::new(SequencePattern::aco("Edge")),
372-
Box::new(SequencePattern::aco("Bing")),
373-
Box::new(SequencePattern::aco("Dynamics")),
356+
Box::new(WordSet::all(&[
357+
"Windows",
358+
"Office",
359+
"Teams",
360+
"Excel",
361+
"PowerPoint",
362+
"Word",
363+
"Outlook",
364+
"OneDrive",
365+
"SharePoint",
366+
"Xbox",
367+
"Surface",
368+
"Edge",
369+
"Bing",
370+
"Dynamics",
371+
])),
374372
Box::new(
375373
SequencePattern::default()
376374
.t_aco("Visual")
@@ -387,10 +385,10 @@ create_linter_for!(
387385
.t_aco("Apple")
388386
.then_whitespace()
389387
.then(Box::new(EitherPattern::new(vec![
390-
Box::new(SequencePattern::aco("iPhone")),
391-
Box::new(SequencePattern::aco("iPad")),
392-
Box::new(SequencePattern::aco("iMac")),
393-
Box::new(SequencePattern::aco("MacBook")),
388+
Box::new(WordSet::all(&[
389+
"iPhone", "iPad", "iMac", "MacBook", "Watch", "TV", "Music", "Arcade", "iCloud",
390+
"Safari", "HomeKit", "CarPlay",
391+
])),
394392
Box::new(
395393
SequencePattern::aco("MacBook")
396394
.then_whitespace()
@@ -414,14 +412,6 @@ create_linter_for!(
414412
.then_whitespace()
415413
.t_aco("Max")
416414
),
417-
Box::new(SequencePattern::aco("Watch")),
418-
Box::new(SequencePattern::aco("TV")),
419-
Box::new(SequencePattern::aco("Music")),
420-
Box::new(SequencePattern::aco("Arcade")),
421-
Box::new(SequencePattern::aco("iCloud")),
422-
Box::new(SequencePattern::aco("Safari")),
423-
Box::new(SequencePattern::aco("HomeKit")),
424-
Box::new(SequencePattern::aco("CarPlay")),
425415
Box::new(
426416
SequencePattern::default()
427417
.t_aco("Vision")
@@ -437,11 +427,9 @@ create_linter_for!(
437427
SequencePattern::aco("Meta")
438428
.then_whitespace()
439429
.then(Box::new(EitherPattern::new(vec![
440-
Box::new(SequencePattern::aco("Oculus")),
441-
Box::new(SequencePattern::aco("Portals")),
442-
Box::new(SequencePattern::aco("Quest")),
443-
Box::new(SequencePattern::aco("Gaming")),
444-
Box::new(SequencePattern::aco("Horizon")),
430+
Box::new(WordSet::all(&[
431+
"Oculus", "Portals", "Quest", "Gaming", "Horizon",
432+
])),
445433
Box::new(
446434
SequencePattern::default()
447435
.t_aco("Reality")

harper-core/src/patterns/mod.rs

+2
Original file line number | Diff line number | Diff line change
@@ -14,6 +14,7 @@ mod sequence_pattern;
1414
mod token_kind_pattern_group;
1515
mod whitespace_pattern;
1616
mod word_pattern_group;
17+
mod word_set;
1718

1819
pub use any_pattern::AnyPattern;
1920
use blanket::blanket;
@@ -28,6 +29,7 @@ pub use sequence_pattern::SequencePattern;
2829
pub use token_kind_pattern_group::TokenKindPatternGroup;
2930
pub use whitespace_pattern::WhitespacePattern;
3031
pub use word_pattern_group::WordPatternGroup;
32+
pub use word_set::WordSet;
3133

3234
#[cfg(not(feature = "concurrent"))]
3335
#[blanket(derive(Rc, Arc))]

harper-core/src/patterns/word_set.rs

+93
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,93 @@
1+
use super::Pattern;
2+
use smallvec::SmallVec;
3+
4+
use crate::{CharString, Token};
5+
6+
/// A [`Pattern`] that matches against any of a set of provided words.
7+
/// For small sets of short words, it doesn't allocate.
8+
///
9+
/// Matching ignores ASCII case, so any capitalization of the contained words will result in a match.
10+
#[derive(Debug, Default, Clone)]
11+
pub struct WordSet {
12+
/// The words to match against, each stored as a sequence of characters.
/// The `SmallVec` keeps up to four entries inline before spilling to the heap.
words: SmallVec<[CharString; 4]>,
13+
}
14+
15+
impl WordSet {
16+
/// Adds `word` to the set, unless an equal entry is already stored.
///
/// NOTE(review): the duplicate check relies on `CharString` equality, which is
/// presumably exact (case-sensitive), while `matches` compares ignoring ASCII
/// case. Differently-cased spellings of one word would therefore each be
/// stored — confirm whether that is intended.
pub fn add(&mut self, word: &str) {
17+
let chars = word.chars().collect();
18+
19+
if !self.words.contains(&chars) {
20+
self.words.push(chars);
21+
}
22+
}
23+
24+
/// Builds a set containing every word in `words`.
///
/// Each entry goes through [`Self::add`], so repeated entries are stored once.
pub fn all(words: &[&'static str]) -> Self {
25+
let mut set = Self::default();
26+
27+
for str in words {
28+
set.add(str);
29+
}
30+
31+
set
32+
}
33+
}
34+
35+
impl Pattern for WordSet {
36+
/// Tests the first token of `tokens` against the set.
///
/// Returns `1` when that token is a word whose content in `source` equals one
/// of the stored words, ignoring ASCII case. Returns `0` otherwise, including
/// when `tokens` is empty.
///
/// NOTE(review): the return value is presumably the number of tokens consumed
/// by the match — confirm against the `Pattern` trait.
fn matches(&self, tokens: &[Token], source: &[char]) -> usize {
37+
let Some(tok) = tokens.first() else {
38+
return 0;
39+
};
40+
41+
// Only word tokens can match; punctuation, whitespace, etc. are rejected up front.
if !tok.kind.is_word() {
42+
return 0;
43+
}
44+
45+
let tok_chars = tok.span.get_content(source);
46+
47+
for word in &self.words {
48+
// Cheap rejection. It also guarantees that the `zip` below pairs up
// every character of both sides rather than stopping at the shorter one.
if tok_chars.len() != word.len() {
49+
continue;
50+
}
51+
52+
// Lengths are equal here, so despite the name this compares the entire word.
// Only ASCII letters are case-folded; any other character must be identical.
let partial_match = tok_chars
53+
.iter()
54+
.zip(word)
55+
.all(|(a, b)| a.to_ascii_lowercase() == b.to_ascii_lowercase());
56+
57+
if partial_match {
58+
return 1;
59+
}
60+
}
61+
62+
// No stored word equals the token.
0
63+
}
64+
}
65+
66+
#[cfg(test)]
67+
mod tests {
68+
use crate::{patterns::DocPattern, Document, Span};
69+
70+
use super::WordSet;
71+
72+
// Lowercase input: only "banana" and "apple" should be reported; "orange" is
// in the set but absent from the sentence.
// NOTE(review): the expected spans look like token indices, with each word and
// each whitespace run counted as one token — confirm against `Span`/`DocPattern`.
#[test]
73+
fn fruit() {
74+
let set = WordSet::all(&["banana", "apple", "orange"]);
75+
76+
let doc = Document::new_markdown_curated("I ate a banana and an apple today.");
77+
78+
let matches = set.find_all_matches_in_doc(&doc);
79+
80+
assert_eq!(matches, vec![Span::new(6, 7), Span::new(12, 13)]);
81+
}
82+
83+
// Same sentence with scrambled capitalization: the same two spans are expected,
// pinning that matching ignores letter case.
#[test]
84+
fn fruit_whack_capitalization() {
85+
let set = WordSet::all(&["banana", "apple", "orange"]);
86+
87+
let doc = Document::new_markdown_curated("I Ate A bAnaNa And aN apPlE today.");
88+
89+
let matches = set.find_all_matches_in_doc(&doc);
90+
91+
assert_eq!(matches, vec![Span::new(6, 7), Span::new(12, 13)]);
92+
}
93+
}

0 commit comments

Comments
 (0)