diff --git a/harper-core/dictionary.dict b/harper-core/dictionary.dict index c975b64df..70306cc5a 100644 --- a/harper-core/dictionary.dict +++ b/harper-core/dictionary.dict @@ -1207,6 +1207,8 @@ Beveridge/O Beverley/Og Beverly/Og Beyer/Og +Bézier/Og +Bezier/Og Bharat/Og Bhopal/Og Bhutan/Og diff --git a/harper-core/src/document.rs b/harper-core/src/document.rs index 875e99893..cf03408c2 100644 --- a/harper-core/src/document.rs +++ b/harper-core/src/document.rs @@ -136,7 +136,8 @@ impl Document { self.condense_dotted_initialisms(); self.condense_number_suffixes(); self.condense_ellipsis(); - self.condense_latin(); + self.condense_dotted_latin(); + self.condense_loan_phrases(); self.condense_filename_extensions(); self.condense_tldr(); self.condense_ampersand_pairs(); @@ -439,11 +440,13 @@ impl Document { self.tokens.remove_indices(remove_these); } + // Dotted Latin expressions such as etc. vs. et al. + thread_local! { - static LATIN_EXPR: Lrc = Document::uncached_latin_expr(); + static DOTTED_LATIN_EXPR: Lrc = Document::uncached_dotted_latin_expr(); } - fn uncached_latin_expr() -> Lrc { + fn uncached_dotted_latin_expr() -> Lrc { Lrc::new(FirstMatchOf::new(vec![ Box::new( SequenceExpr::default() @@ -459,27 +462,73 @@ impl Document { ])) } - /// Assumes that the first matched token is the canonical one to be condensed into. - /// Takes a callback that can be used to retroactively edit the canonical token afterwards. - fn condense_expr(&mut self, expr: &impl Expr, edit: F) - where - F: Fn(&mut Token), - { - let matches = expr.iter_matches_in_doc(self).collect::>(); + fn condense_dotted_latin(&mut self) { + self.condense_expr(&Self::DOTTED_LATIN_EXPR.with(|v| v.clone()), |_| {}) + } - let mut remove_indices = VecDeque::with_capacity(matches.len()); + // Loan phrases such as en masse - for m in matches { - remove_indices.extend(m.start + 1..m.end); - self.tokens[m.start].span = self.tokens[m.into_iter()].span().unwrap(); - edit(&mut self.tokens[m.start]); - } + thread_local! { + static LOAN_PHRASES_EXPR: Lrc = Document::uncached_loan_phrases_expr(); + } + + fn uncached_loan_phrases_expr() -> Lrc { + Lrc::new(FirstMatchOf::new( + [ + "ad nauseam", + "alma mater", + // "avant-garde", + "bona fide", + "contra proferentem", + // "cul-de-sac", + "de facto", + "de jure", + "de minimis", + "déjà vu", + "deja vu", + "en masse", + "gung ho", + "habeas corpus", + "in personam", + "in situ", + "inter alia", + "ipso facto", + "kung fu", + "mutatis mutandis", + "pari passu", + "Pax Americana", + "per annum", + "per capita", + "per definitionem", + "per diem", + "per se", + "prima facie", + "pro rata", + "quid pro quo", + "sui generis", + "tai chi", + // "vis-à-vis", + ] + .iter() + .filter(|phrase| phrase.split_whitespace().count() != 0) + .map(|phrase| { + let words: Vec<&str> = phrase.split_whitespace().collect(); + let mut seq = SequenceExpr::default(); + if !words.is_empty() { + seq = seq.t_aco(words[0]); + for word in &words[1..] { + seq = seq.then_whitespace().t_aco(word); + } + } - self.tokens.remove_indices(remove_indices); + Box::new(seq) as Box + }) + .collect(), + )) } - fn condense_latin(&mut self) { - self.condense_expr(&Self::LATIN_EXPR.with(|v| v.clone()), |_| {}) + fn condense_loan_phrases(&mut self) { + self.condense_expr(&Self::LOAN_PHRASES_EXPR.with(|v| v.clone()), |_| {}) } /// Searches for multiple sequential newline tokens and condenses them down @@ -784,6 +833,8 @@ impl Document { ); } + // Ellipsis: ... + fn uncached_ellipsis_pattern() -> Lrc { let period = SequenceExpr::default().then_period(); Lrc::new(Repeating::new(Box::new(period), 2)) @@ -800,6 +851,8 @@ impl Document { }); } + // Contractions + fn uncached_contraction_expr() -> Lrc { Lrc::new( SequenceExpr::default() @@ -820,6 +873,25 @@ impl Document { self.condense_expr(&expr, |_| {}) } + + /// Assumes that the first matched token is the canonical one to be condensed into. + /// Takes a callback that can be used to retroactively edit the canonical token afterwards. + fn condense_expr(&mut self, expr: &impl Expr, edit: F) + where + F: Fn(&mut Token), + { + let matches = expr.iter_matches_in_doc(self).collect::>(); + + let mut remove_indices = VecDeque::with_capacity(matches.len()); + + for m in matches { + remove_indices.extend(m.start + 1..m.end); + self.tokens[m.start].span = self.tokens[m.into_iter()].span().unwrap(); + edit(&mut self.tokens[m.start]); + } + + self.tokens.remove_indices(remove_indices); + } } /// Creates functions necessary to implement [`TokenStringExt]` on a document. @@ -1249,6 +1321,22 @@ mod tests { assert!(doc.tokens[1].kind.is_ampersand() || doc.tokens[7].kind.is_ampersand()); } + #[test] + fn condense_loan_phrases() { + let doc = Document::new_plain_english_curated( + "the 5 indictment case can be reinstated if he feels Adams is not complying with whatever the alleged details are in the largely speculated quid pro quo arrangement of deporting certain immigrants en masse", + ); + let (mut quid_pro_quo, mut en_masse) = (false, false); + for tok in &doc.tokens { + match tok.span.get_content_string(&doc.source).as_str() { + "quid pro quo" => quid_pro_quo = true, + "en masse" => en_masse = true, + _ => {} + } + } + assert!(quid_pro_quo && en_masse); + } + #[test] fn condense_io() { let doc = Document::new_plain_english_curated("I/O"); diff --git a/harper-core/tests/text/linters/The Constitution of the United States.snap.yml b/harper-core/tests/text/linters/The Constitution of the United States.snap.yml index 8ee577349..5294feda9 100644 --- a/harper-core/tests/text/linters/The Constitution of the United States.snap.yml +++ b/harper-core/tests/text/linters/The Constitution of the United States.snap.yml @@ -670,17 +670,6 @@ Message: | -Lint: Spelling (63 priority) -Message: | - 271 | The Privilege of the Writ of Habeas Corpus shall not be suspended, unless when - | ^~~~~~ Did you mean to spell `Habeas` this way? -Suggest: - - Replace with: “Haber's” - - Replace with: “Hale's” - - Replace with: “Hebe's” - - - Lint: Spelling (63 priority) Message: | 274 | No Bill of Attainder or ex post facto Law shall be passed. diff --git a/harper-core/tests/text/tagged/The Constitution of the United States.md b/harper-core/tests/text/tagged/The Constitution of the United States.md index 596821527..9e80b645f 100644 --- a/harper-core/tests/text/tagged/The Constitution of the United States.md +++ b/harper-core/tests/text/tagged/The Constitution of the United States.md @@ -599,7 +599,7 @@ > # > The Privilege of the Writ of Habeas Corpus shall not be suspended , unless when -# D NSg/VB P D NSg/VB P ? NSg+ VXB NSg/C NSg/VXB VP/J . C NSg/I/C +# D NSg/VB P D NSg/VB P NSg VXB NSg/C NSg/VXB VP/J . C NSg/I/C > in Cases of Rebellion or Invasion the public Safety may require it . # NPr/J/P NPl/V3 P N🅪Sg+ NPr/C NSg D Nᴹ/VB/J N🅪Sg/VB+ NPr/VXB NSg/VB NPr/ISg+ . >