Skip to content
2 changes: 2 additions & 0 deletions harper-core/dictionary.dict
Original file line number Diff line number Diff line change
Expand Up @@ -1207,6 +1207,8 @@ Beveridge/O
Beverley/Og
Beverly/Og
Beyer/Og
Bézier/Og
Bezier/Og
Bharat/Og
Bhopal/Og
Bhutan/Og
Expand Down
126 changes: 107 additions & 19 deletions harper-core/src/document.rs
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,8 @@ impl Document {
self.condense_dotted_initialisms();
self.condense_number_suffixes();
self.condense_ellipsis();
self.condense_latin();
self.condense_dotted_latin();
self.condense_loan_phrases();
self.condense_filename_extensions();
self.condense_tldr();
self.condense_ampersand_pairs();
Expand Down Expand Up @@ -439,11 +440,13 @@ impl Document {
self.tokens.remove_indices(remove_these);
}

// Dotted Latin expressions such as etc. vs. et al.

thread_local! {
static LATIN_EXPR: Lrc<FirstMatchOf> = Document::uncached_latin_expr();
static DOTTED_LATIN_EXPR: Lrc<FirstMatchOf> = Document::uncached_dotted_latin_expr();
}

fn uncached_latin_expr() -> Lrc<FirstMatchOf> {
fn uncached_dotted_latin_expr() -> Lrc<FirstMatchOf> {
Lrc::new(FirstMatchOf::new(vec![
Box::new(
SequenceExpr::default()
Expand All @@ -459,27 +462,73 @@ impl Document {
]))
}

/// Assumes that the first matched token is the canonical one to be condensed into.
/// Takes a callback that can be used to retroactively edit the canonical token afterwards.
fn condense_expr<F>(&mut self, expr: &impl Expr, edit: F)
where
F: Fn(&mut Token),
{
let matches = expr.iter_matches_in_doc(self).collect::<Vec<_>>();
/// Collapses every match of the cached dotted-Latin expression into its first token.
fn condense_dotted_latin(&mut self) {
    // Take a handle to this thread's cached expression.
    let dotted_latin = Self::DOTTED_LATIN_EXPR.with(|cached| cached.clone());

    // No post-merge adjustment of the surviving token is needed, so the callback is a no-op.
    self.condense_expr(&dotted_latin, |_| {});
}

let mut remove_indices = VecDeque::with_capacity(matches.len());
// Loan phrases such as en masse

for m in matches {
remove_indices.extend(m.start + 1..m.end);
self.tokens[m.start].span = self.tokens[m.into_iter()].span().unwrap();
edit(&mut self.tokens[m.start]);
}
thread_local! {
// Per-thread cache of the loan-phrase expression, initialised from
// `Document::uncached_loan_phrases_expr`. Callers clone the `Lrc` handle out via `.with(..)`.
static LOAN_PHRASES_EXPR: Lrc<FirstMatchOf> = Document::uncached_loan_phrases_expr();
}

/// Builds the expression that recognises multi-word loan phrases such as "en masse".
///
/// Every phrase in the table is turned into its own [`SequenceExpr`]: the words are matched
/// in order via `t_aco`, with a whitespace token required between consecutive words. The
/// resulting sequences are combined into a single [`FirstMatchOf`].
fn uncached_loan_phrases_expr() -> Lrc<FirstMatchOf> {
    // Hyphenated entries are kept in the table but are currently disabled.
    const LOAN_PHRASES: &[&str] = &[
        "ad nauseam",
        "alma mater",
        // "avant-garde",
        "bona fide",
        "contra proferentem",
        // "cul-de-sac",
        "de facto",
        "de jure",
        "de minimis",
        "déjà vu",
        "deja vu",
        "en masse",
        "gung ho",
        "habeas corpus",
        "in personam",
        "in situ",
        "inter alia",
        "ipso facto",
        "kung fu",
        "mutatis mutandis",
        "pari passu",
        "Pax Americana",
        "per annum",
        "per capita",
        "per definitionem",
        "per diem",
        "per se",
        "prima facie",
        "pro rata",
        "quid pro quo",
        "sui generis",
        "tai chi",
        // "vis-à-vis",
    ];

    let alternatives = LOAN_PHRASES
        .iter()
        .filter_map(|phrase| {
            let mut words = phrase.split_whitespace();
            // A phrase with no words at all contributes no alternative.
            let first = words.next()?;

            // Start with the first word, then require "<whitespace><word>" for each remaining one.
            let seq = words.fold(SequenceExpr::default().t_aco(first), |seq, word| {
                seq.then_whitespace().t_aco(word)
            });

            Some(Box::new(seq) as Box<dyn Expr>)
        })
        .collect();

    Lrc::new(FirstMatchOf::new(alternatives))
}

fn condense_latin(&mut self) {
self.condense_expr(&Self::LATIN_EXPR.with(|v| v.clone()), |_| {})
/// Collapses every match of the cached loan-phrase expression into its first token.
fn condense_loan_phrases(&mut self) {
    // Take a handle to this thread's cached expression.
    let loan_phrases = Self::LOAN_PHRASES_EXPR.with(|cached| cached.clone());

    // No post-merge adjustment of the surviving token is needed, so the callback is a no-op.
    self.condense_expr(&loan_phrases, |_| {});
}

/// Searches for multiple sequential newline tokens and condenses them down
Expand Down Expand Up @@ -784,6 +833,8 @@ impl Document {
);
}

// Ellipsis: ...

fn uncached_ellipsis_pattern() -> Lrc<Repeating> {
let period = SequenceExpr::default().then_period();
Lrc::new(Repeating::new(Box::new(period), 2))
Expand All @@ -800,6 +851,8 @@ impl Document {
});
}

// Contractions

fn uncached_contraction_expr() -> Lrc<SequenceExpr> {
Lrc::new(
SequenceExpr::default()
Expand All @@ -820,6 +873,25 @@ impl Document {

self.condense_expr(&expr, |_| {})
}

/// Merges each match of `expr` into a single token.
///
/// The first token of a match is treated as the canonical one: its span is widened to cover
/// the whole match, `edit` is then invoked on it so the caller can adjust it afterwards, and
/// all remaining tokens of the match are removed from the document.
fn condense_expr<F>(&mut self, expr: &impl Expr, edit: F)
where
    F: Fn(&mut Token),
{
    // Collect the matches up front, before any token is modified.
    let found = expr.iter_matches_in_doc(self).collect::<Vec<_>>();
    let mut doomed = VecDeque::with_capacity(found.len());

    for hit in found {
        let (head, tail) = (hit.start, hit.end);

        // Span covering every token in the match.
        let merged = self.tokens[hit.into_iter()].span().unwrap();

        let canonical = &mut self.tokens[head];
        canonical.span = merged;
        edit(canonical);

        // Everything after the canonical token is dropped once all matches are processed.
        doomed.extend(head + 1..tail);
    }

    self.tokens.remove_indices(doomed);
}
}

/// Creates functions necessary to implement [`TokenStringExt`] on a document.
Expand Down Expand Up @@ -1249,6 +1321,22 @@ mod tests {
assert!(doc.tokens[1].kind.is_ampersand() || doc.tokens[7].kind.is_ampersand());
}

/// Both loan phrases in the sentence must end up as single tokens after parsing.
#[test]
fn condense_loan_phrases() {
    let doc = Document::new_plain_english_curated(
        "the 5 indictment case can be reinstated if he feels Adams is not complying with whatever the alleged details are in the largely speculated quid pro quo arrangement of deporting certain immigrants en masse",
    );

    // True when some token's source text is exactly `wanted`.
    let has_token = |wanted: &str| {
        doc.tokens
            .iter()
            .any(|tok| tok.span.get_content_string(&doc.source).as_str() == wanted)
    };

    let quid_pro_quo = has_token("quid pro quo");
    let en_masse = has_token("en masse");
    assert!(quid_pro_quo && en_masse);
}

#[test]
fn condense_io() {
let doc = Document::new_plain_english_curated("I/O");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -670,17 +670,6 @@ Message: |



Lint: Spelling (63 priority)
Message: |
271 | The Privilege of the Writ of Habeas Corpus shall not be suspended, unless when
| ^~~~~~ Did you mean to spell `Habeas` this way?
Suggest:
- Replace with: “Haber's”
- Replace with: “Hale's”
- Replace with: “Hebe's”



Lint: Spelling (63 priority)
Message: |
274 | No Bill of Attainder or ex post facto Law shall be passed.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -599,7 +599,7 @@
>
#
> The Privilege of the Writ of Habeas Corpus shall not be suspended , unless when
# D NSg/VB P D NSg/VB P ? NSg+ VXB NSg/C NSg/VXB VP/J . C NSg/I/C
# D NSg/VB P D NSg/VB P NSg VXB NSg/C NSg/VXB VP/J . C NSg/I/C
> in Cases of Rebellion or Invasion the public Safety may require it .
# NPr/J/P NPl/V3 P N🅪Sg+ NPr/C NSg D Nᴹ/VB/J N🅪Sg/VB+ NPr/VXB NSg/VB NPr/ISg+ .
>
Expand Down
Loading