From 21b3517c9cc2ed5d34260e1aeaa1abec055e3c3d Mon Sep 17 00:00:00 2001 From: hippietrail Date: Wed, 12 Mar 2025 00:54:04 +0800 Subject: [PATCH 1/2] feat: split-words linter counterpart to merge-words --- harper-comments/tests/language_support.rs | 2 +- .../javadoc_complex.java | 2 +- .../tests/language_support_sources/jsdoc.ts | 6 +- .../multiline_comments.cpp | 2 +- .../multiline_comments.ts | 2 +- harper-core/src/linting/lint_group.rs | 2 + harper-core/src/linting/mod.rs | 2 + harper-core/src/linting/split_words.rs | 128 ++++++++++++++++++ .../tests/test_sources/chinese_lorem_ipsum.md | 2 +- harper-core/tests/test_sources/pr_504.md | 2 +- ...omplex_document_with_spelling_mistakes.typ | 4 +- 11 files changed, 143 insertions(+), 11 deletions(-) create mode 100644 harper-core/src/linting/split_words.rs diff --git a/harper-comments/tests/language_support.rs b/harper-comments/tests/language_support.rs index fc2d48b88..78f566716 100644 --- a/harper-comments/tests/language_support.rs +++ b/harper-comments/tests/language_support.rs @@ -49,7 +49,7 @@ create_test!(merged_lines.ts, 1); create_test!(javadoc_clean_simple.java, 0); create_test!(javadoc_complex.java, 4); create_test!(issue_132.rs, 1); -create_test!(laravel_app.php, 2); +create_test!(laravel_app.php, 3); // These are to make sure nothing crashes. create_test!(empty.js, 0); diff --git a/harper-comments/tests/language_support_sources/javadoc_complex.java b/harper-comments/tests/language_support_sources/javadoc_complex.java index 054f4c464..2b79c0ec5 100644 --- a/harper-comments/tests/language_support_sources/javadoc_complex.java +++ b/harper-comments/tests/language_support_sources/javadoc_complex.java @@ -17,7 +17,7 @@ public static void main(String[] args) { } /** - * This doc has a link in it: {@link this sould b ignor} but not tis + * This doc has a link in it: {@link this sould b ignor} but not thsi * * @param name this is anoher test. */ diff --git a/harper-comments/tests/language_support_sources/jsdoc.ts b/harper-comments/tests/language_support_sources/jsdoc.ts index f46178f33..d9aac8587 100644 --- a/harper-comments/tests/language_support_sources/jsdoc.ts +++ b/harper-comments/tests/language_support_sources/jsdoc.ts @@ -1,14 +1,14 @@ /** This is a doc comment. - * Since there are no keywords it _sould_ be checked. */ + * Since there are no keywords it _shuld_ be checked. */ function test(){} /** This is also a doc comment. * @class this sould be unchecked. */ class Clazz { } -/** Here is another example: {@link this sould also b unchecked}. But this _sould_ be.*/ +/** Here is another example: {@link this sould also b unchecked}. But this _shuold_ be.*/ -/** However, tis should be checked, while {@link tis should not} */ +/** However, thsi should be checked, while {@link tis should not} */ /** * The following examples should be ignored by Harper. diff --git a/harper-comments/tests/language_support_sources/multiline_comments.cpp b/harper-comments/tests/language_support_sources/multiline_comments.cpp index 5eeeffcb0..65dcde19a 100644 --- a/harper-comments/tests/language_support_sources/multiline_comments.cpp +++ b/harper-comments/tests/language_support_sources/multiline_comments.cpp @@ -9,4 +9,4 @@ int test() {} */ int arbitrary() {} -/// Let's aadd a cuple spelling errors for good measure. +/// Let's putin a cuple spelling errors for good measure. diff --git a/harper-comments/tests/language_support_sources/multiline_comments.ts b/harper-comments/tests/language_support_sources/multiline_comments.ts index 07f3a8cfa..e5c4939c0 100644 --- a/harper-comments/tests/language_support_sources/multiline_comments.ts +++ b/harper-comments/tests/language_support_sources/multiline_comments.ts @@ -9,5 +9,5 @@ function test() {} */ function arbitrary() {} -// Let's aadd a cuple spelling errors for good measure. +// Let's putin a cuple spelling errors for good measure. diff --git a/harper-core/src/linting/lint_group.rs b/harper-core/src/linting/lint_group.rs index 9eeba6f89..0f6128eaa 100644 --- a/harper-core/src/linting/lint_group.rs +++ b/harper-core/src/linting/lint_group.rs @@ -53,6 +53,7 @@ use super::somewhat_something::SomewhatSomething; use super::spaces::Spaces; use super::spell_check::SpellCheck; use super::spelled_numbers::SpelledNumbers; +use super::split_words::SplitWords; use super::terminating_conjunctions::TerminatingConjunctions; use super::that_which::ThatWhich; use super::then_than::ThenThan; @@ -327,6 +328,7 @@ impl LintGroup { insert_pattern_rule!(ThatWhich, true); insert_struct_rule!(CapitalizePersonalPronouns, true); insert_struct_rule!(MergeWords, true); + insert_struct_rule!(SplitWords, true); insert_pattern_rule!(PluralConjugate, false); insert_struct_rule!(OxfordComma, true); insert_struct_rule!(NoOxfordComma, false); diff --git a/harper-core/src/linting/mod.rs b/harper-core/src/linting/mod.rs index 5a3728412..c854cde47 100644 --- a/harper-core/src/linting/mod.rs +++ b/harper-core/src/linting/mod.rs @@ -55,6 +55,7 @@ mod somewhat_something; mod spaces; mod spell_check; mod spelled_numbers; +mod split_words; mod suggestion; mod terminating_conjunctions; mod that_which; @@ -114,6 +115,7 @@ pub use somewhat_something::SomewhatSomething; pub use spaces::Spaces; pub use spell_check::SpellCheck; pub use spelled_numbers::SpelledNumbers; +pub use split_words::SplitWords; pub use suggestion::Suggestion; pub use terminating_conjunctions::TerminatingConjunctions; pub use that_which::ThatWhich; diff --git a/harper-core/src/linting/split_words.rs b/harper-core/src/linting/split_words.rs new file mode 100644 index 000000000..961ee56e5 --- /dev/null +++ b/harper-core/src/linting/split_words.rs @@ -0,0 +1,128 @@ +use std::sync::Arc; + +use crate::{CharString, Dictionary, Document, FstDictionary}; + +use super::{Lint, LintKind, Linter, Suggestion}; + +pub struct SplitWords { + dict: Arc, +} + +impl SplitWords { + pub fn new() -> Self { + Self { + dict: FstDictionary::curated(), + } + } +} + +impl Default for SplitWords { + fn default() -> Self { + Self::new() + } +} + +impl Linter for SplitWords { + fn lint(&mut self, document: &Document) -> Vec { + let mut lints = Vec::new(); + + let (mut word1, mut word2) = (CharString::new(), CharString::new()); + + for w in document.tokens() { + if !w.kind.is_word() { + continue; + } + + if w.span.len() < 2 { + continue; + } + + let w_chars = document.get_span_content(w.span); + + if self.dict.contains_word(w_chars) { + continue; + } + + let mut found = false; + + for i in 1..w_chars.len() { + let midpoint = w_chars.len() / 2; + let midpoint = if i & 1 == 0 { + midpoint + i / 2 + } else { + midpoint - i / 2 + }; + + let first_half = &w_chars[..midpoint]; + let second_half = &w_chars[midpoint..]; + + word1.clear(); + word1.extend_from_slice(first_half); + word2.clear(); + word2.extend_from_slice(second_half); + + if self.dict.contains_exact_word(&word1) && self.dict.contains_exact_word(&word2) { + let mut open = word1.clone(); + open.push(' '); + open.extend_from_slice(second_half); + + lints.push(Lint { + span: w.span, + lint_kind: LintKind::WordChoice, + suggestions: vec![Suggestion::ReplaceWith(open.to_vec())], + message: "It seems this is actually two words joined together.".to_owned(), + priority: 63, + }); + found = true; + } + + // The following logic won't be useful unless and until hyphenated words are added to the dictionary + + let mut hyphenated = word1.clone(); + hyphenated.push('-'); + hyphenated.extend_from_slice(second_half); + + if self.dict.contains_exact_word(&hyphenated) { + lints.push(Lint { + span: w.span, + lint_kind: LintKind::WordChoice, + suggestions: vec![Suggestion::ReplaceWith(hyphenated.to_vec())], + message: "It seems this is actually two words joined together.".to_owned(), + priority: 63, + }); + found = true; + } + + if found { + break; + } + } + } + lints + } + + fn description(&self) -> &str { + "Accidentally forgetting a space between words is common. This rule looks for valid words that are joined together without whitespace." + } +} + +#[cfg(test)] +mod tests { + use crate::linting::tests::{assert_lint_count, assert_suggestion_result}; + + use super::SplitWords; + + #[test] + fn heretofore() { + assert_lint_count( + "onetwo threefour fivesix seveneight nineten.", + SplitWords::default(), + 5, + ); + } + + #[test] + fn foobar() { + assert_suggestion_result("moreso", SplitWords::default(), "more so"); + } +} diff --git a/harper-core/tests/test_sources/chinese_lorem_ipsum.md b/harper-core/tests/test_sources/chinese_lorem_ipsum.md index 75c60f701..371e4eac2 100644 --- a/harper-core/tests/test_sources/chinese_lorem_ipsum.md +++ b/harper-core/tests/test_sources/chinese_lorem_ipsum.md @@ -1,4 +1,4 @@ -The following text was generated using [a Chinese lorem ipsum generator](https://pinkylam.me/generator/chinese-lorem-ipsum/). +The following text was generated using [a Chinese lorrm ipsum generator](https://pinkylam.me/generator/chinese-lorem-ipsum/). 食棵支每躲種。奶象打星爪子二細喜才記行在發像原斤!頁固點子衣點豆看身蝴看苗急午公何足,筆娘經色蝶行元香也要。麻了綠尼固世,色北書目登功;因告黑。 diff --git a/harper-core/tests/test_sources/pr_504.md b/harper-core/tests/test_sources/pr_504.md index 3ff95566b..915c9e9d1 100644 --- a/harper-core/tests/test_sources/pr_504.md +++ b/harper-core/tests/test_sources/pr_504.md @@ -4,4 +4,4 @@ These say "This is in Greek/Georgian/Thai" in those languages: ეს ქართულად. นี่มันภาษาไทย -This is English with misstakes. +This is English with erors. diff --git a/harper-typst/tests/test_sources/complex_document_with_spelling_mistakes.typ b/harper-typst/tests/test_sources/complex_document_with_spelling_mistakes.typ index 47d0bbf16..4738c71c0 100644 --- a/harper-typst/tests/test_sources/complex_document_with_spelling_mistakes.typ +++ b/harper-typst/tests/test_sources/complex_document_with_spelling_mistakes.typ @@ -55,7 +55,7 @@ #titleblock( title: "A fluid dynamic model for glaier flow", authors: ("Grant Lemons", "John Doe", "Jane Doe"), - abstract: lorem(80), + abstract: lorrm(80), doc, ) ] @@ -63,5 +63,5 @@ = Introduction #lorem(300) -= Related ork += Related wrk #lorem(200) From d5a3dd5c72d433fc2e8b2f5b95c3a5bc729cdb36 Mon Sep 17 00:00:00 2001 From: hippietrail Date: Tue, 6 May 2025 22:39:50 +0700 Subject: [PATCH 2/2] fix: commit split_words.rs change --- harper-core/src/linting/split_words.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/harper-core/src/linting/split_words.rs b/harper-core/src/linting/split_words.rs index 961ee56e5..b629432c4 100644 --- a/harper-core/src/linting/split_words.rs +++ b/harper-core/src/linting/split_words.rs @@ -37,7 +37,7 @@ impl Linter for SplitWords { continue; } - let w_chars = document.get_span_content(w.span); + let w_chars = document.get_span_content(&w.span); if self.dict.contains_word(w_chars) { continue;