feat: split-words linter counterpart to merge-words #885
@@ -0,0 +1,128 @@
    use std::sync::Arc;

    use crate::{CharString, Dictionary, Document, FstDictionary};

    use super::{Lint, LintKind, Linter, Suggestion};

    pub struct SplitWords {
        dict: Arc<FstDictionary>,
    }

    impl SplitWords {
        pub fn new() -> Self {
            Self {
                dict: FstDictionary::curated(),
            }
        }
    }

    impl Default for SplitWords {
        fn default() -> Self {
            Self::new()
        }
    }

    impl Linter for SplitWords {
        fn lint(&mut self, document: &Document) -> Vec<Lint> {
            let mut lints = Vec::new();

            // Scratch buffers reused across iterations to avoid reallocating.
            let (mut word1, mut word2) = (CharString::new(), CharString::new());

            for w in document.tokens() {
                if !w.kind.is_word() {
                    continue;
                }

                // A token needs at least two characters to be splittable.
                if w.span.len() < 2 {
                    continue;
                }

                let w_chars = document.get_span_content(&w.span);

                // Only tokens missing from the dictionary are candidates.
                if self.dict.contains_word(w_chars) {
                    continue;
                }

                let mut found = false;

                for i in 1..w_chars.len() {
This looks like [...]

It's definitely never going to be free. Heuristics and optimization can probably help. Benching now...

We could have heuristics such as only doing this for words over a certain length, and only trying splits at several points near the middle. I didn't try to get rid of all allocations and clones, use slices rather than strings as much as possible, etc. I think there's a trade-off. We could also have it turned off by default?
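A rough sketch of what those guards might look like, kept separate from the linter itself. The constants and the capped zigzag are illustrative assumptions, not benchmarked values:

```rust
// Hypothetical heuristics for limiting split-point probing. The thresholds
// are placeholder assumptions, not tuned against a benchmark.
const MIN_SPLIT_LEN: usize = 6; // skip short tokens entirely
const MAX_PROBES: usize = 4; // only try a few points near the middle

fn candidate_split_points(len: usize) -> Vec<usize> {
    if len < MIN_SPLIT_LEN {
        return Vec::new();
    }
    let midpoint = len / 2;
    // Same zigzag as the loop in the diff, but capped at MAX_PROBES offsets.
    (1..=MAX_PROBES)
        .map(|i| if i & 1 == 0 { midpoint + i / 2 } else { midpoint - i / 2 })
        .collect()
}

fn main() {
    // For a 9-character token, probe only split points 4, 5, 3, 6.
    assert_eq!(candidate_split_points(9), vec![4, 5, 3, 6]);
    // Below the length threshold, nothing is probed at all.
    assert!(candidate_split_points(4).is_empty());
}
```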
                    // Zigzag outward from the token's middle: try the midpoint
                    // first, then one to the right, one to the left, and so on.
                    let midpoint = w_chars.len() / 2;
                    let midpoint = if i & 1 == 0 {
                        midpoint + i / 2
                    } else {
                        midpoint - i / 2
                    };

                    let first_half = &w_chars[..midpoint];
                    let second_half = &w_chars[midpoint..];

                    word1.clear();
                    word1.extend_from_slice(first_half);
                    word2.clear();
                    word2.extend_from_slice(second_half);

                    if self.dict.contains_exact_word(&word1) && self.dict.contains_exact_word(&word2) {
                        // Both halves are dictionary words: suggest the open
                        // compound, i.e. the two halves joined by a space.
                        let mut open = word1.clone();
                        open.push(' ');
                        open.extend_from_slice(second_half);

                        lints.push(Lint {
                            span: w.span,
                            lint_kind: LintKind::WordChoice,
                            suggestions: vec![Suggestion::ReplaceWith(open.to_vec())],
                            message: "It seems this is actually two words joined together.".to_owned(),
                            priority: 63,
                        });
                        found = true;
                    }

                    // The following logic won't be useful unless and until
                    // hyphenated words are added to the dictionary.

                    let mut hyphenated = word1.clone();
                    hyphenated.push('-');
                    hyphenated.extend_from_slice(second_half);

                    if self.dict.contains_exact_word(&hyphenated) {
                        lints.push(Lint {
                            span: w.span,
                            lint_kind: LintKind::WordChoice,
                            suggestions: vec![Suggestion::ReplaceWith(hyphenated.to_vec())],
                            message: "It seems this is actually two words joined together.".to_owned(),
                            priority: 63,
                        });
                        found = true;
                    }

                    if found {
                        break;
                    }
                }
            }
            lints
        }

        fn description(&self) -> &str {
            "Accidentally forgetting a space between words is common. This rule looks for valid words that are joined together without whitespace."
        }
    }

    #[cfg(test)]
    mod tests {
        use crate::linting::tests::{assert_lint_count, assert_suggestion_result};

        use super::SplitWords;

        #[test]
        fn heretofore() {
            assert_lint_count(
                "onetwo threefour fivesix seveneight nineten.",
                SplitWords::default(),
                5,
            );
        }

        #[test]
        fn foobar() {
            assert_suggestion_result("moreso", SplitWords::default(), "more so");
        }
    }
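To make the probe order above concrete, here is the midpoint arithmetic run on its own; this is a standalone sketch mirroring the loop, not code from the PR. For an 8-character token (midpoint 4), the scan visits split points middle-out rather than left to right:

```rust
fn main() {
    let len = 8; // an 8-character token, midpoint at index 4
    let midpoint = len / 2;
    let order: Vec<usize> = (1..len)
        .map(|i| if i & 1 == 0 { midpoint + i / 2 } else { midpoint - i / 2 })
        .collect();
    // The middle split is tried first, then the scan fans outward.
    assert_eq!(order, vec![4, 5, 3, 6, 2, 7, 1]);
}
```

For `moreso` (length 6) the probe order is 3, 4, 2, 5, 1, so the `more` + `so` split that the `foobar` test expects is found on the second probe.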
I'm a little confused as to why these must be changed?
I was too. A number of the tests are based on the number of lints found, and some unexpected splits result in pairs of "words" that are both in the dictionary. In this case I think it was "soul" and "d" (line 19,421: `d/~5NXGJ`). I think having both this linter and dictionary comments will help us sort out which dictionary entries really shouldn't be there, or add justifications for entries that don't look like words, explaining why they actually belong.

At the same time, the `split_words` logic can be tweaked. It was for things like this that I made it case sensitive: there are many single-letter uppercase "words", and they are often easy to justify. I also thought of not splitting all the way down to single letters, but that would rule out a whole class of common errors, such as `a way`/`away`, from being spotted. I think what we'll get is false positives that will give us insight into both curating the dictionary and tweaking heuristics in the `split_words` logic.
I'm surprised to see such changes too.

Once again, I'm jumping into a discussion I'm trying to follow without being sure of the exact issue this PR tries to resolve. So please be kind if my remarks are totally off topic 😅

The way I understood this PR, it's about using words that are in the dictionary to work out whether a word that is not in the dictionary is made of words that are present in the dictionary. So if someone added `foo` and `bar`, `foobar` and `barfoo` won't be reported as invalid. But if someone added `mergeable`, we may accept `unmergeable`, but apparently maybe also `mergeableun` and some other strange variations, such as `inmergeable`... I mean, unless I'm wrong, it means that from now on, the following words wouldn't be reported as errors? I can get that `misstakes` is made of two real words, `miss` and `takes`. But would `takesmiss` be accepted?

Maybe I simply don't get the scope of this PR and/or this rule. But from my perspective, the split logic should be limited to words that are longer than a few characters, and I would exclude one-letter "words" from it. I mean, having:

- `aadd` as made of `a + add`
- `ork` as made of `or + k`
- `sould` as made of `soul + d`

And maybe 2-letter words (and maybe 3-letter ones) should come from a manually maintained list.

All of this makes me think there is a problem somewhere, either in the logic of this PR or simply in my understanding. Once again, I might be totally out of scope, but seeing that test files were updated because the rule was updated makes me think there is a problem.
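A minimal sketch of that proposal, assuming the split logic gained a per-fragment check; the allowlist contents and cutoffs below are placeholders, not a vetted list:

```rust
// Sketch of the proposed guard: one-letter fragments must come from a tiny
// allowlist, and 2-3 letter fragments from a manually maintained list.
// Everything here is a placeholder assumption, not part of the PR.
const ONE_LETTER_WORDS: &[&str] = &["a", "I"];
const SHORT_WORD_LIST: &[&str] = &["an", "as", "at", "in", "is", "it", "of", "on", "or", "to"];

fn fragment_is_plausible(fragment: &str) -> bool {
    match fragment.chars().count() {
        0 => false,
        1 => ONE_LETTER_WORDS.contains(&fragment),
        2 | 3 => SHORT_WORD_LIST.contains(&fragment),
        _ => true, // longer fragments fall through to the dictionary check
    }
}

fn main() {
    assert!(fragment_is_plausible("a")); // keeps "away" -> "a way" findable
    assert!(!fragment_is_plausible("d")); // blocks "sould" -> "soul" + "d"
    assert!(!fragment_is_plausible("k")); // blocks "ork" -> "or" + "k"
    assert!(fragment_is_plausible("soul"));
}
```

Keeping `a` in the one-letter allowlist would preserve the `a way`/`away` class of errors mentioned above while still blocking splits like `soul` + `d`.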
No no. It detects places where you either missed hitting the spacebar or wrongly think something like `incase` or `everytime` is a standard word. So it would flag `misstakes` and `takesmiss` (which would already be flagged as misspellings) and add suggestions like "did you mean `miss takes`?". It only does these checks for words that are not in the dictionary. It'll find things written as compounds by Germanic-language speakers who forget they're written as two words in English, as well as product names and trademarks like 'vscode' and 'wifi' that people write without a space but that are officially written with a space or hyphen.

The tests it changes are all brittle tests that depend on the number of lints found not changing.
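If that's the intended behaviour, it could be pinned down with tests in the same style as the PR's test module. A hypothetical sketch, assuming the curated dictionary contains `in`, `case`, `every`, and `time`:

```rust
// Hypothetical additions to the PR's test module, using the same helpers
// it already imports (assert_suggestion_result, SplitWords).
#[test]
fn incase() {
    assert_suggestion_result("incase", SplitWords::default(), "in case");
}

#[test]
fn everytime() {
    assert_suggestion_result("everytime", SplitWords::default(), "every time");
}
```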