From 60e037270c4a49178a244544ea64c06333418f28 Mon Sep 17 00:00:00 2001 From: hippietrail Date: Sun, 6 Apr 2025 18:43:28 +0700 Subject: [PATCH 1/8] feat: handle multiple derivations for words in the metadata --- harper-cli/src/main.rs | 33 +++++++++++-- harper-core/src/fat_token.rs | 4 +- harper-core/src/ignored_lints/lint_context.rs | 13 +++++- harper-core/src/spell/fst_dictionary.rs | 31 +++++++------ harper-core/src/spell/mod.rs | 2 +- harper-core/src/spell/rune/attribute_list.rs | 12 +++-- harper-core/src/spell/word_id.rs | 4 ++ harper-core/src/token_kind.rs | 46 ++++++++++++++++++- harper-core/src/word_metadata.rs | 39 ++++++++++++++-- 9 files changed, 156 insertions(+), 28 deletions(-) diff --git a/harper-cli/src/main.rs b/harper-cli/src/main.rs index c7706c04f..61a1b7fbb 100644 --- a/harper-cli/src/main.rs +++ b/harper-cli/src/main.rs @@ -76,6 +76,8 @@ enum Args { /// The document to mine words from. file: PathBuf, }, + /// Get the word associated with a particular word id. + WordFromId { hash: u64 }, } fn main() -> anyhow::Result<()> { @@ -227,6 +229,21 @@ fn main() -> anyhow::Result<()> { println!("{json}"); + // iterate through any and all derived_from and resolve the word from each wordid + if let Some(metadata) = dictionary.get_word_metadata_str(&word) { + if let Some(derived_from) = &metadata.derived_from { + let derived_words: Vec = derived_from + .iter() + .filter_map(|wordid| dictionary.get_word_from_id(wordid)) + .map(|word| word.iter().collect()) + .collect(); + + if !derived_words.is_empty() { + println!("derived_from: {:?}", derived_words); + } + } + } + Ok(()) } Args::SummarizeLintRecord { file } => { @@ -361,6 +378,11 @@ fn main() -> anyhow::Result<()> { Ok(()) } + Args::WordFromId { hash } => { + let id = WordId::from_hash(hash); + println!("{:?}", dictionary.get_word_from_id(&id)); + Ok(()) + } } } @@ -402,9 +424,14 @@ fn print_word_derivations(word: &str, annot: &str, dictionary: &impl Dictionary) let id = WordId::from_word_str(word); - let children = dictionary - .words_iter() - .filter(|e| dictionary.get_word_metadata(e).unwrap().derived_from == Some(id)); + let children = dictionary.words_iter().filter(|e| { + dictionary + .get_word_metadata(e) + .unwrap() + .derived_from + .as_ref() + .is_some_and(|derived| derived.contains(&id)) + }); println!(" - {}", word); diff --git a/harper-core/src/fat_token.rs b/harper-core/src/fat_token.rs index 00770ac19..08caebbef 100644 --- a/harper-core/src/fat_token.rs +++ b/harper-core/src/fat_token.rs @@ -4,7 +4,7 @@ use crate::{CharStringExt, TokenKind}; /// A [`Token`](crate::Token) that holds its content as a fat [`Vec`] rather than as a /// [`Span`](crate::Span). -#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, PartialOrd, Hash, Eq)] +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Hash, Eq)] pub struct FatToken { pub content: Vec, pub kind: TokenKind, @@ -20,7 +20,7 @@ impl From for FatToken { } /// Similar to a [`FatToken`], but uses a [`String`] as the underlying store. -#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, PartialOrd, Hash, Eq)] +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] pub struct FatStringToken { pub content: String, pub kind: TokenKind, diff --git a/harper-core/src/ignored_lints/lint_context.rs b/harper-core/src/ignored_lints/lint_context.rs index f502a44b0..2cb629611 100644 --- a/harper-core/src/ignored_lints/lint_context.rs +++ b/harper-core/src/ignored_lints/lint_context.rs @@ -1,4 +1,5 @@ use serde::{Deserialize, Serialize}; +use std::hash::{Hash, Hasher}; use crate::{ Document, FatToken, @@ -7,7 +8,7 @@ use crate::{ /// A location-agnostic structure that attempts to captures the context and content that a [`Lint`] /// occurred. -#[derive(Debug, Hash, Serialize, Deserialize)] +#[derive(Debug, Serialize, Deserialize)] pub struct LintContext { pub lint_kind: LintKind, pub suggestions: Vec, @@ -16,6 +17,16 @@ pub struct LintContext { pub tokens: Vec, } +impl Hash for LintContext { + fn hash(&self, state: &mut H) { + self.lint_kind.hash(state); + self.suggestions.hash(state); + self.message.hash(state); + self.priority.hash(state); + self.tokens.hash(state); + } +} + impl LintContext { pub fn from_lint(lint: &Lint, document: &Document) -> Self { let Lint { diff --git a/harper-core/src/spell/fst_dictionary.rs b/harper-core/src/spell/fst_dictionary.rs index 947e39902..b07205c1c 100644 --- a/harper-core/src/spell/fst_dictionary.rs +++ b/harper-core/src/spell/fst_dictionary.rs @@ -307,26 +307,27 @@ mod tests { #[test] fn plural_llamas_derived_from_llama() { let dict = FstDictionary::curated(); - - assert_eq!( + assert!( dict.get_word_metadata_str("llamas") .unwrap() .derived_from - .unwrap(), - WordId::from_word_str("llama") - ) + .as_ref() + .unwrap() + .contains(&WordId::from_word_str("llama")) + ); } #[test] fn plural_cats_derived_from_cat() { let dict = FstDictionary::curated(); - assert_eq!( + assert!( dict.get_word_metadata_str("cats") .unwrap() .derived_from - .unwrap(), - WordId::from_word_str("cat") + .as_ref() + .unwrap() + .contains(&WordId::from_word_str("cat")) ); } @@ -334,12 +335,13 @@ mod tests { fn unhappy_derived_from_happy() { let dict = FstDictionary::curated(); - assert_eq!( + assert!( dict.get_word_metadata_str("unhappy") .unwrap() .derived_from - .unwrap(), - WordId::from_word_str("happy") + .as_ref() + .unwrap() + .contains(&WordId::from_word_str("happy")) ); } @@ -347,12 +349,13 @@ mod tests { fn quickly_derived_from_quick() { let dict = FstDictionary::curated(); - assert_eq!( + assert!( dict.get_word_metadata_str("quickly") .unwrap() .derived_from - .unwrap(), - WordId::from_word_str("quick") + .as_ref() + .unwrap() + .contains(&WordId::from_word_str("quick")) ); } } diff --git a/harper-core/src/spell/mod.rs b/harper-core/src/spell/mod.rs index dd96db310..a5658e55d 100644 --- a/harper-core/src/spell/mod.rs +++ b/harper-core/src/spell/mod.rs @@ -14,7 +14,7 @@ mod rune; mod word_id; mod word_map; -#[derive(PartialEq, Debug, Hash, Eq)] +#[derive(PartialEq, Debug, Eq)] pub struct FuzzyMatchResult<'a> { pub word: &'a [char], pub edit_distance: u8, diff --git a/harper-core/src/spell/rune/attribute_list.rs b/harper-core/src/spell/rune/attribute_list.rs index 4e5686408..d288ac856 100644 --- a/harper-core/src/spell/rune/attribute_list.rs +++ b/harper-core/src/spell/rune/attribute_list.rs @@ -1,4 +1,4 @@ -use hashbrown::HashMap; +use hashbrown::{HashMap, HashSet}; use serde::{Deserialize, Serialize}; use smallvec::ToSmallVec; @@ -89,11 +89,17 @@ impl AttributeList { ); let t_metadata = dest.get_metadata_mut_chars(&new_word).unwrap(); t_metadata.append(&metadata); - t_metadata.derived_from = Some(WordId::from_word_chars(&word.letters)) + t_metadata + .derived_from + .get_or_insert_with(HashSet::new) + .insert(WordId::from_word_chars(&word.letters)); } } else { for (key, mut value) in new_words.into_iter() { - value.derived_from = Some(WordId::from_word_chars(&word.letters)); + value + .derived_from + .get_or_insert_with(HashSet::new) + .insert(WordId::from_word_chars(&word.letters)); if let Some(val) = dest.get_metadata_mut_chars(&key) { val.append(&value); diff --git a/harper-core/src/spell/word_id.rs b/harper-core/src/spell/word_id.rs index 9140a3911..9780ed089 100644 --- a/harper-core/src/spell/word_id.rs +++ b/harper-core/src/spell/word_id.rs @@ -31,4 +31,8 @@ impl WordId { let chars: CharString = text.as_ref().chars().collect(); Self::from_word_chars(chars) } + + pub fn from_hash(hash: u64) -> Self { + Self { hash } + } } diff --git a/harper-core/src/token_kind.rs b/harper-core/src/token_kind.rs index 2e1ab766f..5803b0330 100644 --- a/harper-core/src/token_kind.rs +++ b/harper-core/src/token_kind.rs @@ -2,8 +2,9 @@ use is_macro::Is; use serde::{Deserialize, Serialize}; use crate::{ConjunctionData, NounData, Number, PronounData, Punctuation, Quote, WordMetadata}; +use std::hash::{Hash, Hasher}; -#[derive(Debug, Is, Clone, Serialize, Deserialize, Default, PartialOrd, Hash, Eq, PartialEq)] +#[derive(Debug, Is, Clone, Serialize, Deserialize, Default, Eq, PartialEq)] #[serde(tag = "kind", content = "value")] pub enum TokenKind { /// `None` if the word does not exist in the dictionary. @@ -26,6 +27,49 @@ pub enum TokenKind { Regexish, } +impl Hash for TokenKind { + fn hash(&self, state: &mut H) { + match self { + TokenKind::Word(metadata) => { + metadata.hash(state); + } + TokenKind::Punctuation(punct) => { + punct.hash(state); + } + TokenKind::Decade => { + 0.hash(state); + } + TokenKind::Number(number) => { + number.hash(state); + } + TokenKind::Space(space) => { + space.hash(state); + } + TokenKind::Newline(newline) => { + newline.hash(state); + } + TokenKind::EmailAddress => { + 0.hash(state); + } + TokenKind::Url => { + 0.hash(state); + } + TokenKind::Hostname => { + 0.hash(state); + } + TokenKind::Unlintable => { + 0.hash(state); + } + TokenKind::ParagraphBreak => { + 0.hash(state); + } + TokenKind::Regexish => { + 0.hash(state); + } + } + } +} + impl TokenKind { pub fn is_open_square(&self) -> bool { matches!(self, TokenKind::Punctuation(Punctuation::OpenSquare)) diff --git a/harper-core/src/word_metadata.rs b/harper-core/src/word_metadata.rs index c8182784a..9aecc24c1 100644 --- a/harper-core/src/word_metadata.rs +++ b/harper-core/src/word_metadata.rs @@ -1,3 +1,6 @@ +use hashbrown::HashSet; +use std::hash::{Hash, Hasher}; + use is_macro::Is; use paste::paste; use serde::{Deserialize, Serialize}; @@ -5,7 +8,7 @@ use strum_macros::{Display, EnumString}; use crate::WordId; -#[derive(Debug, Default, Clone, PartialEq, Eq, Serialize, Deserialize, PartialOrd, Hash)] +#[derive(Debug, Default, Clone, PartialEq, Eq, Serialize, Deserialize)] pub struct WordMetadata { pub noun: Option, pub pronoun: Option, @@ -28,7 +31,28 @@ pub struct WordMetadata { #[serde(default = "default_false")] pub common: bool, #[serde(default = "default_none")] - pub derived_from: Option, + pub derived_from: Option>, +} + +impl Hash for WordMetadata { + fn hash(&self, state: &mut H) { + self.noun.hash(state); + self.pronoun.hash(state); + self.verb.hash(state); + self.adjective.hash(state); + self.adverb.hash(state); + self.conjunction.hash(state); + self.swear.hash(state); + self.dialect.hash(state); + self.determiner.hash(state); + self.preposition.hash(state); + self.common.hash(state); + if let Some(ref derived_from) = self.derived_from { + for id in derived_from.iter() { + id.hash(state); + } + } + } } /// Needed for `serde` @@ -111,7 +135,16 @@ impl WordMetadata { determiner: self.determiner || other.determiner, preposition: self.preposition || other.preposition, common: self.common || other.common, - derived_from: self.derived_from.or(other.derived_from), + derived_from: match (&self.derived_from, &other.derived_from) { + (Some(a), Some(b)) => { + let mut set = a.clone(); + set.extend(b); + Some(set) + } + (Some(a), None) => Some(a.clone()), + (None, Some(b)) => Some(b.clone()), + (None, None) => None, + }, } } From 123fd7f34cd08cea091128dad77ddd705a039c6c Mon Sep 17 00:00:00 2001 From: hippietrail Date: Wed, 30 Jul 2025 10:57:41 +0900 Subject: [PATCH 2/8] chore: merging old branch bit by bit --- harper-core/benches/parse_demo.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/harper-core/benches/parse_demo.rs b/harper-core/benches/parse_demo.rs index 6298590e5..6339cc529 100644 --- a/harper-core/benches/parse_demo.rs +++ b/harper-core/benches/parse_demo.rs @@ -1,6 +1,6 @@ use criterion::{Criterion, black_box, criterion_group, criterion_main}; use harper_core::linting::{LintGroup, Linter}; -use harper_core::{Document, FstDictionary}; +use harper_core::{Dialect, Document, FstDictionary}; static ESSAY: &str = include_str!("./essay.md"); @@ -12,7 +12,7 @@ fn parse_essay(c: &mut Criterion) { fn lint_essay(c: &mut Criterion) { let dictionary = FstDictionary::curated(); - let mut lint_set = LintGroup::new_curated(dictionary); + let mut lint_set = LintGroup::new_curated(dictionary, Dialect::American); let document = Document::new_markdown_default_curated(black_box(ESSAY)); c.bench_function("lint_essay", |b| { @@ -24,7 +24,7 @@ fn lint_essay_uncached(c: &mut Criterion) { c.bench_function("lint_essay_uncached", |b| { b.iter(|| { let dictionary = FstDictionary::curated(); - let mut lint_set = LintGroup::new_curated(dictionary.clone()); + let mut lint_set = LintGroup::new_curated(dictionary.clone(), Dialect::American); let document = Document::new_markdown_default(black_box(ESSAY), &dictionary); lint_set.lint(&document) }) From d852ccf2a021b83d1056ee6e9d7464553b2e79d9 Mon Sep 17 00:00:00 2001 From: hippietrail Date: Wed, 30 Jul 2025 11:10:32 +0900 Subject: [PATCH 3/8] chore: merge old pr branch bit by bit --- harper-core/src/linting/ask_no_preposition.rs | 2 +- harper-core/src/linting/else_possessive.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/harper-core/src/linting/ask_no_preposition.rs b/harper-core/src/linting/ask_no_preposition.rs index 1db5676d5..dce4e6bd5 100644 --- a/harper-core/src/linting/ask_no_preposition.rs +++ b/harper-core/src/linting/ask_no_preposition.rs @@ -1,5 +1,5 @@ use crate::{ - Span, Token, TokenStringExt, + Span, Token, linting::{Lint, LintKind, PatternLinter, Suggestion}, patterns::{Pattern, SequencePattern, WordSet}, }; diff --git a/harper-core/src/linting/else_possessive.rs b/harper-core/src/linting/else_possessive.rs index f86285a42..475b2354d 100644 --- a/harper-core/src/linting/else_possessive.rs +++ b/harper-core/src/linting/else_possessive.rs @@ -1,5 +1,5 @@ use crate::{ - Span, Token, TokenStringExt, + Token, linting::{Lint, LintKind, PatternLinter, Suggestion}, patterns::{Pattern, SequencePattern, WordSet}, }; From cfde7e63dedd2561777cc68b4bd53a921f40bf62 Mon Sep 17 00:00:00 2001 From: hippietrail Date: Wed, 30 Jul 2025 12:14:48 +0900 Subject: [PATCH 4/8] chore: merge old pr branch bit by bit --- harper-core/src/ignored_lints/lint_context.rs | 1 - harper-core/src/word_metadata.rs | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/harper-core/src/ignored_lints/lint_context.rs b/harper-core/src/ignored_lints/lint_context.rs index 10051a38a..6adfad83e 100644 --- a/harper-core/src/ignored_lints/lint_context.rs +++ b/harper-core/src/ignored_lints/lint_context.rs @@ -1,7 +1,6 @@ use std::hash::{DefaultHasher, Hash, Hasher}; use serde::{Deserialize, Serialize}; -use std::hash::{Hash, Hasher}; use crate::{ Document, FatToken, diff --git a/harper-core/src/word_metadata.rs b/harper-core/src/word_metadata.rs index 7916cc156..4a15383cc 100644 --- a/harper-core/src/word_metadata.rs +++ b/harper-core/src/word_metadata.rs @@ -46,7 +46,7 @@ impl Hash for WordMetadata { self.adverb.hash(state); self.conjunction.hash(state); self.swear.hash(state); - self.dialect.hash(state); + self.dialects.hash(state); self.determiner.hash(state); self.preposition.hash(state); self.common.hash(state); From df134446460c799ef32a1072bb1cd77ed21e7bed Mon Sep 17 00:00:00 2001 From: hippietrail Date: Wed, 30 Jul 2025 12:32:17 +0900 Subject: [PATCH 5/8] chore: merge old pr branch bit by bit --- harper-cli/src/main.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/harper-cli/src/main.rs b/harper-cli/src/main.rs index 06b0c0a5b..031705072 100644 --- a/harper-cli/src/main.rs +++ b/harper-cli/src/main.rs @@ -298,7 +298,7 @@ fn main() -> anyhow::Result<()> { .collect(); if !derived_words.is_empty() { - println!("derived_from: {:?}", derived_words); + println!("derived_from: {derived_words:?}"); } } } From f553e04874cf4f22475d58caf46c8a61334fb55b Mon Sep 17 00:00:00 2001 From: hippietrail Date: Wed, 30 Jul 2025 13:20:34 +0900 Subject: [PATCH 6/8] chore: merge old pr branch bit by bit --- harper-core/src/word_metadata.rs | 56 ++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/harper-core/src/word_metadata.rs b/harper-core/src/word_metadata.rs index 771331b86..b5808dd6e 100644 --- a/harper-core/src/word_metadata.rs +++ b/harper-core/src/word_metadata.rs @@ -555,6 +555,62 @@ impl WordMetadata { } } +impl PartialOrd for WordMetadata { + fn partial_cmp(&self, other: &Self) -> Option { + // Compare each field in order until we find a difference + match self.noun.partial_cmp(&other.noun) { + Some(std::cmp::Ordering::Equal) => {} + non_eq => return non_eq, + } + match self.pronoun.partial_cmp(&other.pronoun) { + Some(std::cmp::Ordering::Equal) => {} + non_eq => return non_eq, + } + match self.verb.partial_cmp(&other.verb) { + Some(std::cmp::Ordering::Equal) => {} + non_eq => return non_eq, + } + match self.adjective.partial_cmp(&other.adjective) { + Some(std::cmp::Ordering::Equal) => {} + non_eq => return non_eq, + } + match self.adverb.partial_cmp(&other.adverb) { + Some(std::cmp::Ordering::Equal) => {} + non_eq => return non_eq, + } + match self.conjunction.partial_cmp(&other.conjunction) { + Some(std::cmp::Ordering::Equal) => {} + non_eq => return non_eq, + } + match self.swear.partial_cmp(&other.swear) { + Some(std::cmp::Ordering::Equal) => {} + non_eq => return non_eq, + } + match self.dialects.partial_cmp(&other.dialects) { + Some(std::cmp::Ordering::Equal) => {} + non_eq => return non_eq, + } + match self.determiner.partial_cmp(&other.determiner) { + Some(std::cmp::Ordering::Equal) => {} + non_eq => return non_eq, + } + match self.preposition.partial_cmp(&other.preposition) { + Some(std::cmp::Ordering::Equal) => {} + non_eq => return non_eq, + } + match self.common.partial_cmp(&other.common) { + Some(std::cmp::Ordering::Equal) => {} + non_eq => return non_eq, + } + // Skip derived_from as HashSet doesn't implement PartialOrd + match self.np_member.partial_cmp(&other.np_member) { + Some(std::cmp::Ordering::Equal) => {} + non_eq => return non_eq, + } + self.pos_tag.partial_cmp(&other.pos_tag) + } +} + // These verb forms are morphological variations, distinct from TAM (Tense-Aspect-Mood) // Each form can be used in various TAM combinations: // - Lemma form (infinitive, citation form, dictionary form) From 9403b1d77f8d09700a94873a9fa96815876e7bd9 Mon Sep 17 00:00:00 2001 From: hippietrail Date: Tue, 12 Aug 2025 14:19:00 +0900 Subject: [PATCH 7/8] fix: appease precommit --- harper-cli/src/main.rs | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/harper-cli/src/main.rs b/harper-cli/src/main.rs index 0193b21db..8f7ee933e 100644 --- a/harper-cli/src/main.rs +++ b/harper-cli/src/main.rs @@ -370,17 +370,17 @@ fn main() -> anyhow::Result<()> { println!("{json}"); // iterate through any and all derived_from and resolve the word from each wordid - if let Some(metadata) = dictionary.get_word_metadata_str(&word) { - if let Some(derived_from) = &metadata.derived_from { - let derived_words: Vec = derived_from - .iter() - .filter_map(|wordid| dictionary.get_word_from_id(wordid)) - .map(|word| word.iter().collect()) - .collect(); - - if !derived_words.is_empty() { - println!("derived_from: {derived_words:?}"); - } + if let Some(metadata) = dictionary.get_word_metadata_str(&word) + && let Some(derived_from) = &metadata.derived_from + { + let derived_words: Vec = derived_from + .iter() + .filter_map(|wordid| dictionary.get_word_from_id(wordid)) + .map(|word| word.iter().collect()) + .collect(); + + if !derived_words.is_empty() { + println!("derived_from: {derived_words:?}"); } } From f8f5c55d55f3b79f96fbea138ae6c66b74040cf2 Mon Sep 17 00:00:00 2001 From: hippietrail Date: Wed, 3 Sep 2025 03:22:52 +0900 Subject: [PATCH 8/8] chore: merge with upstream --- harper-cli/src/main.rs | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/harper-cli/src/main.rs b/harper-cli/src/main.rs index b1c908bd6..ab072480a 100644 --- a/harper-cli/src/main.rs +++ b/harper-cli/src/main.rs @@ -368,7 +368,7 @@ fn main() -> anyhow::Result<()> { for word in words { let metadata = dictionary.get_word_metadata_str(&word); let mut metadata_value = serde_json::to_value(metadata).unwrap_or_default(); - + // If there are derived words, add them to the metadata if let Some(metadata) = dictionary.get_word_metadata_str(&word) && let Some(derived_from) = &metadata.derived_from @@ -379,13 +379,16 @@ fn main() -> anyhow::Result<()> { .map(|word| word.iter().collect()) .collect(); - if !derived_words.is_empty() { - if let Some(obj) = metadata_value.as_object_mut() { - obj.insert("derived_from_words".to_string(), serde_json::json!(derived_words)); - } + if !derived_words.is_empty() + && let Some(obj) = metadata_value.as_object_mut() + { + obj.insert( + "derived_from_words".to_string(), + serde_json::json!(derived_words), + ); } } - + results.insert(word, metadata_value); } let json = serde_json::to_string_pretty(&results).unwrap();