diff --git a/harper-cli/src/main.rs b/harper-cli/src/main.rs index d6791dec8..16cbe23d8 100644 --- a/harper-cli/src/main.rs +++ b/harper-cli/src/main.rs @@ -103,6 +103,8 @@ enum Args { /// The document to mine words from. file: PathBuf, }, + /// Get the word associated with a particular word id. + WordFromId { hash: u64 }, #[cfg(feature = "training")] TrainBrillTagger { #[arg(short, long, default_value = "1.0")] @@ -368,7 +370,29 @@ fn main() -> anyhow::Result<()> { let mut results = BTreeMap::new(); for word in words { let metadata = dictionary.get_word_metadata_str(&word); - results.insert(word, metadata); + let mut metadata_value = serde_json::to_value(metadata).unwrap_or_default(); + + // If there are derived words, add them to the metadata + if let Some(metadata) = dictionary.get_word_metadata_str(&word) + && let Some(derived_from) = &metadata.derived_from + { + let derived_words: Vec = derived_from + .iter() + .filter_map(|wordid| dictionary.get_word_from_id(wordid)) + .map(|word| word.iter().collect()) + .collect(); + + if !derived_words.is_empty() + && let Some(obj) = metadata_value.as_object_mut() + { + obj.insert( + "derived_from_words".to_string(), + serde_json::json!(derived_words), + ); + } + } + + results.insert(word, metadata_value); } let json = serde_json::to_string_pretty(&results).unwrap(); println!("{json}"); @@ -502,6 +526,11 @@ fn main() -> anyhow::Result<()> { Ok(()) } + Args::WordFromId { hash } => { + let id = WordId::from_hash(hash); + println!("{:?}", dictionary.get_word_from_id(&id)); + Ok(()) + } Args::CoreVersion => { println!("harper-core v{}", harper_core::core_version()); Ok(()) @@ -853,9 +882,14 @@ fn print_word_derivations(word: &str, annot: &str, dictionary: &impl Dictionary) let id = WordId::from_word_str(word); - let children = dictionary - .words_iter() - .filter(|e| dictionary.get_word_metadata(e).unwrap().derived_from == Some(id)); + let children = dictionary.words_iter().filter(|e| { + dictionary + .get_word_metadata(e) + .unwrap() + .derived_from + .as_ref() + .is_some_and(|derived| derived.contains(&id)) + }); println!(" - {word}"); diff --git a/harper-core/src/fat_token.rs b/harper-core/src/fat_token.rs index 00770ac19..08caebbef 100644 --- a/harper-core/src/fat_token.rs +++ b/harper-core/src/fat_token.rs @@ -4,7 +4,7 @@ use crate::{CharStringExt, TokenKind}; /// A [`Token`](crate::Token) that holds its content as a fat [`Vec`] rather than as a /// [`Span`](crate::Span). -#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, PartialOrd, Hash, Eq)] +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Hash, Eq)] pub struct FatToken { pub content: Vec, pub kind: TokenKind, @@ -20,7 +20,7 @@ impl From for FatToken { } /// Similar to a [`FatToken`], but uses a [`String`] as the underlying store. -#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, PartialOrd, Hash, Eq)] +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] pub struct FatStringToken { pub content: String, pub kind: TokenKind, diff --git a/harper-core/src/ignored_lints/lint_context.rs b/harper-core/src/ignored_lints/lint_context.rs index 465cae681..6adfad83e 100644 --- a/harper-core/src/ignored_lints/lint_context.rs +++ b/harper-core/src/ignored_lints/lint_context.rs @@ -9,7 +9,7 @@ use crate::{ /// A location-agnostic structure that attempts to captures the context and content that a [`Lint`] /// occurred. -#[derive(Debug, Hash, Serialize, Deserialize)] +#[derive(Debug, Serialize, Deserialize)] pub struct LintContext { pub lint_kind: LintKind, pub suggestions: Vec, @@ -18,6 +18,16 @@ pub struct LintContext { pub tokens: Vec, } +impl Hash for LintContext { + fn hash(&self, state: &mut H) { + self.lint_kind.hash(state); + self.suggestions.hash(state); + self.message.hash(state); + self.priority.hash(state); + self.tokens.hash(state); + } +} + impl LintContext { pub fn from_lint(lint: &Lint, document: &Document) -> Self { let Lint { diff --git a/harper-core/src/spell/fst_dictionary.rs b/harper-core/src/spell/fst_dictionary.rs index a558deb4b..080320402 100644 --- a/harper-core/src/spell/fst_dictionary.rs +++ b/harper-core/src/spell/fst_dictionary.rs @@ -306,26 +306,27 @@ mod tests { #[test] fn plural_llamas_derived_from_llama() { let dict = FstDictionary::curated(); - - assert_eq!( + assert!( dict.get_word_metadata_str("llamas") .unwrap() .derived_from - .unwrap(), - WordId::from_word_str("llama") - ) + .as_ref() + .unwrap() + .contains(&WordId::from_word_str("llama")) + ); } #[test] fn plural_cats_derived_from_cat() { let dict = FstDictionary::curated(); - assert_eq!( + assert!( dict.get_word_metadata_str("cats") .unwrap() .derived_from - .unwrap(), - WordId::from_word_str("cat") + .as_ref() + .unwrap() + .contains(&WordId::from_word_str("cat")) ); } @@ -333,12 +334,13 @@ mod tests { fn unhappy_derived_from_happy() { let dict = FstDictionary::curated(); - assert_eq!( + assert!( dict.get_word_metadata_str("unhappy") .unwrap() .derived_from - .unwrap(), - WordId::from_word_str("happy") + .as_ref() + .unwrap() + .contains(&WordId::from_word_str("happy")) ); } @@ -346,12 +348,13 @@ mod tests { fn quickly_derived_from_quick() { let dict = FstDictionary::curated(); - assert_eq!( + assert!( dict.get_word_metadata_str("quickly") .unwrap() .derived_from - .unwrap(), - WordId::from_word_str("quick") + .as_ref() + .unwrap() + .contains(&WordId::from_word_str("quick")) ); } } diff --git a/harper-core/src/spell/mod.rs b/harper-core/src/spell/mod.rs index 45c4f64e1..44999b7b4 100644 --- a/harper-core/src/spell/mod.rs +++ b/harper-core/src/spell/mod.rs @@ -17,7 +17,7 @@ mod rune; mod word_id; mod word_map; -#[derive(PartialEq, Debug, Hash, Eq)] +#[derive(PartialEq, Debug, Eq)] pub struct FuzzyMatchResult<'a> { pub word: &'a [char], pub edit_distance: u8, diff --git a/harper-core/src/spell/rune/attribute_list.rs b/harper-core/src/spell/rune/attribute_list.rs index ee65d7ec4..e5eb19ab6 100644 --- a/harper-core/src/spell/rune/attribute_list.rs +++ b/harper-core/src/spell/rune/attribute_list.rs @@ -1,4 +1,4 @@ -use hashbrown::HashMap; +use hashbrown::{HashMap, HashSet}; use serde::{Deserialize, Serialize}; use smallvec::ToSmallVec; @@ -124,11 +124,17 @@ impl AttributeList { ); let t_metadata = dest.get_metadata_mut_chars(&new_word).unwrap(); t_metadata.append(&metadata); - t_metadata.derived_from = Some(WordId::from_word_chars(&word.letters)) + t_metadata + .derived_from + .get_or_insert_with(HashSet::new) + .insert(WordId::from_word_chars(&word.letters)); } } else { for (key, mut value) in new_words.into_iter() { - value.derived_from = Some(WordId::from_word_chars(&word.letters)); + value + .derived_from + .get_or_insert_with(HashSet::new) + .insert(WordId::from_word_chars(&word.letters)); if let Some(val) = dest.get_metadata_mut_chars(&key) { val.append(&value); diff --git a/harper-core/src/spell/word_id.rs b/harper-core/src/spell/word_id.rs index 9140a3911..9780ed089 100644 --- a/harper-core/src/spell/word_id.rs +++ b/harper-core/src/spell/word_id.rs @@ -31,4 +31,8 @@ impl WordId { let chars: CharString = text.as_ref().chars().collect(); Self::from_word_chars(chars) } + + pub fn from_hash(hash: u64) -> Self { + Self { hash } + } } diff --git a/harper-core/src/token_kind.rs b/harper-core/src/token_kind.rs index 76615bbf3..f2e989b6f 100644 --- a/harper-core/src/token_kind.rs +++ b/harper-core/src/token_kind.rs @@ -2,7 +2,9 @@ use harper_brill::UPOS; use is_macro::Is; use serde::{Deserialize, Serialize}; -use crate::{Number, Punctuation, Quote, TokenKind::Word, WordMetadata}; +use crate::TokenKind::Word; +use crate::{Number, Punctuation, Quote, WordMetadata}; +use std::hash::{Hash, Hasher}; /// Generate wrapper code to pass a function call to the inner [`WordMetadata`], /// if the token is indeed a word, while also emitting method-level documentation. @@ -29,7 +31,7 @@ macro_rules! delegate_to_metadata { /// Has a variety of queries available. /// If there is a query missing, it may be easy to implement by just calling the /// `delegate_to_metadata` macro. -#[derive(Debug, Is, Clone, Serialize, Deserialize, Default, PartialOrd, Hash, Eq, PartialEq)] +#[derive(Debug, Is, Clone, Serialize, Deserialize, Default, PartialOrd, Eq, PartialEq)] #[serde(tag = "kind", content = "value")] pub enum TokenKind { /// `None` if the word does not exist in the dictionary. @@ -52,6 +54,49 @@ pub enum TokenKind { Regexish, } +impl Hash for TokenKind { + fn hash(&self, state: &mut H) { + match self { + TokenKind::Word(metadata) => { + metadata.hash(state); + } + TokenKind::Punctuation(punct) => { + punct.hash(state); + } + TokenKind::Decade => { + 0.hash(state); + } + TokenKind::Number(number) => { + number.hash(state); + } + TokenKind::Space(space) => { + space.hash(state); + } + TokenKind::Newline(newline) => { + newline.hash(state); + } + TokenKind::EmailAddress => { + 0.hash(state); + } + TokenKind::Url => { + 0.hash(state); + } + TokenKind::Hostname => { + 0.hash(state); + } + TokenKind::Unlintable => { + 0.hash(state); + } + TokenKind::ParagraphBreak => { + 0.hash(state); + } + TokenKind::Regexish => { + 0.hash(state); + } + } + } +} + impl TokenKind { // Word metadata delegation methods grouped by part of speech delegate_to_metadata! { diff --git a/harper-core/src/word_metadata.rs b/harper-core/src/word_metadata.rs index 36603859b..8cd7125d9 100644 --- a/harper-core/src/word_metadata.rs +++ b/harper-core/src/word_metadata.rs @@ -1,3 +1,6 @@ +use hashbrown::HashSet; +use std::hash::{Hash, Hasher}; + use harper_brill::UPOS; use is_macro::Is; use itertools::Itertools; @@ -16,7 +19,7 @@ use crate::{Document, TokenKind, TokenStringExt}; /// This represents a "lexeme" or "headword" which is case-folded but affix-expanded. /// So not only lemmata but also inflected forms are stored here, with "horn" and "horns" each /// having their own lexeme, but "Ivy" and "ivy" share the same lexeme. -#[derive(Debug, Default, Clone, PartialEq, Eq, Serialize, Deserialize, PartialOrd, Hash)] +#[derive(Debug, Default, Clone, PartialEq, Eq, Serialize, Deserialize)] pub struct WordMetadata { pub noun: Option, pub pronoun: Option, @@ -42,13 +45,34 @@ pub struct WordMetadata { #[serde(default = "default_false")] pub common: bool, #[serde(default = "default_none")] - pub derived_from: Option, + pub derived_from: Option>, /// Generated by a chunker pub np_member: Option, /// Generated by a POS tagger pub pos_tag: Option, } +impl Hash for WordMetadata { + fn hash(&self, state: &mut H) { + self.noun.hash(state); + self.pronoun.hash(state); + self.verb.hash(state); + self.adjective.hash(state); + self.adverb.hash(state); + self.conjunction.hash(state); + self.swear.hash(state); + self.dialects.hash(state); + self.determiner.hash(state); + self.preposition.hash(state); + self.common.hash(state); + if let Some(ref derived_from) = self.derived_from { + for id in derived_from.iter() { + id.hash(state); + } + } + } +} + /// Needed for `serde` fn default_false() -> bool { false @@ -192,7 +216,16 @@ impl WordMetadata { determiner: merge!(self.determiner, other.determiner), preposition: self.preposition || other.preposition, common: self.common || other.common, - derived_from: self.derived_from.or(other.derived_from), + derived_from: match (&self.derived_from, &other.derived_from) { + (Some(a), Some(b)) => { + let mut set = a.clone(); + set.extend(b); + Some(set) + } + (Some(a), None) => Some(a.clone()), + (None, Some(b)) => Some(b.clone()), + (None, None) => None, + }, pos_tag: self.pos_tag.or(other.pos_tag), np_member: self.np_member.or(other.np_member), } @@ -732,6 +765,62 @@ impl WordMetadata { } } +impl PartialOrd for WordMetadata { + fn partial_cmp(&self, other: &Self) -> Option { + // Compare each field in order until we find a difference + match self.noun.partial_cmp(&other.noun) { + Some(std::cmp::Ordering::Equal) => {} + non_eq => return non_eq, + } + match self.pronoun.partial_cmp(&other.pronoun) { + Some(std::cmp::Ordering::Equal) => {} + non_eq => return non_eq, + } + match self.verb.partial_cmp(&other.verb) { + Some(std::cmp::Ordering::Equal) => {} + non_eq => return non_eq, + } + match self.adjective.partial_cmp(&other.adjective) { + Some(std::cmp::Ordering::Equal) => {} + non_eq => return non_eq, + } + match self.adverb.partial_cmp(&other.adverb) { + Some(std::cmp::Ordering::Equal) => {} + non_eq => return non_eq, + } + match self.conjunction.partial_cmp(&other.conjunction) { + Some(std::cmp::Ordering::Equal) => {} + non_eq => return non_eq, + } + match self.swear.partial_cmp(&other.swear) { + Some(std::cmp::Ordering::Equal) => {} + non_eq => return non_eq, + } + match self.dialects.partial_cmp(&other.dialects) { + Some(std::cmp::Ordering::Equal) => {} + non_eq => return non_eq, + } + match self.determiner.partial_cmp(&other.determiner) { + Some(std::cmp::Ordering::Equal) => {} + non_eq => return non_eq, + } + match self.preposition.partial_cmp(&other.preposition) { + Some(std::cmp::Ordering::Equal) => {} + non_eq => return non_eq, + } + match self.common.partial_cmp(&other.common) { + Some(std::cmp::Ordering::Equal) => {} + non_eq => return non_eq, + } + // Skip derived_from as HashSet doesn't implement PartialOrd + match self.np_member.partial_cmp(&other.np_member) { + Some(std::cmp::Ordering::Equal) => {} + non_eq => return non_eq, + } + self.pos_tag.partial_cmp(&other.pos_tag) + } +} + // These verb forms are morphological variations, distinct from TAM (Tense-Aspect-Mood) // Each form can be used in various TAM combinations: // - Lemma form (infinitive, citation form, dictionary form)