Automattic · elijah-potter · Sep 8, 2025 · Sep 2, 2025 · Sep 3, 2025 · Sep 4, 2025
diff --git a/harper-core/src/spell/rune/attribute_list.rs b/harper-core/src/spell/rune/attribute_list.rs
@@ -14,7 +14,7 @@ use super::expansion::{
 use super::word_list::MarkedWord;
 use crate::spell::WordId;
 use crate::word_metadata_orthography::OrthFlags;
-use crate::{CharString, Span, WordMetadata};
+use crate::{CharString, CharStringExt, Span, WordMetadata};
 
 #[derive(Debug, Clone)]
 pub struct AttributeList {
@@ -333,9 +333,100 @@ fn check_orthography(word: &MarkedWord) -> OrthFlags {
         }
     }
 
+    if looks_like_roman_numerals(&word.letters)
+        && is_really_roman_numerals(&word.letters.to_lower())
+    {
+        ortho_flags |= OrthFlags::ROMAN_NUMERALS;
+    }
+
     ortho_flags
 }
 
+/// Cheap pre-check: is `word` non-empty, made up only of the letters M, D, C, L, X, V and I,
+/// and written in a single case? It does not check that the letters form a valid numeral.
+fn looks_like_roman_numerals(word: &CharString) -> bool {
+    const NUMERAL_LETTERS: &str = "mdclxvi";
+    // An empty word, or one whose first letter is not a numeral letter, is rejected outright.
+    let Some((&head, tail)) = word.split_first() else {
+        return false;
+    };
+    if !NUMERAL_LETTERS.contains(head.to_ascii_lowercase()) {
+        return false;
+    }
+
+    // Every remaining letter must be a numeral letter in the same case as the first one.
+    let head_is_upper = head.is_uppercase();
+    tail.iter().all(|&c| {
+        NUMERAL_LETTERS.contains(c.to_ascii_lowercase()) && c.is_uppercase() == head_is_upper
+    })
+}
+
+/// Checks that `word`, which must already be lowercased, is a well-formed Roman numeral.
+///
+/// The expected shape is up to four leading `m`s followed by at most one hundreds group, one
+/// tens group and one units group, in that order. Any character left over once those have
+/// been consumed makes the word invalid.
+///
+/// The empty slice is accepted, so callers must reject empty words themselves. Uppercase
+/// letters never match any numeral letter, so the caller must lowercase the word first.
+fn is_really_roman_numerals(word: &[char]) -> bool {
+    // Walk the slice directly: the input is already a sequence of `char`s, so collecting it
+    // into a `String` just to iterate over it again would only add a heap allocation.
+    let mut chars = word.iter().copied().peekable();
+
+    // Thousands: consume at most four `m`s.
+    for _ in 0..4 {
+        if chars.next_if_eq(&'m').is_none() {
+            break;
+        }
+    }
+
+    // Hundreds, tens and units share one shape and differ only in their three letters.
+    let groups_are_valid = check_roman_group(&mut chars, 'c', 'd', 'm')
+        && check_roman_group(&mut chars, 'x', 'l', 'c')
+        && check_roman_group(&mut chars, 'i', 'v', 'x');
+
+    // Leftover input means the letters are out of order or repeated too often.
+    groups_are_valid && chars.next().is_none()
+}
+
+/// Consumes at most one decimal digit of a Roman numeral from the front of `chars`.
+///
+/// `one`, `five` and `ten` are the letters for this digit's unit, five times the unit and ten
+/// times the unit (for example `i`, `v` and `x`). The accepted spellings are `one` followed by
+/// `five` or `ten`, one to three `one`s, or `five` followed by up to three `one`s.
+///
+/// Nothing is consumed when the next character is neither `one` nor `five`. The return value
+/// is always `true`: malformed input is never rejected here, it is simply left unconsumed in
+/// `chars`.
+fn check_roman_group<I: Iterator<Item = char>>(
+    chars: &mut std::iter::Peekable<I>,
+    one: char,
+    five: char,
+    ten: char,
+) -> bool {
+    if chars.next_if_eq(&one).is_some() {
+        // Subtractive pair (4 or 9 units); otherwise up to two further `one`s (1 to 3 units).
+        if chars.next_if(|&c| c == ten || c == five).is_none() {
+            for _ in 0..2 {
+                if chars.next_if_eq(&one).is_none() {
+                    break;
+                }
+            }
+        }
+    } else if chars.next_if_eq(&five).is_some() {
+        // `five` on its own or followed by up to three `one`s (5 to 8 units).
+        for _ in 0..3 {
+            if chars.next_if_eq(&one).is_none() {
+                break;
+            }
+        }
+    }
+
+    // Nothing above can fail; input that does not fit simply stays in `chars`.
+    true
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -442,6 +533,38 @@ mod tests {
         // Needs at least 3 chars
         assert!(!check_orthography_str("Hi").contains(OrthFlags::UPPER_CAMEL));
     }
+
+    #[test]
+    fn test_roman_numerals() {
+        // Valid numerals in all-upper or all-lower case; "CDXLIV" covers the CD and XL pairs.
+        for numeral in ["MCMXCIV", "mdccclxxi", "MMXXI", "mcmxciv", "CDXLIV", "MMI", "MMXXV"] {
+            let flags = check_orthography_str(numeral);
+            assert!(flags.contains(OrthFlags::ROMAN_NUMERALS), "{numeral} should be flagged");
+        }
+    }
+
+    #[test]
+    fn test_single_roman_numeral() {
+        // A single numeral letter is a complete numeral on its own.
+        assert!(check_orthography_str("i").contains(OrthFlags::ROMAN_NUMERALS));
+    }
+
+    #[test]
+    fn empty_string_is_not_roman_numeral() {
+        // An empty word has no letters and must never be flagged.
+        assert!(!check_orthography_str("").contains(OrthFlags::ROMAN_NUMERALS));
+    }
+
+    #[test]
+    fn dont_allow_mixed_case_roman_numerals() {
+        assert!(!check_orthography_str("MCMlxxxVIII").contains(OrthFlags::ROMAN_NUMERALS));
+    }
+
+    #[test]
+    fn dont_allow_looks_like_but_isnt_roman_numeral() {
+        assert!(!check_orthography_str("mdxlivx").contains(OrthFlags::ROMAN_NUMERALS));
+        assert!(!check_orthography_str("XIXIVV").contains(OrthFlags::ROMAN_NUMERALS));
+    }
 }
 
 #[derive(Debug, Clone, Serialize, Deserialize)]

diff --git a/harper-core/src/token_kind.rs b/harper-core/src/token_kind.rs
@@ -125,7 +125,9 @@ impl TokenKind {
         is_allcaps,
         is_lower_camel,
         is_upper_camel,
-        is_apostrophized
+        is_apostrophized,
+
+        is_roman_numerals
     }
 
     // Word metadata delegation methods not generated by macro

diff --git a/harper-core/src/word_metadata.rs b/harper-core/src/word_metadata.rs
@@ -721,6 +721,11 @@ impl WordMetadata {
         self.orth_info.contains(OrthFlags::APOSTROPHE)
     }
 
+    /// Whether the orthography flags mark this word as possibly being Roman numerals.
+    pub fn is_roman_numerals(&self) -> bool {
+        self.orth_info.contains(OrthFlags::ROMAN_NUMERALS)
+    }
+
     /// Same thing as [`Self::or`], except in-place rather than a clone.
     pub fn append(&mut self, other: &Self) -> &mut Self {
         *self = self.or(other);

diff --git a/harper-core/src/word_metadata_orthography.rs b/harper-core/src/word_metadata_orthography.rs
@@ -18,12 +18,14 @@ pub enum Orthography {
     Hyphenated = 1 << 6,
     /// Contains an apostrophe, so it's a possessive or a contraction.
     Apostrophe = 1 << 7,
+    /// Could be Roman numerals.
+    RomanNumerals = 1 << 8,
 }
 
 /// The underlying type used for OrthographyFlags.
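-/// At the time of writing, this is currently a `u8`. If we want to define more than 8 orthographic
-/// properties in the future, we will need to switch this to a larger type.
+/// At the time of writing, this is currently a `u16`. If we want to define more than 16
+/// orthographic properties in the future, we will need to switch this to a larger type.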
-type OrthographyFlagsUnderlyingType = u8;
+type OrthographyFlagsUnderlyingType = u16;
 
 bitflags::bitflags! {
     /// A collection of bit flags used to represent orthographic properties of a word.
@@ -40,6 +42,7 @@ bitflags::bitflags! {
         const MULTIWORD = Orthography::Multiword as OrthographyFlagsUnderlyingType;
         const HYPHENATED = Orthography::Hyphenated as OrthographyFlagsUnderlyingType;
         const APOSTROPHE = Orthography::Apostrophe as OrthographyFlagsUnderlyingType;
+        const ROMAN_NUMERALS = Orthography::RomanNumerals as OrthographyFlagsUnderlyingType;
     }
 }
 impl Default for OrthFlags {

diff --git a/harper-core/tests/pos_tags.rs b/harper-core/tests/pos_tags.rs
@@ -71,6 +71,7 @@
 //! - [`TokenKind::Punctuation`] are denoted by `.`.
 //! - [`TokenKind::Number`] are denoted by `#`.
 //! - [`TokenKind::Decade`] are denoted by `#d`.
+//! - Roman numerals are denoted by `#r`.
 //! - [`TokenKind::Space`], [`TokenKind::Newline`], and
 //!   [`TokenKind::ParagraphBreak`] are ignored.
 //! - All other token kinds are denoted by their variant name.
@@ -210,6 +211,9 @@ fn format_word_tag(word: &WordMetadata) -> String {
     if word.preposition {
         add("P", &mut tags);
     }
+    if word.is_roman_numerals() {
+        add("#r", &mut tags);
+    }
 
     get_dialect_annotations(word).into_iter().for_each(|tag| {
         add(tag, &mut tags);