Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
125 changes: 124 additions & 1 deletion harper-core/src/spell/rune/attribute_list.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ use super::expansion::{
use super::word_list::MarkedWord;
use crate::spell::WordId;
use crate::word_metadata_orthography::OrthFlags;
use crate::{CharString, Span, WordMetadata};
use crate::{CharString, CharStringExt, Span, WordMetadata};

#[derive(Debug, Clone)]
pub struct AttributeList {
Expand Down Expand Up @@ -333,9 +333,100 @@ fn check_orthography(word: &MarkedWord) -> OrthFlags {
}
}

if looks_like_roman_numerals(&word.letters)
&& is_really_roman_numerals(&word.letters.to_lower())
{
ortho_flags |= OrthFlags::ROMAN_NUMERALS;
}

ortho_flags
}

/// Cheap pre-filter: does `word` consist solely of Roman-numeral letters in one
/// consistent letter case?
///
/// Returns `true` only when `word` is non-empty, every character is one of
/// `m d c l x v i` (in either case), and every character has the same case as
/// the first one. It does NOT check that the letters form a well-formed
/// numeral; `"mdxlivx"` passes this filter.
///
/// Takes a plain `&[char]` so that any char buffer that derefs to a slice can
/// be passed, matching the signature of `is_really_roman_numerals`.
fn looks_like_roman_numerals(word: &[char]) -> bool {
    const NUMERAL_LETTERS: &str = "mdclxvi";

    // An empty word is never a numeral.
    let Some((&first, rest)) = word.split_first() else {
        return false;
    };

    if !NUMERAL_LETTERS.contains(first.to_ascii_lowercase()) {
        return false;
    }

    // The first letter fixes the case that every later letter must share.
    let first_char_upper = first.is_uppercase();

    rest.iter().all(|&c| {
        NUMERAL_LETTERS.contains(c.to_ascii_lowercase()) && c.is_uppercase() == first_char_upper
    })
}

fn is_really_roman_numerals(word: &[char]) -> bool {
let s: String = word.iter().collect();
let mut chars = s.chars().peekable();

let mut m_count = 0;
while m_count < 4 && chars.peek() == Some(&'m') {
chars.next();
m_count += 1;
}

if !check_roman_group(&mut chars, 'c', 'd', 'm') {
return false;
}

if !check_roman_group(&mut chars, 'x', 'l', 'c') {
return false;
}

if !check_roman_group(&mut chars, 'i', 'v', 'x') {
return false;
}

if chars.next().is_some() {
return false;
}

true
}

/// Consumes at most one decimal position of a Roman numeral from `chars`.
///
/// `one`, `five` and `ten` are the symbols for that position (for example
/// `'i'`, `'v'`, `'x'` for units). The forms consumed are:
///
/// - `one` followed by `five` or `ten` (the subtractive pairs, e.g. `iv`, `ix`),
/// - `one` followed by up to two more `one`s (e.g. `i`, `ii`, `iii`),
/// - `five` followed by up to three `one`s (e.g. `v`, `vi`, `viii`).
///
/// If the next character is neither `one` nor `five`, nothing is consumed.
///
/// This function always returns `true`: an absent group is valid, and any
/// surplus characters are left in `chars` for the caller to detect.
fn check_roman_group<I: Iterator<Item = char>>(
    chars: &mut std::iter::Peekable<I>,
    one: char,
    five: char,
    ten: char,
) -> bool {
    if chars.next_if_eq(&one).is_some() {
        // A leading `one` is either the start of a subtractive pair...
        let took_pair = chars
            .next_if(|&following| following == ten || following == five)
            .is_some();

        // ...or the first of up to three repeated `one`s.
        if !took_pair {
            for _ in 0..2 {
                if chars.next_if_eq(&one).is_none() {
                    break;
                }
            }
        }
    } else if chars.next_if_eq(&five).is_some() {
        // `five` may be followed by up to three `one`s.
        for _ in 0..3 {
            if chars.next_if_eq(&one).is_none() {
                break;
            }
        }
    }

    true
}

#[cfg(test)]
mod tests {
use super::*;
Expand Down Expand Up @@ -442,6 +533,38 @@ mod tests {
// Needs at least 3 chars
assert!(!check_orthography_str("Hi").contains(OrthFlags::UPPER_CAMEL));
}

#[test]
fn test_roman_numerals() {
    // Well-formed numerals in both upper and lower case. "CDXLIV" covers the
    // `cd` and `xl` subtractive pairs, which none of the other cases exercise.
    let valid = [
        "MCMXCIV",
        "mdccclxxi",
        "MMXXI",
        "mcmxciv",
        "CDXLIV",
        "MMI",
        "MMXXV",
    ];

    for numeral in valid {
        assert!(
            check_orthography_str(numeral).contains(OrthFlags::ROMAN_NUMERALS),
            "{} should be flagged as Roman numerals",
            numeral
        );
    }
}

// A single numeral letter on its own must still be flagged as Roman numerals.
#[test]
fn test_single_roman_numeral() {
assert!(check_orthography_str("i").contains(OrthFlags::ROMAN_NUMERALS));
}

// The empty string must never be flagged as Roman numerals.
#[test]
fn empty_string_is_not_roman_numeral() {
assert!(!check_orthography_str("").contains(OrthFlags::ROMAN_NUMERALS));
}

// Every letter must share one case; a numeral that switches between upper and
// lower case part-way through must not be flagged.
#[test]
fn dont_allow_mixed_case_roman_numerals() {
assert!(!check_orthography_str("MCMlxxxVIII").contains(OrthFlags::ROMAN_NUMERALS));
}

// Words built only from numeral letters, all in one case, that are still not
// well-formed numerals must not be flagged.
#[test]
fn dont_allow_looks_like_but_isnt_roman_numeral() {
// A trailing `x` is left over after the units group `iv`.
assert!(!check_orthography_str("mdxlivx").contains(OrthFlags::ROMAN_NUMERALS));
// After `X` (tens) and `IX` (units), `IVV` is left over.
assert!(!check_orthography_str("XIXIVV").contains(OrthFlags::ROMAN_NUMERALS));
}
}

#[derive(Debug, Clone, Serialize, Deserialize)]
Expand Down
4 changes: 3 additions & 1 deletion harper-core/src/token_kind.rs
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,9 @@ impl TokenKind {
is_allcaps,
is_lower_camel,
is_upper_camel,
is_apostrophized
is_apostrophized,

is_roman_numerals
}

// Word metadata delegation methods not generated by macro
Expand Down
4 changes: 4 additions & 0 deletions harper-core/src/word_metadata.rs
Original file line number Diff line number Diff line change
Expand Up @@ -721,6 +721,10 @@ impl WordMetadata {
self.orth_info.contains(OrthFlags::APOSTROPHE)
}

/// Returns `true` if this word's orthography flags contain
/// [`OrthFlags::ROMAN_NUMERALS`].
pub fn is_roman_numerals(&self) -> bool {
self.orth_info.contains(OrthFlags::ROMAN_NUMERALS)
}

/// Same thing as [`Self::or`], except in-place rather than a clone.
pub fn append(&mut self, other: &Self) -> &mut Self {
*self = self.or(other);
Expand Down
5 changes: 4 additions & 1 deletion harper-core/src/word_metadata_orthography.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,14 @@ pub enum Orthography {
Hyphenated = 1 << 6,
/// Contains an apostrophe, so it's a possessive or a contraction.
Apostrophe = 1 << 7,
/// Could be Roman numerals.
RomanNumerals = 1 << 8,
}

/// The underlying type used for OrthographyFlags.
/// At the time of writing, this is a `u16`. If we want to define more than 16 orthographic
/// properties in the future, we will need to switch this to a larger type.
type OrthographyFlagsUnderlyingType = u8;
type OrthographyFlagsUnderlyingType = u16;

bitflags::bitflags! {
/// A collection of bit flags used to represent orthographic properties of a word.
Expand All @@ -40,6 +42,7 @@ bitflags::bitflags! {
const MULTIWORD = Orthography::Multiword as OrthographyFlagsUnderlyingType;
const HYPHENATED = Orthography::Hyphenated as OrthographyFlagsUnderlyingType;
const APOSTROPHE = Orthography::Apostrophe as OrthographyFlagsUnderlyingType;
const ROMAN_NUMERALS = Orthography::RomanNumerals as OrthographyFlagsUnderlyingType;
}
}
impl Default for OrthFlags {
Expand Down
4 changes: 4 additions & 0 deletions harper-core/tests/pos_tags.rs
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@
//! - [`TokenKind::Punctuation`] are denoted by `.`.
//! - [`TokenKind::Number`] are denoted by `#`.
//! - [`TokenKind::Decade`] are denoted by `#d`.
//! - Roman numerals are denoted by `#r`.
//! - [`TokenKind::Space`], [`TokenKind::Newline`], and
//! [`TokenKind::ParagraphBreak`] are ignored.
//! - All other token kinds are denoted by their variant name.
Expand Down Expand Up @@ -210,6 +211,9 @@ fn format_word_tag(word: &WordMetadata) -> String {
if word.preposition {
add("P", &mut tags);
}
if word.is_roman_numerals() {
add("#r", &mut tags);
}

get_dialect_annotations(word).into_iter().for_each(|tag| {
add(tag, &mut tags);
Expand Down
Loading
Loading