Change the MWT dictionary lookup to fall back to the lowercased word only if the original word matches one of a couple of expected casing formats (UPPERCASE or Leadingcase), in which case we can recreate that format after the dictionary lookup. Otherwise, you get unexpected tokenizations such as She's -> she 's. #1371

AngledLuffa · AngledLuffa · commit 5800e02a8df1 · 2024-04-03T15:24:53.000-07:00
diff --git a/stanza/models/mwt/trainer.py b/stanza/models/mwt/trainer.py
@@ -111,29 +111,51 @@ def train_dict(self, pairs):
             seen.add(w)
         return
 
+    def dict_expansion(self, word):
+        """
+        Look up word in the expansion dictionary, retrying lowercased for two common casings
+
+        UPPERCASE and Leadingcase hits are re-cased to match word; returns None on a miss
+        """
+        expansion = self.expansion_dict.get(word)
+        if expansion is not None:
+            return expansion
+
+        if word.isupper():
+            expansion = self.expansion_dict.get(word.lower())
+            if expansion is not None:
+                return expansion.upper()
+
+        if word[:1].isupper() and word[1:].islower():  # [:1] so an empty word is a miss, not an IndexError
+            expansion = self.expansion_dict.get(word.lower())
+            if expansion is not None:
+                return expansion[:1].upper() + expansion[1:]
+
+        # could build a truecasing model of some kind to handle cRaZyCaSe...
+        # but that's probably too much effort
+        return None
+
     def predict_dict(self, words):
         """ Predict a list of expansions given words. """
         expansions = []
         for w in words:
-            if w in self.expansion_dict:
-                expansions += [self.expansion_dict[w]]
-            elif w.lower() in self.expansion_dict:
-                expansions += [self.expansion_dict[w.lower()]]
+            expansion = self.dict_expansion(w)  # exact entry, else a re-cased UPPERCASE / Leadingcase match
+            if expansion is not None:
+                expansions.append(expansion)
             else:
-                expansions += [w]
+                expansions.append(w)  # no dictionary entry: keep the word itself
         return expansions
 
     def ensemble(self, cands, other_preds):
         """ Ensemble the dict with statistical model predictions. """
         expansions = []
         assert len(cands) == len(other_preds)
         for c, pred in zip(cands, other_preds):
-            if c in self.expansion_dict:
-                expansions += [self.expansion_dict[c]]
-            elif c.lower() in self.expansion_dict:
-                expansions += [self.expansion_dict[c.lower()]]
+            expansion = self.dict_expansion(c)  # a dictionary hit takes priority over pred
+            if expansion is not None:
+                expansions.append(expansion)
             else:
-                expansions += [pred]
+                expansions.append(pred)  # no dictionary entry: use the paired prediction
         return expansions
 
     def save(self, filename):
diff --git a/stanza/tests/mwt/test_english_corner_cases.py b/stanza/tests/mwt/test_english_corner_cases.py
@@ -1,9 +1,14 @@
 """
-Test that an unknown English character doesn't result in bizarre splits
+Test a couple English MWT corner cases which might be more widely applicable to other MWT languages
+
+- unknown English character doesn't result in bizarre splits
+- Casing or CASING doesn't get lost in the dictionary lookup
 
 In the English UD datasets, the MWT are composed exactly of the
 subwords, so the MWT model should be chopping up the input text rather
 than generating new text.
+
+Furthermore, SHE'S and She's should be split "SHE 'S" and "She 's" respectively
 """
 
 import pytest
@@ -40,3 +45,44 @@ def test_mwt_unknown_char():
     assert doc.sentences[0].tokens[3].text == possessive
     assert len(doc.sentences[0].tokens[3].words) == 2
     assert "".join(x.text for x in doc.sentences[0].tokens[3].words) == possessive
+
+
+def test_english_mwt_casing():
+    """
+    Test that for a word where the lowercase split is known, the correct casing is still used
+
+    Once upon a time, the logic used in the MWT expander would split
+      SHE'S -> she 's
+
+    which is a very surprising tokenization to people expecting
+    the original text in the output document
+    """
+    pipeline = stanza.Pipeline(processors='tokenize,mwt', dir=TEST_MODELS_DIR, lang='en', download_method=None)
+
+    mwt_trainer = pipeline.processors['mwt']._trainer
+    for i in range(1, 20):
+        # many test cases follow this "jennifer's" pattern for some reason,
+        # so proactively look for a variant (extra trailing r's) which hasn't
+        # made its way into the MWT dictionary in either lower or UPPER case
+        unknown_name = "jennife" + "r" * i + "'s"
+        if unknown_name not in mwt_trainer.expansion_dict and unknown_name.upper() not in mwt_trainer.expansion_dict:
+            unknown_name = unknown_name.upper()
+            break
+    else:  # loop never hit break: every candidate was already in the dictionary
+        raise AssertionError("Need a new heuristic for the unknown word in the English MWT!")
+
+    # this SHOULD show up in the expansion dict
+    assert "she's" in mwt_trainer.expansion_dict, "Expected |she's| to be in the English MWT expansion dict... perhaps find a different test case"
+
+    text = [x.text for x in pipeline("JENNIFER HAS NICE ANTENNAE").sentences[0].words]  # no MWT expected
+    assert text == ['JENNIFER', 'HAS', 'NICE', 'ANTENNAE']
+
+    text = [x.text for x in pipeline(unknown_name + " GOT NICE ANTENNAE").sentences[0].words]  # UPPERCASE word with no dictionary entry
+    assert text == [unknown_name[:-2], "'S", 'GOT', 'NICE', 'ANTENNAE']
+
+    text = [x.text for x in pipeline("SHE'S GOT NICE ANTENNAE").sentences[0].words]  # UPPERCASE form of a known lowercase entry
+    assert text == ['SHE', "'S", 'GOT', 'NICE', 'ANTENNAE']
+
+    text = [x.text for x in pipeline("She's GOT NICE ANTENNAE").sentences[0].words]  # Leadingcase form of a known lowercase entry
+    assert text == ['She', "'s", 'GOT', 'NICE', 'ANTENNAE']
+