|
1 | 1 | """
|
2 |
| -Test that an unknown English character doesn't result in bizarre splits |
| 2 | +Test a couple of English MWT corner cases which might also apply to other MWT languages |
| 3 | +
|
| 4 | +- unknown English character doesn't result in bizarre splits |
| 5 | +- Casing or CASING doesn't get lost in the dictionary lookup |
3 | 6 |
|
4 | 7 | In the English UD datasets, the MWT are composed exactly of the
|
5 | 8 | subwords, so the MWT model should be chopping up the input text rather
|
6 | 9 | than generating new text.
|
| 10 | +
|
| 11 | +Furthermore, SHE'S and She's should be split "SHE 'S" and "She 's", respectively |
7 | 12 | """
|
8 | 13 |
|
9 | 14 | import pytest
|
@@ -40,3 +45,44 @@ def test_mwt_unknown_char():
|
40 | 45 | assert doc.sentences[0].tokens[3].text == possessive
|
41 | 46 | assert len(doc.sentences[0].tokens[3].words) == 2
|
42 | 47 | assert "".join(x.text for x in doc.sentences[0].tokens[3].words) == possessive
|
| 48 | + |
| 49 | + |
| 50 | +def test_english_mwt_casing(): |
| 51 | + """ |
| 52 | + Test that a word whose lowercase split is known still keeps its original casing when split |
| 53 | +
|
| 54 | + Once upon a time, the logic used in the MWT expander would split |
| 55 | + SHE'S -> she 's |
| 56 | +
|
| 57 | + which is a very surprising tokenization to people expecting |
| 58 | + the original text in the output document |
| 59 | + """ |
| 60 | + pipeline = stanza.Pipeline(processors='tokenize,mwt', dir=TEST_MODELS_DIR, lang='en', download_method=None) |
| 61 | + |
| 62 | + mwt_trainer = pipeline.processors['mwt']._trainer |
| 63 | + for i in range(1, 20): |
| 64 | + # many test cases use a name of this form, so it may already be known; |
| 65 | + # we should proactively look for a variant which hasn't |
| 66 | + # made its way into the MWT dictionary |
| 67 | + unknown_name = "jennife" + "r" * i + "'s" |
| 68 | + if unknown_name not in mwt_trainer.expansion_dict and unknown_name.upper() not in mwt_trainer.expansion_dict: |
| 69 | + unknown_name = unknown_name.upper() |
| 70 | + break |
| 71 | + else: |
| 72 | + raise AssertionError("Need a new heuristic for the unknown word in the English MWT!") |
| 73 | + |
| 74 | + # this SHOULD show up in the expansion dict |
| 75 | + assert "she's" in mwt_trainer.expansion_dict, "Expected |she's| to be in the English MWT expansion dict... perhaps find a different test case" |
| 76 | + |
| 77 | + text = [x.text for x in pipeline("JENNIFER HAS NICE ANTENNAE").sentences[0].words] |
| 78 | + assert text == ['JENNIFER', 'HAS', 'NICE', 'ANTENNAE'] |
| 79 | + |
| 80 | + text = [x.text for x in pipeline(unknown_name + " GOT NICE ANTENNAE").sentences[0].words] |
| 81 | + assert text == [unknown_name[:-2], "'S", 'GOT', 'NICE', 'ANTENNAE'] |
| 82 | + |
| 83 | + text = [x.text for x in pipeline("SHE'S GOT NICE ANTENNAE").sentences[0].words] |
| 84 | + assert text == ['SHE', "'S", 'GOT', 'NICE', 'ANTENNAE'] |
| 85 | + |
| 86 | + text = [x.text for x in pipeline("She's GOT NICE ANTENNAE").sentences[0].words] |
| 87 | + assert text == ['She', "'s", 'GOT', 'NICE', 'ANTENNAE'] |
| 88 | + |
0 commit comments