From ad5e3f71423ec6c986d91d0478bbd7544009ad0b Mon Sep 17 00:00:00 2001
From: baicaixiaozhan
Date: Sun, 19 Nov 2023 00:27:18 +0800
Subject: [PATCH 1/2] =?UTF-8?q?feat:=20=E6=96=B0=E5=A2=9E=20=E5=8D=83?=
 =?UTF-8?q?=E5=88=86=E4=BD=8D=E6=A0=BC=E5=BC=8F=E6=95=B0=E5=AD=97=E8=AF=86?=
 =?UTF-8?q?=E5=88=AB?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

[skip ci]
---
 .../impl/ThousandsSeparatorRecognition.java   | 121 +++++++++++++++++++
 .../ThousandsSeparatorRecognitionTest.java    |  85 +++++++++++++
 2 files changed, 206 insertions(+)
 create mode 100644 src/main/java/org/ansj/recognition/impl/ThousandsSeparatorRecognition.java
 create mode 100644 src/test/java/org/ansj/recognition/impl/ThousandsSeparatorRecognitionTest.java

diff --git a/src/main/java/org/ansj/recognition/impl/ThousandsSeparatorRecognition.java b/src/main/java/org/ansj/recognition/impl/ThousandsSeparatorRecognition.java
new file mode 100644
index 00000000..793d474c
--- /dev/null
+++ b/src/main/java/org/ansj/recognition/impl/ThousandsSeparatorRecognition.java
@@ -0,0 +1,121 @@
+package org.ansj.recognition.impl;
+
+import org.ansj.domain.Result;
+import org.ansj.domain.Term;
+import org.ansj.domain.TermNature;
+import org.ansj.domain.TermNatures;
+import org.ansj.recognition.Recognition;
+
+import java.util.*;
+
+/**
+ * DESC: Recognition of numbers written in thousands-separator format
+ * (e.g. 1,234,567.11 or 1,234,567).
+ *
+ * @author baicaixiaozhan
+ * @since v5.1.6
+ */
+public class ThousandsSeparatorRecognition implements Recognition {
+
+    private static final TermNatures THOUSANDS_SEPARATOR_M = new TermNatures(new TermNature("thousands_separator", 1));
+    private String separator;
+
+    /** Creates a recognizer that uses "," as the separator. */
+    public ThousandsSeparatorRecognition() {
+        this.separator = ",";
+    }
+
+    /**
+     * Creates a recognizer that uses a custom separator.
+     *
+     * @param separator the term name that separates the digit groups
+     */
+    public ThousandsSeparatorRecognition(String separator) {
+        this.separator = separator;
+    }
+
+    public String getSeparator() {
+        return separator;
+    }
+
+    public void setSeparator(String separator) {
+        this.separator = separator;
+    }
+
+    /**
+     * Merges every run of the form "M_ALB term, separator, 3-character group, ..." into its first
+     * term, gives that term the thousands_separator nature, and removes the merged terms from
+     * the list returned by result.getTerms().
+     *
+     * @param result the result whose terms are scanned
+     */
+    @Override
+    public void recognition(Result result) {
+        List<Term> terms = result.getTerms();
+        if (terms.isEmpty()) {
+            return;
+        }
+
+        for (Term term : terms) {
+            // An offset of -1 marks a term that doMerge has already merged away.
+            if (term.getOffe() == -1) {
+                continue;
+            }
+
+            if (Objects.equals(term.termNatures(), TermNatures.M_ALB) && isMatchThousands(term.to())) {
+                // Handle a number in thousands-separator format
+                doMerge(term);
+                term.updateTermNaturesAndNature(THOUSANDS_SEPARATOR_M);
+
+                // Keep merging for as long as another "separator + group" pair follows.
+                Term to = term.to();
+                while (isMatchThousands(to)) {
+                    doMerge(term);
+                    to = term.to();
+                }
+            }
+        }
+
+        // Drop every term that was flagged with offset -1 above.
+        for (Iterator<Term> iterator = terms.iterator(); iterator.hasNext();) {
+            Term term = iterator.next();
+            if (term.getOffe() == -1) {
+                iterator.remove();
+            }
+        }
+    }
+
+    /**
+     * Merges the term returned by term.to() into the given term, twice in a row, and sets the
+     * offset of each merged term to -1 so that recognition() removes it afterwards.
+     * NOTE(review): presumably merage advances term.to(), so the two merged terms are the
+     * separator and the digit group after it - confirm against Term.merage.
+     */
+    private void doMerge(Term term) {
+        Term to1 = term.to();
+        term.merage(to1);
+        to1.setOffe(-1);
+
+        Term to2 = term.to();
+        term.merage(to2);
+        to2.setOffe(-1);
+    }
+
+    /**
+     * Checks whether the given term is a separator between two digit groups: its name equals the
+     * separator, the preceding term's name has at most 3 characters before its first separator
+     * (or at most 3 in total when it contains none), and the following term has
+     * TermNatures.M_ALB with exactly 3 characters before its "." (or exactly 3 in total).
+     */
+    private boolean isMatchThousands(Term term) {
+        return Objects.equals(term.getName(), separator)
+                && (
+                    (term.from().getName().contains(separator) && term.from().getName().indexOf(separator) <= 3)
+                    || (!term.from().getName().contains(separator) && term.from().getName().length() <= 3)
+                )
+                && Objects.equals(term.to().termNatures(), TermNatures.M_ALB)
+                && ((term.to().getName().contains(".") && term.to().getName().indexOf(".") == 3)
+                    || (!term.to().getName().contains(".") && term.to().getName().length() == 3));
+    }
+
+}
diff --git a/src/test/java/org/ansj/recognition/impl/ThousandsSeparatorRecognitionTest.java b/src/test/java/org/ansj/recognition/impl/ThousandsSeparatorRecognitionTest.java
new file mode 100644
index 00000000..f9c1d104
--- /dev/null
+++ b/src/test/java/org/ansj/recognition/impl/ThousandsSeparatorRecognitionTest.java
@@ -0,0 +1,85 @@
+package org.ansj.recognition.impl;
+
+import org.ansj.domain.Result;
+import org.ansj.splitWord.analysis.ToAnalysis;
+import org.junit.Assert;
+import org.junit.Test;
+
+
+/**
+ * DESC: Unit tests for thousands-separator number recognition
+ *
+ * @author baicaixiaozhan
+ * @since v1.0.0
+ */
+public class ThousandsSeparatorRecognitionTest {
+
+    @Test
+    public void test_ThousandsSeparatorRecognition_whenThousandsSeparatorExisted() {
+        final String WITH_THOUSANDS_SEPARATOR_NUMBER_TEXT = "当日访问量为10,234,543 10000.00。是预期结果";
+
+        Result result = ToAnalysis.parse(WITH_THOUSANDS_SEPARATOR_NUMBER_TEXT).recognition(new ThousandsSeparatorRecognition());
+
+        Assert.assertEquals("10,234,543", result.get(3).getName());
+        Assert.assertEquals("10,234,543/thousands_separator", result.get(3).toString());
+        Assert.assertEquals("10000.00", result.get(5).getName());
+        Assert.assertEquals("10000.00/m", result.get(5).toString());
+    }
+
+    @Test
+    public void test_ThousandsSeparatorRecognition_whenUseCustomSeparator() {
+        final String WITH_THOUSANDS_SEPARATOR_NUMBER_TEXT = "10,234,543 102_234_543.00";
+
+        Result result = ToAnalysis.parse(WITH_THOUSANDS_SEPARATOR_NUMBER_TEXT)
+                .recognition(new ThousandsSeparatorRecognition(","))
+                .recognition(new ThousandsSeparatorRecognition("_"));
+
+        Assert.assertEquals("10,234,543", result.get(0).getName());
+        Assert.assertEquals("10,234,543/thousands_separator", result.get(0).toString());
+        Assert.assertEquals("102_234_543.00", result.get(2).getName());
+        Assert.assertEquals("102_234_543.00/thousands_separator", result.get(2).toString());
+    }
+
+    @Test
+    public void test_ThousandsSeparatorRecognition_whenThousandsSeparatorIsError() {
+        Result result1 = ToAnalysis.parse("10,234,5430").recognition(new ThousandsSeparatorRecognition());
+        Assert.assertEquals("10,234 | , | 5430", result1.toStringWithOutNature(" | "));
+
+        Result result2 = ToAnalysis.parse("1088,234,5430").recognition(new ThousandsSeparatorRecognition());
+        Assert.assertEquals("1088 | , | 234 | , | 5430", result2.toStringWithOutNature(" | "));
+
+        Result result3 = ToAnalysis.parse("108,234,5430.00").recognition(new ThousandsSeparatorRecognition());
+        Assert.assertEquals("108,234 | , | 5430.00", result3.toStringWithOutNature(" | "));
+
+        Result result4 = ToAnalysis.parse("108,234.00,430.00").recognition(new ThousandsSeparatorRecognition());
+        Assert.assertEquals("108,234.00 | , | 430.00", result4.toStringWithOutNature(" | "));
+    }
+
+    @Test
+    public void test_ThousandsSeparatorRecognition_whenThousandsSeparatorInEnd() {
+        final String WITH_THOUSANDS_SEPARATOR_NUMBER_TEXT = "存在金额:100,234,543.00元";
+
+        Result result = ToAnalysis.parse(WITH_THOUSANDS_SEPARATOR_NUMBER_TEXT).recognition(new ThousandsSeparatorRecognition());
+
+        Assert.assertEquals("100,234,543.00元", result.get(3).getName());
+    }
+
+    @Test
+    public void test_ThousandsSeparatorRecognition_whenThousandsSeparatorInStart() {
+        final String WITH_THOUSANDS_SEPARATOR_NUMBER_TEXT = "100,234,543.00是预期结果";
+
+        Result result = ToAnalysis.parse(WITH_THOUSANDS_SEPARATOR_NUMBER_TEXT).recognition(new ThousandsSeparatorRecognition());
+
+        Assert.assertEquals("100,234,543.00", result.get(0).getName());
+    }
+
+    @Test
+    public void test_ThousandsSeparatorRecognition_whenThousandsSeparatorInCenter() {
+        final String WITH_THOUSANDS_SEPARATOR_NUMBER_TEXT = "当日访问量为10,234,543。是预期结果";
+
+        Result result = ToAnalysis.parse(WITH_THOUSANDS_SEPARATOR_NUMBER_TEXT).recognition(new ThousandsSeparatorRecognition());
+
+        Assert.assertEquals("10,234,543", result.get(3).getName());
+    }
+
+}
From a940103deacdc83d4b21babfc70d16df7255c3a8 Mon Sep 17 00:00:00 2001
From: baicaixiaozhan
Date: Sun, 19 Nov 2023 13:04:43 +0800
Subject: [PATCH 2/2] =?UTF-8?q?docs:=20=E4=BF=AE=E6=94=B9=20=E5=8D=95?=
 =?UTF-8?q?=E5=85=83=E6=B5=8B=E8=AF=95=E9=94=99=E8=AF=AF=E6=8F=8F=E8=BF=B0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

[skip ci]
---
 .../recognition/impl/ThousandsSeparatorRecognitionTest.java     | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/test/java/org/ansj/recognition/impl/ThousandsSeparatorRecognitionTest.java b/src/test/java/org/ansj/recognition/impl/ThousandsSeparatorRecognitionTest.java
index f9c1d104..b81edf79 100644
--- a/src/test/java/org/ansj/recognition/impl/ThousandsSeparatorRecognitionTest.java
+++ b/src/test/java/org/ansj/recognition/impl/ThousandsSeparatorRecognitionTest.java
@@ -10,7 +10,7 @@
  * DESC: Unit tests for thousands-separator number recognition
  *
  * @author baicaixiaozhan
- * @since v1.0.0
+ * @since v5.1.6
  */
 public class ThousandsSeparatorRecognitionTest {
 