"""Convert the SIGHAN bakeoff 2005/2008 word segmentation corpora into
normalized, sentence-split datasets with character-level BMES tags."""
import argparse
import os
import re

from opencc import OpenCC

from utils import append_tags, make_sure_path_exists

# Traditional-to-Simplified Chinese converter
cc = OpenCC("t2s")

# Filled in from the command-line arguments in __main__
sighan05_root = ""
sighan08_root = ""
data_path = ""

# Chinese punctuation and its ASCII (half-width) counterparts, position by position
E_pun = ",.!?[]()<>\"\"'',"
C_pun = ",。!?【】()《》“”‘’、"
Table = {ord(f): ord(t) for f, t in zip(C_pun, E_pun)}
Table[12288] = 32  # full-width space (U+3000) -> half-width space


def C_trans_to_E(string):
    """Replace Chinese punctuation with its ASCII counterpart using Table."""
    return string.translate(Table)
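
# Illustrative sketch (doctest-style; the values follow from the Table above):
#   >>> C_trans_to_E("“你好”,世界。")
#   '"你好",世界.'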


def normalize(ustring):
    """Convert full-width characters to their half-width equivalents."""
    rstring = ""
    for uchar in ustring:
        inside_code = ord(uchar)
        if inside_code == 12288:  # full-width space maps directly to ASCII space
            inside_code = 32
        elif 65281 <= inside_code <= 65374:  # other full-width chars differ by a fixed offset
            inside_code -= 65248
        rstring += chr(inside_code)
    return rstring
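
# Illustrative sketch: full-width alphanumerics and spaces collapse to ASCII:
#   >>> normalize("ABC 123")
#   'ABC 123'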


def preprocess(text):
    """Normalize punctuation and width, then replace numbers with '0' and Latin tokens with 'X'."""
    rNUM = r"(-|\+)?\d+((\.|·)\d+)?%?"
    rENG = r"[A-Za-z_]+.*"  # a Latin prefix swallows the rest of the token
    sent = normalize(C_trans_to_E(text.strip())).split()
    new_sent = []
    for word in sent:
        word = re.sub(r"\s+", "", word, flags=re.U)
        word = re.sub(rNUM, "0", word, flags=re.U)
        word = re.sub(rENG, "X", word)
        new_sent.append(word)
    return new_sent
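
# Illustrative sketch: numbers and Latin tokens are anonymized, and Chinese
# punctuation has already been mapped to ASCII by C_trans_to_E:
#   >>> preprocess("2019年 增长 6.1% , ABC 。")
#   ['0年', '增长', '0', ',', 'X', '.']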


def to_sentence_list(text, split_long_sentence=False):
    """Split a preprocessed line into sentences at punctuation (and optionally after 50 chars)."""
    text = preprocess(text)
    delimiter = set()
    delimiter.update("。!?:;…、,(),;!?、,\"'")
    # preprocess has already mapped "。" to "." and ":" to ":", so include the
    # ASCII forms too; without them, sentences would never split at a full stop.
    delimiter.update(".:")
    delimiter.add("……")
    sent_list = []
    sent = []
    sent_len = 0
    for word in text:
        sent.append(word)
        sent_len += len(word)
        if word in delimiter or (split_long_sentence and sent_len >= 50):
            sent_list.append(sent)
            sent = []
            sent_len = 0

    if len(sent) > 0:
        sent_list.append(sent)

    return sent_list
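
# Illustrative sketch (relies on the ASCII "." delimiter added above):
#   >>> to_sentence_list("今天 天气 好 。 明天 见 。")
#   [['今天', '天气', '好', '.'], ['明天', '见', '.']]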


def is_traditional(dataset):
    """Corpora distributed in Traditional Chinese, which need t2s conversion."""
    return dataset in ["as", "cityu", "ckip"]


def convert_file(
    src, des, need_cc=False, split_long_sentence=False, encode="utf-8-sig"
):
    """Preprocess src line by line and write one sentence per line to des."""
    with open(src, encoding=encode) as fin, open(des, "w", encoding="utf-8") as fout:
        for line in fin:
            for sent in to_sentence_list(line, split_long_sentence):
                out_line = " ".join(sent) + "\n"
                if need_cc:
                    out_line = cc.convert(out_line)  # Traditional -> Simplified
                fout.write(out_line)
                # if len(''.join(sent)) > 200:
                #     print(' '.join(sent))
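
# A typical call (the paths are illustrative): convert a Traditional Chinese
# training file, splitting sentences that grow past 50 characters:
#   convert_file("as_training.utf8", "train-all.txt", need_cc=True,
#                split_long_sentence=True)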


def split_train_dev(dataset):
    """Split raw/train-all.txt 90/10 into train.txt and dev.txt."""
    root = data_path + "/" + dataset + "/raw/"
    with open(root + "train-all.txt", encoding="utf-8") as src, open(
        root + "train.txt", "w", encoding="utf-8"
    ) as train, open(root + "dev.txt", "w", encoding="utf-8") as dev:
        lines = src.readlines()
        idx = int(len(lines) * 0.9)
        for line in lines[:idx]:
            train.write(line)
        for line in lines[idx:]:
            dev.write(line)


def combine_files(one, two, out):
    """Concatenate two text files into a third."""
    if os.path.exists(out):
        os.remove(out)
    with open(one, encoding="utf-8") as f1, open(two, encoding="utf-8") as f2, open(
        out, "a", encoding="utf-8"
    ) as fout:
        for line in f1:
            fout.write(line)
        for line in f2:
            fout.write(line)


def bmes_tag(input_file, output_file):
    """Tag each character B/M/E/S; dataset tags like <pku> count as single tokens."""
    with open(input_file, encoding="utf-8") as input_data, open(
        output_file, "w", encoding="utf-8"
    ) as output_data:
        for line in input_data:
            word_list = line.strip().split()
            for word in word_list:
                if len(word) == 1 or (
                    len(word) > 2 and word[0] == "<" and word[-1] == ">"
                ):
                    output_data.write(word + "\tS\n")
                else:
                    output_data.write(word[0] + "\tB\n")
                    for w in word[1:-1]:
                        output_data.write(w + "\tM\n")
                    output_data.write(word[-1] + "\tE\n")
            output_data.write("\n")
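
# Illustrative sketch: the input line "<pku> 中华 人民" is written out as
#   <pku>\tS, 中\tB, 华\tE, 人\tB, 民\tE  -- one char-tag pair per line,
# with a blank line terminating each sentence.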


def make_bmes(dataset="pku"):
    path = data_path + "/" + dataset + "/"
    make_sure_path_exists(path + "bmes")
    bmes_tag(path + "raw/train.txt", path + "bmes/train.txt")
    bmes_tag(path + "raw/train-all.txt", path + "bmes/train-all.txt")
    bmes_tag(path + "raw/dev.txt", path + "bmes/dev.txt")
    bmes_tag(path + "raw/test.txt", path + "bmes/test.txt")


def convert_sighan2005_dataset(dataset):
    root = os.path.join(data_path, dataset)
    make_sure_path_exists(root)
    make_sure_path_exists(root + "/raw")
    file_path = "{}/{}_training.utf8".format(sighan05_root, dataset)
    convert_file(
        file_path, "{}/raw/train-all.txt".format(root), is_traditional(dataset), True
    )
    if dataset == "as":
        file_path = "{}/{}_testing_gold.utf8".format(sighan05_root, dataset)
    else:
        file_path = "{}/{}_test_gold.utf8".format(sighan05_root, dataset)
    convert_file(
        file_path, "{}/raw/test.txt".format(root), is_traditional(dataset), False
    )
    split_train_dev(dataset)


def convert_sighan2008_dataset(dataset, utf=16):
    root = os.path.join(data_path, dataset)
    make_sure_path_exists(root)
    make_sure_path_exists(root + "/raw")
    convert_file(
        "{}/{}_train_utf{}.seg".format(sighan08_root, dataset, utf),
        "{}/raw/train-all.txt".format(root),
        is_traditional(dataset),
        True,
        "utf-{}".format(utf),
    )
    convert_file(
        "{}/{}_seg_truth&resource/{}_truth_utf{}.seg".format(
            sighan08_root, dataset, dataset, utf
        ),
        "{}/raw/test.txt".format(root),
        is_traditional(dataset),
        False,
        "utf-{}".format(utf),
    )
    split_train_dev(dataset)


def extract_conll(src, out):
    """Extract the word column (second field) of a CoNLL file into space-joined sentences."""
    words = []
    with open(src, encoding="utf-8") as fin, open(out, "w", encoding="utf-8") as fout:
        for line in fin:
            line = line.strip()
            if len(line) == 0:
                fout.write(" ".join(words) + "\n")
                words = []
                continue
            cells = line.split()
            words.append(cells[1])
        if words:  # flush the final sentence if the file lacks a trailing blank line
            fout.write(" ".join(words) + "\n")
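
# Illustrative sketch: two CoNLL rows "1\t中国\t..." and "2\t加油\t..." followed
# by a blank line yield the single output line "中国 加油".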


def make_joint_corpus(datasets, joint):
    """Pool the given corpora into one joint corpus under data_path/joint."""
    parts = ["dev", "test", "train", "train-all"]
    for part in parts:
        old_file = "{}/{}/raw/{}.txt".format(data_path, joint, part)
        if os.path.exists(old_file):
            os.remove(old_file)
        elif not os.path.exists(os.path.dirname(old_file)):
            os.makedirs(os.path.dirname(old_file))
        for name in datasets:
            append_tags(
                os.path.join(data_path, name, "raw"),
                os.path.dirname(old_file),
                name,
                part,
                encode="utf-8",
            )


def convert_all_sighan2005(datasets):
    for dataset in datasets:
        print("Converting sighan bakeoff 2005 corpus: {}".format(dataset))
        convert_sighan2005_dataset(dataset)
        make_bmes(dataset)


def convert_all_sighan2008(datasets):
    for dataset in datasets:
        print("Converting sighan bakeoff 2008 corpus: {}".format(dataset))
        convert_sighan2008_dataset(dataset, 16)
        make_bmes(dataset)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # fmt: off
    parser.add_argument("--sighan05", required=True, type=str, help="path to sighan2005 dataset")
    parser.add_argument("--sighan08", required=True, type=str, help="path to sighan2008 dataset")
    parser.add_argument("--data_path", required=True, type=str, help="path to save dataset")
    # fmt: on

    args, _ = parser.parse_known_args()
    sighan05_root = args.sighan05
    sighan08_root = args.sighan08
    data_path = args.data_path

    print("Converting sighan2005 Simplified Chinese corpus")
    datasets = "pku", "msr", "as", "cityu"
    convert_all_sighan2005(datasets)

    print("Combining sighan2005 corpora into one joint Simplified Chinese corpus")
    make_joint_corpus(datasets, "joint-sighan2005")
    make_bmes("joint-sighan2005")

    # Researchers with access to the sighan2008 corpus should use the official release.
    print("Converting sighan2008 Simplified Chinese corpus")
    datasets = "ctb", "ckip", "cityu", "ncc", "sxu"
    convert_all_sighan2008(datasets)
    print("Combining those 8 sighan corpora into one joint corpus")
    datasets = "pku", "msr", "as", "ctb", "ckip", "cityu", "ncc", "sxu"
    make_joint_corpus(datasets, "joint-sighan2008")
    make_bmes("joint-sighan2008")