init
- README.md +596 -3
- config.json +30 -0
- csc.config +81 -0
- pytorch_model.bin +3 -0
- special_tokens_map.json +7 -0
- tokenizer.json +0 -0
- tokenizer_config.json +13 -0
- vocab.txt +0 -0
README.md
CHANGED
@@ -1,3 +1,596 @@
<p align="center">
    <img src="tet/images/csc_logo.png" width="480">
</p>

# [macro-correct](https://github.com/yongzhuo/macro-correct)
[](https://pypi.org/project/macro-correct/)
[](https://travis-ci.com/yongzhuo/macro-correct)
[](https://pypi.org/project/macro-correct/)
[](https://github.com/yongzhuo/macro-correct/stargazers)
[](https://github.com/yongzhuo/macro-correct/network/members)
[](https://gitter.im/yongzhuo/macro-correct?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)

>>> macro-correct is a text-correction toolkit (Text Correct) supporting Chinese spelling correction and Chinese punctuation correction (CSC, Chinese Spelling Correct / Check). The CSC models handle data from many domains (including classical Chinese); they were trained on large-scale, multi-domain, modern and contemporary corpora, so they generalize well.

>>> macro-correct is a minimalist NLP toolkit for Chinese text correction (CSC, Chinese spelling correction; Punct, Chinese punctuation correction) that depends only on pytorch, transformers, numpy, and opencc.
It uses confusion sets built from most of the publicly available open-source datasets, plus 10M+ training samples generated from corpora such as People's Daily and xuexiqiangguo (学习强国);
it supports several classic models such as MDCSpell, Macbert, ReLM, SoftBERT, and BertCRF;
it supports Chinese spelling correction, Chinese punctuation correction, Chinese grammar correction (planned), and standalone detection/recognition models (planned);
it features lightweight dependencies, concise code, detailed comments, clear debugging, flexible configuration, and easy extension for NLP work.


## Contents
* [Installation](#installation)
* [Usage](#usage)
* [Demo](#demo)
* [Dictionary](#dictionary)
* [Details](#details)
* [Training](#training)
* [Evaluation](#evaluation)
* [Changelog](#changelog)
* [References](#references)
* [Papers](#papers)
* [Cite](#cite)


# Installation
```bash
pip install macro-correct

# Tsinghua mirror
pip install -i https://pypi.tuna.tsinghua.edu.cn/simple macro-correct

# If that fails, install without dependencies, then add whatever packages are still missing
pip install -i https://pypi.tuna.tsinghua.edu.cn/simple macro-correct --no-dependencies
```


# Usage
More samples live in the /tet directory.
- Usage examples are in /tet/tet: tet_csc_token_zh.py for Chinese spelling correction and tet_csc_punct_zh.py for Chinese punctuation correction; for CSC you can also use tet_csc_flag_transformers.py directly.
- Training code is in /tet/train; local pretrained-model paths and the various hyperparameters are configurable there.

# Demo
[HF---Space---Macropodus/macbert4csc_v2](https://huggingface.co/spaces/Macropodus/macbert4csc_v2)

<img src="tet/images/csc_demo.png" width="1024">


## 2. Usage: Text Correction
### 2.1 CSC with macro-correct
```python
#!/usr/bin/python
# -*- coding: utf-8 -*-
# @time    : 2021/2/29 21:41
# @author  : Mo
# @function: text correction with macro-correct


import os
os.environ["MACRO_CORRECT_FLAG_CSC_TOKEN"] = "1"
from macro_correct import correct
### default correction (list input)
text_list = ["真麻烦你了。希望你们好好的跳无",
             "少先队员因该为老人让坐",
             "机七学习是人工智能领遇最能体现智能的一个分知",
             "一只小鱼船浮在平净的河面上"
             ]
text_csc = correct(text_list)
print("default correction (list input):")
for res_i in text_csc:
    print(res_i)
print("#" * 128)

"""
default correction (list input):
{'index': 0, 'source': '真麻烦你了。希望你们好好的跳无', 'target': '真麻烦你了。希望你们好好地跳舞', 'errors': [['的', '地', 12, 0.6584], ['无', '舞', 14, 1.0]]}
{'index': 1, 'source': '少先队员因该为老人让坐', 'target': '少先队员应该为老人让坐', 'errors': [['因', '应', 4, 0.995]]}
{'index': 2, 'source': '机七学习是人工智能领遇最能体现智能的一个分知', 'target': '机器学习是人工智能领域最能体现智能的一个分支', 'errors': [['七', '器', 1, 0.9998], ['遇', '域', 10, 0.9999], ['知', '支', 21, 1.0]]}
{'index': 3, 'source': '一只小鱼船浮在平净的河面上', 'target': '一只小鱼船浮在平静的河面上', 'errors': [['净', '静', 8, 0.9961]]}
"""
```
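Each `errors` entry carries the correction probability as its last element (when `flag_prob=True`), so you can post-filter the returned corrections with a stricter threshold than the built-in one. A minimal sketch, assuming only the result schema shown above ([wrong, right, index, prob]):

```python
# Post-filtering sketch: keep only corrections whose probability clears a
# custom threshold, then rebuild the target string from the source.
def filter_corrections(result, min_prob=0.9):
    filtered = []
    for res in result:
        chars = list(res["source"])
        kept = [e for e in res["errors"] if e[3] >= min_prob]
        for wrong, right, idx, prob in kept:
            chars[idx] = right  # apply only the high-confidence edits
        filtered.append({"source": res["source"],
                         "target": "".join(chars),
                         "errors": kept})
    return filtered

# usage: high-confidence corrections only
# print(filter_corrections(text_csc, min_prob=0.95))
```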
### 2.2 CSC with transformers
```python
#!/usr/bin/python
# -*- coding: utf-8 -*-
# @time    : 2021/2/29 21:41
# @author  : Mo
# @function: load a BERT-style CSC model directly with transformers


import traceback
import time
import sys
import os
os.environ["USE_TORCH"] = "1"
from transformers import BertConfig, BertTokenizer, BertForMaskedLM
import torch

# pretrained_model_name_or_path = "shibing624/macbert4csc-base-chinese"
pretrained_model_name_or_path = "Macropodus/macbert4mdcspell_v2"
# pretrained_model_name_or_path = "Macropodus/macbert4mdcspell_v1"
# pretrained_model_name_or_path = "Macropodus/macbert4csc_v1"
# pretrained_model_name_or_path = "Macropodus/macbert4csc_v2"
# pretrained_model_name_or_path = "Macropodus/bert4csc_v1"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
max_len = 128

print("load model, please wait a few minutes!")
tokenizer = BertTokenizer.from_pretrained(pretrained_model_name_or_path)
bert_config = BertConfig.from_pretrained(pretrained_model_name_or_path)
model = BertForMaskedLM.from_pretrained(pretrained_model_name_or_path)
model.to(device)
print("load model success!")

texts = [
    "机七学习是人工智能领遇最能体现智能的一个分知",
    "我是练习时长两念半的鸽仁练习生蔡徐坤",
    "真麻烦你了。希望你们好好的跳无",
    "他法语说的很好,的语也不错",
    "遇到一位很棒的奴生跟我疗天",
    "我们为这个目标努力不解",
]
len_mid = min(max_len, max([len(t)+2 for t in texts]))

with torch.no_grad():
    outputs = model(**tokenizer(texts, padding=True, max_length=len_mid,
                                return_tensors="pt").to(device))

def get_errors(source, target):
    """ minimal way to collect the errors """
    len_min = min(len(source), len(target))
    errors = []
    for idx in range(len_min):
        if source[idx] != target[idx]:
            errors.append([source[idx], target[idx], idx])
    return errors

result = []
for probs, source in zip(outputs.logits, texts):
    ids = torch.argmax(probs, dim=-1)
    tokens_space = tokenizer.decode(ids[1:-1], skip_special_tokens=False)
    text_new = tokens_space.replace(" ", "")
    target = text_new[:len(source)]
    errors = get_errors(source, target)
    print(source, " => ", target, errors)
    result.append([target, errors])
print(result)
"""
机七学习是人工智能领遇最能体现智能的一个分知 => 机器学习是人工智能领域最能体现智能的一个分支 [['七', '器', 1], ['遇', '域', 10], ['知', '支', 21]]
我是练习时长两念半的鸽仁练习生蔡徐坤 => 我是练习时长两年半的个人练习生蔡徐坤 [['念', '年', 7], ['鸽', '个', 10], ['仁', '人', 11]]
真麻烦你了。希望你们好好的跳无 => 真麻烦你了。希望你们好好地跳舞 [['的', '地', 12], ['无', '舞', 14]]
他法语说的很好,的语也不错 => 他法语说得很好,德语也不错 [['的', '得', 4], ['的', '德', 8]]
遇到一位很棒的奴生跟我疗天 => 遇到一位很棒的女生跟我聊天 [['奴', '女', 7], ['疗', '聊', 11]]
我们为这个目标努力不解 => 我们为这个目标努力不懈 [['解', '懈', 10]]
"""
```
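Note that `get_errors` above compares the two strings position by position, which works here because these MLM-style CSC models substitute characters one-for-one, so source and target have equal length. A hedged alternative sketch using the standard library's `difflib`, in case you ever compare strings of unequal length:

```python
import difflib

def get_errors_difflib(source, target):
    """Collect [wrong, right, index] triples via difflib; tolerates unequal
    lengths by reporting only one-for-one replacement spans."""
    errors = []
    matcher = difflib.SequenceMatcher(None, source, target)
    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        if tag == "replace" and (i2 - i1) == (j2 - j1):
            for k in range(i2 - i1):
                errors.append([source[i1 + k], target[j1 + k], i1 + k])
    return errors

print(get_errors_difflib("少先队员因该为老人让坐", "少先队员应该为老人让坐"))
# [['因', '应', 4]]
```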
## 3. Usage: Punctuation Correction
```python
import os
os.environ["MACRO_CORRECT_FLAG_CSC_PUNCT"] = "1"
from macro_correct import correct_punct


### 1. default punctuation correction (list input)
text_list = ["山不在高有仙则名。",
             "水不在深,有龙则灵",
             "斯是陋室惟吾德馨",
             "苔痕上阶绿草,色入帘青。"
             ]
text_csc = correct_punct(text_list)
print("default punctuation correction (list input):")
for res_i in text_csc:
    print(res_i)
print("#" * 128)

"""
default punctuation correction (list input):
{'index': 0, 'source': '山不在高有仙则名。', 'target': '山不在高,有仙则名。', 'score': 0.9917, 'errors': [['', ',', 4, 0.9917]]}
{'index': 1, 'source': '水不在深,有龙则灵', 'target': '水不在深,有龙则灵。', 'score': 0.9995, 'errors': [['', '。', 9, 0.9995]]}
{'index': 2, 'source': '斯是陋室惟吾德馨', 'target': '斯是陋室,惟吾德馨。', 'score': 0.9999, 'errors': [['', ',', 4, 0.9999], ['', '。', 8, 0.9998]]}
{'index': 3, 'source': '苔痕上阶绿草,色入帘青。', 'target': '苔痕上阶绿,草色入帘青。', 'score': 0.9998, 'errors': [['', ',', 5, 0.9998]]}
"""
```
# Dictionary
## Default confusion dictionary path
* macro_correct/output/confusion_dict.json
## Working with the confusion dictionary
```python
## custom confusion dictionary
#!/usr/bin/python
# -*- coding: utf-8 -*-
# @time    : 2021/2/29 21:41
# @author  : Mo
# @function: tet csc of token confusion dict, confusion dictionary


import os
os.environ["MACRO_CORRECT_FLAG_CSC_TOKEN"] = "1"

from macro_correct.pytorch_textcorrection.tcTrie import ConfusionCorrect
from macro_correct import MODEL_CSC_TOKEN
from macro_correct import correct


### the default confusion dictionary is enabled out of the box; a custom user dict:
user_dict = {
    "乐而往返": "乐而忘返",
    "金钢钻": "金刚钻",
    "藤罗蔓": "藤萝蔓",
}
text_list = [
    "为什么乐而往返?",
    "没有金钢钻就不揽瓷活!",
    "你喜欢藤罗蔓吗?",
    "三周年祭日在哪举行?"
]
text_csc = correct(text_list, flag_confusion=False)
print("default correction (without confusion dict):")
for res_i in text_csc:
    print(res_i)
print("#" * 128)


text_csc = correct(text_list, flag_confusion=True)
print("default correction (with default confusion dict):")
for res_i in text_csc:
    print(res_i)
print("#" * 128)


# --- confusion dictionary (in memory) ---
### add-only: register the user dict on top of the default confusion dict (the default stays active)
MODEL_CSC_TOKEN.model_csc.model_confusion = ConfusionCorrect(user_dict=user_dict)
text_csc = correct(text_list, flag_confusion=True)
print("default correction (confusion dict, add-only):")
for res_i in text_csc:
    print(res_i)
print("#" * 128)
### full override: use only the user dict (the default confusion dict is discarded)
MODEL_CSC_TOKEN.model_csc.model_confusion = ConfusionCorrect(confusion_dict=user_dict)
text_csc = correct(text_list, flag_confusion=True)
print("default correction (confusion dict, full override):")
for res_i in text_csc:
    print(res_i)
print("#" * 128)


# --- confusion dictionary from file ---
### add-only: load a user dict file on top of the default one; `path` just has to be non-empty; the file is JSON of {wrong phrase: correct phrase} key-value pairs; see macro-correct/tet/tet/tet_csc_token_confusion.py
path_user = "./user_confusion_dict.json"
MODEL_CSC_TOKEN.model_csc.model_confusion = ConfusionCorrect(path="1", path_user=path_user)
text_csc = correct(text_list, flag_confusion=True)
print("default correction (confusion dict file, add-only):")
for res_i in text_csc:
    print(res_i)
print("#" * 128)
### full override: use only the user dict file (the default confusion dict is discarded); `path` must be an empty string
MODEL_CSC_TOKEN.model_csc.model_confusion = ConfusionCorrect(path="", path_user=path_user)
text_csc = correct(text_list, flag_confusion=True)
print("default correction (confusion dict file, full override):")
for res_i in text_csc:
    print(res_i)
print("#" * 128)

"""
default correction (without confusion dict):
{'index': 0, 'source': '为什么乐而往返?', 'target': '为什么乐而往返?', 'errors': []}
{'index': 1, 'source': '没有金钢钻就不揽瓷活!', 'target': '没有金刚钻就不揽瓷活!', 'errors': [['钢', '刚', 3, 0.6587]]}
{'index': 2, 'source': '你喜欢藤罗蔓吗?', 'target': '你喜欢藤萝蔓吗?', 'errors': [['罗', '萝', 4, 0.8582]]}
{'index': 3, 'source': '三周年祭日在哪举行?', 'target': '三周年祭日在哪举行?', 'errors': []}
################################################################################################################################
default correction (with default confusion dict):
{'index': 0, 'source': '为什么乐而往返?', 'target': '为什么乐而往返?', 'errors': []}
{'index': 1, 'source': '没有金钢钻就不揽瓷活!', 'target': '没有金刚钻就不揽瓷活!', 'errors': [['钢', '刚', 3, 1.0]]}
{'index': 2, 'source': '你喜欢藤罗蔓吗?', 'target': '你喜欢藤萝蔓吗?', 'errors': [['罗', '萝', 4, 0.8582]]}
{'index': 3, 'source': '三周年祭日在哪举行?', 'target': '三周年忌日在哪举行?', 'errors': [['祭', '忌', 3, 1.0]]}
################################################################################################################################
default correction (confusion dict, add-only):
{'index': 0, 'source': '为什么乐而往返?', 'target': '为什么乐而忘返?', 'errors': [['往', '忘', 5, 1.0]]}
{'index': 1, 'source': '没有金钢钻就不揽瓷活!', 'target': '没有金刚钻就不揽瓷活!', 'errors': [['钢', '刚', 3, 1.0]]}
{'index': 2, 'source': '你喜欢藤罗蔓吗?', 'target': '你喜欢藤萝蔓吗?', 'errors': [['罗', '萝', 4, 1.0]]}
{'index': 3, 'source': '三周年祭日在哪举行?', 'target': '三周年忌日在哪举行?', 'errors': [['祭', '忌', 3, 1.0]]}
################################################################################################################################
default correction (confusion dict, full override):
{'index': 0, 'source': '为什么乐而往返?', 'target': '为什么乐而忘返?', 'errors': [['往', '忘', 5, 1.0]]}
{'index': 1, 'source': '没有金钢钻就不揽瓷活!', 'target': '没有金刚钻就不揽瓷活!', 'errors': [['钢', '刚', 3, 1.0]]}
{'index': 2, 'source': '你喜欢藤罗蔓吗?', 'target': '你喜欢藤萝蔓吗?', 'errors': [['罗', '萝', 4, 1.0]]}
{'index': 3, 'source': '三周年祭日在哪举行?', 'target': '三周年祭日在哪举行?', 'errors': []}
################################################################################################################################
default correction (confusion dict file, add-only):
{'index': 0, 'source': '为什么乐而往返?', 'target': '为什么乐而忘返?', 'errors': [['往', '忘', 5, 1.0]]}
{'index': 1, 'source': '没有金钢钻就不揽瓷活!', 'target': '没有金刚钻就不揽瓷活!', 'errors': [['钢', '刚', 3, 1.0]]}
{'index': 2, 'source': '你喜欢藤罗蔓吗?', 'target': '你喜欢藤萝蔓吗?', 'errors': [['罗', '萝', 4, 1.0]]}
{'index': 3, 'source': '三周年祭日在哪举行?', 'target': '三周年忌日在哪举行?', 'errors': [['祭', '忌', 3, 1.0]]}
################################################################################################################################
default correction (confusion dict file, full override):
{'index': 0, 'source': '为什么乐而往返?', 'target': '为什么乐而忘返?', 'errors': [['往', '忘', 5, 1.0]]}
{'index': 1, 'source': '没有金钢钻就不揽瓷活!', 'target': '没有金刚钻就不揽瓷活!', 'errors': [['钢', '刚', 3, 1.0]]}
{'index': 2, 'source': '你喜欢藤罗蔓吗?', 'target': '你喜欢藤萝蔓吗?', 'errors': [['罗', '萝', 4, 1.0]]}
{'index': 3, 'source': '三周年祭日在哪举行?', 'target': '三周年祭日在哪举行?', 'errors': []}
################################################################################################################################
"""
```
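The user dictionary file referenced by `path_user` is plain JSON mapping each confusable phrase to its correction, as described above. A minimal sketch for producing it (file name as in the example):

```python
import json

# Write the {wrong phrase: correct phrase} mapping that
# ConfusionCorrect(path_user=...) expects; ensure_ascii=False keeps
# the Chinese keys human-readable in the file.
user_dict = {
    "乐而往返": "乐而忘返",
    "金钢钻": "金刚钻",
    "藤罗蔓": "藤萝蔓",
}
with open("./user_confusion_dict.json", "w", encoding="utf-8") as f:
    json.dump(user_dict, f, ensure_ascii=False, indent=4)
```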
# Details
## CSC call (hyperparameters)
```python
import os
os.environ["MACRO_CORRECT_FLAG_CSC_TOKEN"] = "1"
from macro_correct import correct
### default correction (list input)
text_list = ["真麻烦你了。希望你们好好的跳无",
             "少先队员因该为老人让坐",
             "机七学习是人工智能领遇最能体现智能的一个分知",
             "一只小鱼船浮在平净的河面上"
             ]
### default correction (list input, custom parameters)
params = {
    "threshold": 0.55,  # token-level probability threshold for filtering corrections
    "batch_size": 32,  # batch size
    "max_len": 128,  # custom max length; text cut off by truncation is excluded from correction and copied back unchanged afterwards
    "rounded": 4,  # round probabilities to 4 decimal places
    "flag_confusion": True,  # whether to use the default confusion dictionary
    "flag_prob": True,  # whether to return the probability at each corrected token
}
text_csc = correct(text_list, **params)
print("default correction (list input, custom parameters):")
for res_i in text_csc:
    print(res_i)
print("#" * 128)


"""
default correction (list input):
{'index': 0, 'source': '真麻烦你了。希望你们好好的跳无', 'target': '真麻烦你了。希望你们好好地跳舞', 'errors': [['的', '地', 12, 0.6584], ['无', '舞', 14, 1.0]]}
{'index': 1, 'source': '少先队员因该为老人让坐', 'target': '少先队员应该为老人让坐', 'errors': [['因', '应', 4, 0.995]]}
{'index': 2, 'source': '机七学习是人工智能领遇最能体现智能的一个分知', 'target': '机器学习是人工智能领域最能体现智能的一个分支', 'errors': [['七', '器', 1, 0.9998], ['遇', '域', 10, 0.9999], ['知', '支', 21, 1.0]]}
{'index': 3, 'source': '一只小鱼船浮在平净的河面上', 'target': '一只小鱼船浮在平静的河面上', 'errors': [['净', '静', 8, 0.9961]]}
"""
```
## PUNCT call (hyperparameters)
```python
import os
os.environ["MACRO_CORRECT_FLAG_CSC_PUNCT"] = "1"
from macro_correct import correct_punct


### 1. default punctuation correction (list input)
text_list = ["山不在高有仙则名。",
             "水不在深,有龙则灵",
             "斯是陋室惟吾德馨",
             "苔痕上阶绿草,色入帘青。"
             ]
### 2. default punctuation correction (list input, custom parameters)
params = {
    "limit_num_errors": 4,  # max corrections per sentence; sentences with more are dropped
    "limit_len_char": 4,  # minimum number of characters per sentence
    "threshold_zh": 0.5,  # sentence-level threshold: minimum share of Chinese characters
    "threshold": 0.55,  # token-level probability threshold for filtering corrections
    "batch_size": 32,  # batch size
    "max_len": 128,  # custom max length; text cut off by truncation is excluded from correction and copied back unchanged afterwards
    "rounded": 4,  # round probabilities to 4 decimal places
    "flag_prob": True,  # whether to return the probability at each corrected token
}
text_csc = correct_punct(text_list, **params)
print("default punctuation correction (list input):")
for res_i in text_csc:
    print(res_i)
print("#" * 128)

"""
default punctuation correction (list input):
{'index': 0, 'source': '山不在高有仙则名。', 'target': '山不在高,有仙则名。', 'score': 0.9917, 'errors': [['', ',', 4, 0.9917]]}
{'index': 1, 'source': '水不在深,有龙则灵', 'target': '水不在深,有龙则灵。', 'score': 0.9995, 'errors': [['', '。', 9, 0.9995]]}
{'index': 2, 'source': '斯是陋室惟吾德馨', 'target': '斯是陋室,惟吾德馨。', 'score': 0.9999, 'errors': [['', ',', 4, 0.9999], ['', '。', 8, 0.9998]]}
{'index': 3, 'source': '苔痕上阶绿草,色入帘青。', 'target': '苔痕上阶绿,草色入帘青。', 'score': 0.9998, 'errors': [['', ',', 5, 0.9998]]}
"""
```
# Training
## CSC task
### Directories
* macbert4mdcspell: macro_correct/pytorch_user_models/csc/macbert4mdcspell/train_yield.py
* macbert4csc: macro_correct/pytorch_user_models/csc/macbert4csc/train_yield.py
* relm: macro_correct/pytorch_user_models/csc/relm/train_yield.py
### Data preparation
* espell: a JSON file holding a list<dict>; only the "original_text" and "correct_text" fields are required; see macro_correct/corpus/text_correction/espell (a minimal converter between the two layouts is sketched after the sighan example below)
```
[
    {
        "original_text": "遇到逆竟时,我们必须勇于面对,而且要愈挫愈勇,这样我们才能朝著成功之路前进。",
        "correct_text": "遇到逆境时,我们必须勇于面对,而且要愈挫愈勇,这样我们才能朝著成功之路前进。"
    }
]
```
* sighan: a JSON file holding a list<dict>; only the "source" and "target" fields are required; see macro_correct/corpus/text_correction/sighan
```
[
    {
        "source": "若被告人正在劳动教养,则可以通过劳动教养单位转交",
        "target": "若被告人正在劳动教养,则可以通过劳动教养单位转交"
    }
]
```
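The two schemas differ only in field names, so switching a corpus from one layout to the other is a one-liner per record. A minimal sketch, assuming only the two JSON layouts shown above (file names are placeholders):

```python
import json

# Convert an espell-style file (original_text/correct_text) into
# sighan-style records (source/target). Paths are placeholders.
with open("espell.train.json", "r", encoding="utf-8") as f:
    data_espell = json.load(f)

data_sighan = [{"source": d["original_text"], "target": d["correct_text"]}
               for d in data_espell]

with open("sighan_style.train.json", "w", encoding="utf-8") as f:
    json.dump(data_sighan, f, ensure_ascii=False, indent=4)
```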
### Configure, train, evaluate, predict
#### Configuration
Set the data paths and hyperparameters; see macro_correct/pytorch_user_models/csc/macbert4mdcspell/config.py
#### Train, evaluate, predict
```
# train
nohup python train_yield.py > tc.train_yield.py.log 2>&1 &
tail -n 1000 -f tc.train_yield.py.log
# evaluate
python eval_std.py
# predict
python predict.py
```

## PUNCT task
### Directories
* PUNCT: macro_correct/pytorch_sequencelabeling/slRun.py
### Data preparation
* SPAN format: an NER-style task; the span format (jsonl) is the default, see the chinese_symbol.dev.span file in macro_correct/corpus/sequence_labeling/chinese_symbol
```
{'label': [{'type': '0', 'ent': '下', 'pos': [7, 7]}, {'type': '1', 'ent': '林', 'pos': [14, 14]}], 'text': '#桂林山水甲天下阳朔山水甲桂林'}
{'label': [{'type': '11', 'ent': 'o', 'pos': [5, 5]}, {'type': '0', 'ent': 't', 'pos': [12, 12]}, {'type': '1', 'ent': '包', 'pos': [19, 19]}], 'text': '#macrocorrect文本纠错工具包'}
```
* CONLL format: generate the SPAN format first, then convert it with macro_correct/tet/corpus/pos_to_conll.py (a hedged conversion sketch follows the example below)
```
神 O
秘 O
宝 O
藏 B-1
在 O
旅 O
途 O
中 B-0
他 O
```
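For reference, the mapping itself is mechanical: every character gets tag O unless a labeled span starts at its position, in which case it gets B-{type}. A minimal sketch under that assumption (an illustration of the mapping only, not the pos_to_conll.py script):

```python
def span_to_conll(record):
    """Map one SPAN record to CONLL lines: O everywhere, B-{type} where
    a labeled span begins. Illustrative only; the real converter is
    macro_correct/tet/corpus/pos_to_conll.py."""
    tags = ["O"] * len(record["text"])
    for ent in record["label"]:
        start = ent["pos"][0]
        tags[start] = "B-" + ent["type"]
    return "\n".join(f"{ch} {tag}" for ch, tag in zip(record["text"], tags))

record = {"label": [{"type": "0", "ent": "下", "pos": [7, 7]},
                    {"type": "1", "ent": "林", "pos": [14, 14]}],
          "text": "#桂林山水甲天下阳朔山水甲桂林"}
print(span_to_conll(record))
```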
### Configure, train, evaluate, predict
#### Configuration
Set the data paths and hyperparameters; see macro_correct/pytorch_user_models/csc/macbert4mdcspell/config.py
#### Train, evaluate, predict
```
# train
nohup python train_yield.py > tc.train_yield.py.log 2>&1 &
tail -n 1000 -f tc.train_yield.py.log
# evaluate
python eval_std.py
# predict
python predict.py
```


# Evaluation
## Notes
* All training data comes from the public web or open-source datasets; there are roughly 10 million training samples, and the confusion dictionary is fairly large;
* All test data comes from the public web or open-source datasets; the evaluation data is published at [Macropodus/csc_eval_public](https://huggingface.co/datasets/Macropodus/csc_eval_public);
* The main evaluation script is [tcEval.py](https://github.com/yongzhuo/macro-correct/macro_correct/pytorch_textcorrection/tcEval.py); the evaluation code for qwen25_1-5b_pycorrector is in the [eval](https://github.com/yongzhuo/macro-correct/tet/eval) directory;
* Metrics: false-correction rate (over-correction, i.e. wrongly "correcting" high-quality correct sentences); sentence-level loose accuracy/precision/recall/F1 (as in [shibing624/pycorrector](https://github.com/shibing624/pycorrector)); sentence-level strict accuracy/precision/recall/F1 (as in [wangwang110/CSC](https://github.com/wangwang110/CSC)); character-level accuracy/precision/recall/F1 (misspelled characters);
* qwen25_1-5b_pycorrector weights: [shibing624/chinese-text-correction-1.5b](https://huggingface.co/shibing624/chinese-text-correction-1.5b)
* macbert4csc_pycorrector weights: [shibing624/macbert4csc-base-chinese](https://huggingface.co/shibing624/macbert4csc-base-chinese);
* macbert4mdcspell_v1 weights: [Macropodus/macbert4mdcspell_v1](https://huggingface.co/Macropodus/macbert4mdcspell_v1);
* macbert4mdcspell_v2 weights: [Macropodus/macbert4mdcspell_v2](https://huggingface.co/Macropodus/macbert4mdcspell_v2);
* macbert4csc_v2 weights: [Macropodus/macbert4csc_v2](https://huggingface.co/Macropodus/macbert4csc_v2);
* macbert4csc_v1 weights: [Macropodus/macbert4csc_v1](https://huggingface.co/Macropodus/macbert4csc_v1);
* bert4csc_v1 weights: [Macropodus/bert4csc_v1](https://huggingface.co/Macropodus/bert4csc_v1);

## 3.1 Evaluation data
```
1.gen_de3.json(5545): "的地得" corrections, generated from high-quality corpora such as People's Daily, xuexiqiangguo, and chinese-poetry;
2.lemon_v2.tet.json(1053): the multi-domain spelling-correction dataset (7 domains) proposed in the ReLM paper; covers game (GAM), encyclopedia (ENC), contract (COT), medical care (MEC), car (CAR), novel (NOV), and news (NEW);
3.acc_rmrb.tet.json(4636): from NER-199801 (high-quality People's Daily corpus);
4.acc_xxqg.tet.json(5000): high-quality corpus from the xuexiqiangguo website;
5.gen_passage.tet.json(10000): source data are polished sentences generated by qwen, corrupted with a confusion dictionary aggregated from almost all open-source data;
6.textproof.tet.json(1447): NLP competition data, TextProofreadingCompetition;
7.gen_xxqg.tet.json(5000): source data are high-quality sentences from the xuexiqiangguo website, corrupted with the same aggregated confusion dictionary;
8.faspell.dev.json(1000): video subtitles obtained through OCR; from iQIYI's FASPell paper;
9.lomo_tet.json(5000): mainly phonologically similar spelling errors; from Tencent; the human-annotated CSCD-NS dataset;
10.mcsc_tet.5000.json(5000): medical spelling correction; real historical logs from Tencent's Yidian medical app; note that per the paper this dataset only covers corrections of medical entities, not of common characters;
11.ecspell.dev.json(1500): from the ECSpell paper, covering three domains (law/med/gov);
12.sighan2013.dev.json(1000): from the SIGHAN-13 shared task;
13.sighan2014.dev.json(1062): from the SIGHAN-14 shared task;
14.sighan2015.dev.json(1100): from the SIGHAN-15 shared task;
```

## 3.2 Evaluation notes
```
1.Preprocessing: all evaluation data goes through full-width-to-half-width conversion, traditional-to-simplified conversion, and punctuation normalization;
2.Metrics whose names contain "common" are the loose metrics, identical to those of the open-source pycorrector project;
3.Metrics whose names contain "strict" are the strict metrics, identical to those of the open-source wangwang110/CSC project;
4.The macbert4mdcspell_v1/v2 models are trained with the MDCSpell architecture plus BERT's MLM loss, but only the BERT-MLM branch is used at inference;
5.The acc_rmrb/acc_xxqg datasets contain no errors and are used to measure the false-correction rate (over-correction);
6.qwen25_1-5b_pycorrector is shibing624/chinese-text-correction-1.5b, whose training data includes the dev and test sets of lemon_v2/mcsc_tet/ecspell; the other BERT-style models were trained without any dev or test sets;
```
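To make the loose sentence-level metric concrete: a sentence counts as a true positive when it contained errors and the model's output exactly matches the gold target. A minimal sketch of that reading (the exact definitions follow the cited pycorrector and wangwang110/CSC repos, so treat this only as an illustration):

```python
def sentence_level_f1(sources, targets, predictions):
    """Loose sentence-level correction P/R/F1: positives are sentences
    with errors (source != target); a prediction is correct when it
    exactly equals the gold target. Illustrative reading only."""
    tp = fp = fn = 0
    for src, tgt, pred in zip(sources, targets, predictions):
        if src != tgt:              # sentence really has errors
            if pred == tgt:
                tp += 1
            else:
                fn += 1
        elif pred != src:           # over-correction of a clean sentence
            fp += 1
    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    return precision, recall, f1

print(sentence_level_f1(["少先队员因该为老人让坐"],
                        ["少先队员应该为老人让坐"],
                        ["少先队员应该为老人让坐"]))  # (1.0, 1.0, 1.0)
```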
## 3.3 Evaluation results
### 3.3.1 F1 (common_cor_f1)
| model/common_cor_f1 | avg | gen_de3 | lemon_v2 | gen_passage | text_proof | gen_xxqg | faspell | lomo_tet | mcsc_tet | ecspell | sighan2013 | sighan2014 | sighan2015 |
|:---|:---|:---|:---|:---|:---|:---|:---|:---|:---|:---|:---|:---|
| macbert4csc_pycorrector | 45.8 | 42.44 | 42.89 | 31.49 | 46.31 | 26.06 | 32.7 | 44.83 | 27.93 | 55.51 | 70.89 | 61.72 | 66.81 |
| qwen25_1-5b_pycorrector | 45.11 | 27.29 | 89.48 | 14.61 | 83.9 | 13.84 | 18.2 | 36.71 | 96.29 | 88.2 | 36.41 | 15.64 | 20.73 |
| bert4csc_v1 | 62.28 | 93.73 | 61.99 | 44.79 | 68.0 | 35.03 | 48.28 | 61.8 | 64.41 | 79.11 | 77.66 | 51.01 | 61.54 |
| macbert4csc_v1 | 68.55 | 96.67 | 65.63 | 48.4 | 75.65 | 38.43 | 51.76 | 70.11 | 80.63 | 85.55 | 81.38 | 57.63 | 70.7 |
| macbert4csc_v2 | 68.6 | 96.74 | 66.02 | 48.26 | 75.78 | 38.84 | 51.91 | 70.17 | 80.71 | 85.61 | 80.97 | 58.22 | 69.95 |
| macbert4mdcspell_v1 | 71.1 | 96.42 | 70.06 | 52.55 | 79.61 | 43.37 | 53.85 | 70.9 | 82.38 | 87.46 | 84.2 | 61.08 | 71.32 |
| macbert4mdcspell_v2 | 71.23 | 96.42 | 65.8 | 52.35 | 75.94 | 43.5 | 53.82 | 72.66 | 82.28 | 88.69 | 82.51 | 65.59 | 75.26 |

### 3.3.2 acc (common_cor_acc)
| model/common_cor_acc | avg | gen_de3 | lemon_v2 | gen_passage | text_proof | gen_xxqg | faspell | lomo_tet | mcsc_tet | ecspell | sighan2013 | sighan2014 | sighan2015 |
|:---|:---|:---|:---|:---|:---|:---|:---|:---|:---|:---|:---|:---|
| macbert4csc_pycorrector | 48.26 | 26.96 | 28.68 | 34.16 | 55.29 | 28.38 | 22.2 | 60.96 | 57.16 | 67.73 | 55.9 | 68.93 | 72.73 |
| qwen25_1-5b_pycorrector | 46.09 | 15.82 | 81.29 | 22.96 | 82.17 | 19.04 | 12.8 | 50.2 | 96.4 | 89.13 | 22.8 | 27.87 | 32.55 |
| bert4csc_v1 | 60.76 | 88.21 | 45.96 | 43.13 | 68.97 | 35.0 | 34.0 | 65.86 | 73.26 | 81.8 | 64.5 | 61.11 | 67.27 |
| macbert4csc_v1 | 65.34 | 93.56 | 49.76 | 44.98 | 74.64 | 36.1 | 37.0 | 73.0 | 83.6 | 86.87 | 69.2 | 62.62 | 72.73 |
| macbert4csc_v2 | 65.22 | 93.69 | 50.14 | 44.92 | 74.64 | 36.26 | 37.0 | 72.72 | 83.66 | 86.93 | 68.5 | 62.43 | 71.73 |
| macbert4mdcspell_v1 | 67.15 | 93.09 | 54.8 | 47.71 | 78.09 | 39.52 | 38.8 | 71.92 | 84.78 | 88.27 | 73.2 | 63.28 | 72.36 |
| macbert4mdcspell_v2 | 68.31 | 93.09 | 50.05 | 48.72 | 75.74 | 40.52 | 38.9 | 76.9 | 84.8 | 89.73 | 71.0 | 71.94 | 78.36 |

### 3.3.3 acc (acc_true, thr=0.75)
| model/acc | avg | acc_rmrb | acc_xxqg |
|:---|:---|:---|:---|
| macbert4csc_pycorrector | 99.24 | 99.22 | 99.26 |
| qwen25_1-5b_pycorrector | 82.0 | 77.14 | 86.86 |
| bert4csc_v1 | 98.71 | 98.36 | 99.06 |
| macbert4csc_v1 | 97.72 | 96.72 | 98.72 |
| macbert4csc_v2 | 97.89 | 96.98 | 98.8 |
| macbert4mdcspell_v1 | 97.75 | 96.51 | 98.98 |
| macbert4mdcspell_v2 | 99.54 | 99.22 | 99.86 |


### 3.3.4 Conclusion
```
1.Models such as macbert4csc_v1/macbert4csc_v2/macbert4mdcspell_v1 are trained on data from many domains, so they are well balanced and also suitable as first-stage pretrained models for further fine-tuning on domain-specific data;
2.Comparing macbert4csc_pycorrector/bert4csc_v1/macbert4csc_v2/macbert4mdcspell_v1 against Table 3.3.3, more training data raises accuracy but also slightly raises the false-correction rate;
3.MFT (Mask-Correct) still helps, though the gain is small when training data is plentiful; it may also be an important cause of the higher false-correction rate;
4.The training data includes classical Chinese, so the trained models also support correcting classical Chinese;
5.The trained models achieve high detection and correction rates on frequent errors such as "地得的";
6.For macbert4mdcspell_v2, MFT applies no-error masking (rate 0.15) only 70% of the time; 15% of the time the input is the target itself (target-to-target), and 15% of the time no masking is applied;
```
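Item 6 describes a per-sample mixture of masking strategies. A minimal sketch of how such a schedule could be drawn (the rates come from the note above; everything else, including the function name, is a hypothetical illustration, not the project's training code):

```python
import random
from collections import Counter

def pick_mft_strategy(rng=random):
    """Draw one masking strategy with the mix stated above: 70% no-error
    masking (mask rate 0.15), 15% target-to-target, 15% no masking.
    Hypothetical illustration only."""
    r = rng.random()
    if r < 0.70:
        return ("mask_no_error_tokens", 0.15)  # MFT: mask correct tokens
    elif r < 0.85:
        return ("target_to_target", 0.0)       # train on (target, target)
    return ("no_mask", 0.0)

# Rough sanity check of the proportions.
print(Counter(pick_mft_strategy()[0] for _ in range(10000)))
```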
# Changelog
```
1. v20240129, csc_punct module completed;
2. v20241001, csc_token module completed;
3. v20250117, csc_eval module completed;
4. v20250501, macbert4mdcspell_v2 completed;
```


# References
This library is inspired by and references the following frameworks and papers.

* Chinese-text-correction-papers: [nghuyong/Chinese-text-correction-papers](https://github.com/nghuyong/Chinese-text-correction-papers)
* pycorrector: [shibing624/pycorrector](https://github.com/shibing624/pycorrector)
* CTCResources: [destwang/CTCResources](https://github.com/destwang/CTCResources)
* CSC: [wangwang110/CSC](https://github.com/wangwang110/CSC)
* char-similar: [yongzhuo/char-similar](https://github.com/yongzhuo/char-similar)
* MDCSpell: [iioSnail/MDCSpell_pytorch](https://github.com/iioSnail/MDCSpell_pytorch)
* CSCD-NS: [nghuyong/cscd-ns](https://github.com/nghuyong/cscd-ns)
* lemon: [gingasan/lemon](https://github.com/gingasan/lemon)
* ReLM: [Claude-Liu/ReLM](https://github.com/Claude-Liu/ReLM)


# Papers
## Chinese Spelling Correction (CSC)
* 34 papers are collected, together with a short survey; see [README.csc_survey.md](https://github.com/yongzhuo/macro-correct/blob/master/README.csc_survey.md)


# Cite
For citing this work, you can refer to the present GitHub project. For example, with BibTeX:
```
@software{macro-correct,
  url = {https://github.com/yongzhuo/macro-correct},
  author = {Yongzhuo Mo},
  title = {macro-correct},
  year = {2025}
}
```
config.json
ADDED
@@ -0,0 +1,30 @@
{
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "gradient_checkpointing": true,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4.30.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 21128
}
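Since config.json declares a plain `BertForMaskedLM`, the checkpoint in this repository loads with the standard transformers API, by hub id or from a local clone. A short sketch (the hub id below is one of the checkpoints listed in the README; swap in this repo's id or a local path):

```python
import torch
from transformers import BertForMaskedLM, BertTokenizer

# Works with a hub id or a local clone of the repo (placeholder path).
path = "Macropodus/macbert4mdcspell_v2"  # or "./macbert4mdcspell_v2"
tokenizer = BertTokenizer.from_pretrained(path)
model = BertForMaskedLM.from_pretrained(path)
model.eval()

# One forward pass: argmax over the MLM logits yields the corrected text.
inputs = tokenizer("少先队员因该为老人让坐", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits[0][1:-1]  # drop [CLS]/[SEP] positions
print(tokenizer.decode(torch.argmax(logits, dim=-1)).replace(" ", ""))
```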
csc.config
ADDED
@@ -0,0 +1,81 @@
{
  "CUDA_VISIBLE_DEVICES": "0",
  "USE_TORCH": "1",
  "output_hidden_states": null,
  "pretrained_model_name_or_path": "hfl/chinese-macbert-base",
  "model_save_path": "../output/text_correction/model_public_csc",
  "config_name": "csc.config",
  "model_name": "pytorch_model.bin",
  "path_train": "csc_public.train.json",
  "path_dev": "csc_public.dev.json",
  "path_tet": "csc_public.tet.json",
  "scheduler_name": "cosine",
  "tokenizer_type": "CHAR",
  "padding_side": "RIGHT",
  "active_type": "RELU",
  "task_type": "CSC",
  "model_type": "BERT",
  "loss_type": "BCE",
  "loss_det_rate": 0.3,
  "max_len_limit": 512,
  "batch_size": 32,
  "num_labels": 0,
  "max_len": 128,
  "epochs": 3,
  "lr": 3e-05,
  "grad_accum_steps": 4,
  "max_grad_norm": 1.0,
  "weight_decay": 0.01,
  "dropout_rate": 0.1,
  "adam_eps": 1e-08,
  "seed": 42,
  "evaluate_steps": 1000,
  "warmup_steps": 0.1,
  "ignore_index": 0,
  "save_steps": 1000,
  "stop_epochs": 4,
  "num_workers": 0,
  "max_steps": -1,
  "flag_save_model_state": true,
  "flag_dynamic_encode": false,
  "flag_tokenizer_char": true,
  "flag_soft_label": true,
  "flag_save_best": true,
  "flag_dropout": false,
  "flag_shuffle": true,
  "flag_active": false,
  "flag_train": false,
  "flag_cuda": true,
  "flag_mft": true,
  "flag_adv": false,
  "xy_keys_predict": [
    "original_text",
    "correct_text",
    "wrong_ids"
  ],
  "keys": [
    "original_text",
    "correct_text",
    "wrong_ids"
  ],
  "save_best_mertics_key": [
    "sentence",
    "strict_cor_f1"
  ],
  "label_sep": "|myz|",
  "multi_label_threshold": 0.5,
  "len_rate": 1,
  "adv_emb_name": "word_embeddings.",
  "adv_eps": 1.0,
  "additional_special_tokens": [],
  "len_corpus": null,
  "prior_count": null,
  "prior": null,
  "l2i": null,
  "i2l": null,
  "xy_keys": [
    "original_text",
    "correct_text",
    "wrong_ids"
  ]
}
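csc.config is itself plain JSON recording the training setup (base model hfl/chinese-macbert-base, lr 3e-05, batch_size 32 with grad_accum_steps 4, MFT enabled, and so on). A small sketch for inspecting it:

```python
import json

# Load the training configuration shipped with the checkpoint.
with open("csc.config", "r", encoding="utf-8") as f:
    csc_config = json.load(f)

# A few of the hyperparameters recorded at training time.
for key in ("pretrained_model_name_or_path", "lr", "batch_size",
            "grad_accum_steps", "epochs", "max_len", "flag_mft"):
    print(f"{key}: {csc_config[key]}")
```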
pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:874d6fecdaec2596a29177f34c701ddb98bb9650a165a33d69194036b5350a43
size 530972393
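The file above is a Git LFS pointer: its three lines record the real payload's SHA-256 and byte size rather than the weights themselves. After downloading the actual pytorch_model.bin you can check it against the pointer; a small sketch:

```python
import hashlib
import os

# Verify a downloaded pytorch_model.bin against the LFS pointer above.
expected_oid = "874d6fecdaec2596a29177f34c701ddb98bb9650a165a33d69194036b5350a43"
expected_size = 530972393

path = "pytorch_model.bin"
sha256 = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
        sha256.update(chunk)

assert os.path.getsize(path) == expected_size, "size mismatch"
assert sha256.hexdigest() == expected_oid, "sha256 mismatch"
print("pytorch_model.bin matches the LFS pointer")
```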
special_tokens_map.json
ADDED
@@ -0,0 +1,7 @@
{
  "cls_token": "[CLS]",
  "mask_token": "[MASK]",
  "pad_token": "[PAD]",
  "sep_token": "[SEP]",
  "unk_token": "[UNK]"
}
tokenizer.json
ADDED
The diff for this file is too large to render. See raw diff
tokenizer_config.json
ADDED
@@ -0,0 +1,13 @@
{
  "clean_up_tokenization_spaces": true,
  "cls_token": "[CLS]",
  "do_lower_case": true,
  "mask_token": "[MASK]",
  "model_max_length": 1000000000000000019884624838656,
  "pad_token": "[PAD]",
  "sep_token": "[SEP]",
  "strip_accents": null,
  "tokenize_chinese_chars": true,
  "tokenizer_class": "BertTokenizer",
  "unk_token": "[UNK]"
}
vocab.txt
ADDED
The diff for this file is too large to render. See raw diff