divisor.acestep.language_segmentation.utils.num
Rules to verbalize numbers into Chinese characters. https://zh.wikipedia.org/wiki/中文数字#現代中文
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Digital processing from GPT_SoVITS num.py (thanks)
"""
Rules to verbalize numbers into Chinese characters.
https://zh.wikipedia.org/wiki/中文数字#現代中文
"""

import re
from collections import OrderedDict
from typing import List

# Arabic digit character -> Chinese digit character.
DIGITS = {str(i): tran for i, tran in enumerate("零一二三四五六七八九")}
# Power of ten -> Chinese unit character. Ordered ascending; `_get_value`
# walks it in reverse to find the largest unit that fits.
UNITS = OrderedDict(
    {
        1: "十",
        2: "百",
        3: "千",
        4: "万",
        8: "亿",
    }
)

# Alternation of measure words that may follow a number.
COM_QUANTIFIERS = "(处|台|架|枚|趟|幅|平|方|堵|间|床|株|批|项|例|列|篇|栋|注|亩|封|艘|把|目|套|段|人|所|朵|匹|张|座|回|场|尾|条|个|首|阙|阵|网|炮|顶|丘|棵|只|支|袭|辆|挑|担|颗|壳|窠|曲|墙|群|腔|砣|座|客|贯|扎|捆|刀|令|打|手|罗|坡|山|岭|江|溪|钟|队|单|双|对|出|口|头|脚|板|跳|枝|件|贴|针|线|管|名|位|身|堂|课|本|页|家|户|层|丝|毫|厘|分|钱|两|斤|担|铢|石|钧|锱|忽|(千|毫|微)克|毫|厘|(公)分|分|寸|尺|丈|里|寻|常|铺|程|(千|分|厘|毫|微)米|米|撮|勺|合|升|斗|石|盘|碗|碟|叠|桶|笼|盆|盒|杯|钟|斛|锅|簋|篮|盘|桶|罐|瓶|壶|卮|盏|箩|箱|煲|啖|袋|钵|年|月|日|季|刻|时|周|天|秒|分|小时|旬|纪|岁|世|更|夜|春|夏|秋|冬|代|伏|辈|丸|泡|粒|颗|幢|堆|条|根|支|道|面|片|张|颗|块|元|(亿|千万|百万|万|千|百)|(亿|千万|百万|万|千|百|美|)元|(亿|千万|百万|万|千|百|十|)吨|(亿|千万|百万|万|千|百|)块|角|毛|分)"

# Fraction expressions, e.g. "-1/2".
RE_FRAC = re.compile(r"(-?)(\d+)/(\d+)")


def replace_frac(match) -> str:
    """Verbalize a fraction matched by ``RE_FRAC``.

    Args:
        match (re.Match): group 1 is the optional minus sign, group 2 the
            numerator and group 3 the denominator.
    Returns:
        str: ``[负]<denominator>分之<numerator>``.
    """
    prefix = "负" if match.group(1) else ""
    numerator = num2str(match.group(2))
    denominator = num2str(match.group(3))
    return f"{prefix}{denominator}分之{numerator}"


# Percentage expressions, e.g. "12.5%".
RE_PERCENTAGE = re.compile(r"(-?)(\d+(\.\d+)?)%")


def replace_percentage(match) -> str:
    """Verbalize a percentage matched by ``RE_PERCENTAGE``.

    Args:
        match (re.Match): group 1 is the optional minus sign, group 2 the
            number in front of ``%``.
    Returns:
        str: ``[负]百分之<number>``.
    """
    prefix = "负" if match.group(1) else ""
    percent = num2str(match.group(2))
    return f"{prefix}百分之{percent}"


# Integer expressions
# Integers with a minus sign, e.g. -10
RE_INTEGER = re.compile(r"(-)(\d+)")


def replace_negative_num(match) -> str:
    """Verbalize a negative integer matched by ``RE_INTEGER``.

    Args:
        match (re.Match): group 1 is the minus sign, group 2 the digits.
    Returns:
        str: ``负`` followed by the verbalized number.
    """
    prefix = "负" if match.group(1) else ""
    number = num2str(match.group(2))
    return f"{prefix}{number}"


# Serial numbers - unsigned digit runs of three or more digits
# e.g. 00078
RE_DEFAULT_NUM = re.compile(r"\d{3}\d*")


def replace_default_num(match) -> str:
    """Read a digit run matched by ``RE_DEFAULT_NUM`` digit by digit.

    Args:
        match (re.Match): the whole match is the digit run.
    Returns:
        str: one Chinese digit per input digit, with 一 spoken as 幺.
    """
    return verbalize_digit(match.group(0), alt_one=True)


# Addition, subtraction, multiplication, division and equality.
# Each operand is a number, a leading-dot decimal or a single Latin letter,
# optionally followed by superscript exponent characters.
RE_ASMD = re.compile(
    r"((-?)((\d+)(\.\d+)?[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*)|(\.\d+[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*)|([A-Za-z][⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*))([\+\-\×÷=])((-?)((\d+)(\.\d+)?[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*)|(\.\d+[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*)|([A-Za-z][⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*))"
)

asmd_map = {"+": "加", "-": "减", "×": "乘", "÷": "除", "=": "等于"}


def replace_asmd(match) -> str:
    """Replace the operator of an ``RE_ASMD`` match with its Chinese word.

    Args:
        match (re.Match): group 1 is the left operand, group 8 the operator
            and group 9 the right operand.
    Returns:
        str: both operands unchanged, joined by the spoken operator.
    """
    return match.group(1) + asmd_map[match.group(8)] + match.group(9)


# Powers written with superscript characters.
RE_POWER = re.compile(r"[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]+")

power_map = {
    "⁰": "0",
    "¹": "1",
    "²": "2",
    "³": "3",
    "⁴": "4",
    "⁵": "5",
    "⁶": "6",
    "⁷": "7",
    "⁸": "8",
    "⁹": "9",
    "ˣ": "x",
    "ʸ": "y",
    "ⁿ": "n",
}


def replace_power(match) -> str:
    """Rewrite a run of superscript characters as ``的<exponent>次方``.

    Args:
        match (re.Match): the whole match is the superscript run.
    Returns:
        str: the exponent mapped through ``power_map`` and wrapped in
            ``的…次方``.
    """
    exponent = "".join(power_map[char] for char in match.group(0))
    return "的" + exponent + "次方"


# Number expressions
# Pure decimals
RE_DECIMAL_NUM = re.compile(r"(-?)((\d+)(\.\d+))|(\.(\d+))")
# Positive integer + quantifier
RE_POSITIVE_QUANTIFIERS = re.compile(r"(\d+)([多余几\+])?" + COM_QUANTIFIERS)
RE_NUMBER = re.compile(r"(-?)((\d+)(\.\d+)?)|(\.(\d+))")


def replace_positive_quantifier(match) -> str:
    """Verbalize the number of an ``RE_POSITIVE_QUANTIFIERS`` match.

    Args:
        match (re.Match): group 1 is the digits, group 2 the optional
            approximation marker and group 3 the quantifier.
    Returns:
        str: verbalized number, marker (``+`` becomes ``多``) and quantifier.
    """
    number = num2str(match.group(1))
    marker = match.group(2)
    if marker == "+":
        marker = "多"
    marker = marker if marker else ""
    quantifier = match.group(3)
    return f"{number}{marker}{quantifier}"


def replace_number(match) -> str:
    """Verbalize a number matched by ``RE_NUMBER``.

    Args:
        match (re.Match): group 1 is the optional minus sign and group 2 the
            number; group 5 is set instead for a leading-dot decimal.
    Returns:
        str: the verbalized number, prefixed with ``负`` when negative.
    """
    pure_decimal = match.group(5)
    if pure_decimal:
        return num2str(pure_decimal)
    prefix = "负" if match.group(1) else ""
    number = num2str(match.group(2))
    return f"{prefix}{number}"


# Range expressions
# Groups 1 and 6 (start and end of the range) each have the shape of the
# first alternative of RE_NUMBER.
RE_RANGE = re.compile(
    r"""
    (?<![\d\+\-\×÷=])      # 使用反向前瞻以确保数字范围之前没有其他数字和操作符
    ((-?)((\d+)(\.\d+)?))  # 匹配范围起始的负数或正数(整数或小数)
    [-~]                   # 匹配范围分隔符
    ((-?)((\d+)(\.\d+)?))  # 匹配范围结束的负数或正数(整数或小数)
    (?![\d\+\-\×÷=])       # 使用正向前瞻以确保数字范围之后没有其他数字和操作符
    """,
    re.VERBOSE,
)


def replace_range(match) -> str:
    """Verbalize a numeric range matched by ``RE_RANGE``.

    Args:
        match (re.Match): group 1 is the start and group 6 the end.
    Returns:
        str: ``<start>到<end>`` with both ends verbalized.
    """
    first = RE_NUMBER.sub(replace_number, match.group(1))
    second = RE_NUMBER.sub(replace_number, match.group(6))
    return f"{first}到{second}"


# "~" between two quantities with units is read as "至".
RE_TO_RANGE = re.compile(
    r"((-?)((\d+)(\.\d+)?)|(\.(\d+)))(%|°C|℃|度|摄氏度|cm2|cm²|cm3|cm³|cm|db|ds|kg|km|m2|m²|m³|m3|ml|m|mm|s)[~]((-?)((\d+)(\.\d+)?)|(\.(\d+)))(%|°C|℃|度|摄氏度|cm2|cm²|cm3|cm³|cm|db|ds|kg|km|m2|m²|m³|m3|ml|m|mm|s)"
)


def replace_to_range(match) -> str:
    """Replace ``~`` with ``至`` inside an ``RE_TO_RANGE`` match.

    Args:
        match (re.Match): the whole match is rewritten.
    Returns:
        str: the matched text with every ``~`` replaced by ``至``.
    """
    return match.group(0).replace("~", "至")


def _get_value(value_string: str, use_zero: bool = True) -> List[str]:
    """Split a digit string into Chinese digit and unit symbols.

    Args:
        value_string: digits only; may carry leading zeros.
        use_zero: emit a ``零`` in front of a single significant digit that
            had leading zeros.
    Returns:
        List[str]: the symbols in reading order; empty when all digits are 0.
    """
    stripped = value_string.lstrip("0")
    if not stripped:
        return []
    if len(stripped) == 1:
        if use_zero and len(stripped) < len(value_string):
            return [DIGITS["0"], DIGITS[stripped]]
        return [DIGITS[stripped]]
    largest_unit = next(
        power for power in reversed(UNITS.keys()) if power < len(stripped)
    )
    # Slice the unstripped string so the low part keeps its leading zeros,
    # which the recursion turns into a spoken 零 (1001 -> 一千零一).
    first_part = value_string[:-largest_unit]
    second_part = value_string[-largest_unit:]
    return _get_value(first_part) + [UNITS[largest_unit]] + _get_value(second_part)


def verbalize_cardinal(value_string: str) -> str:
    """Verbalize a digit string as a Chinese cardinal number.

    Args:
        value_string: digits only; may be empty.
    Returns:
        str: ``""`` for empty input, ``零`` when every digit is 0, otherwise
            the cardinal reading with a leading ``一十`` shortened to ``十``.
    """
    if not value_string:
        return ""

    # 000 -> '零' , 0 -> '零'
    value_string = value_string.lstrip("0")
    if not value_string:
        return DIGITS["0"]

    result_symbols = _get_value(value_string)
    # verbalized number starting with '一十*' is abbreviated as `十*`
    if (
        len(result_symbols) >= 2
        and result_symbols[0] == DIGITS["1"]
        and result_symbols[1] == UNITS[1]
    ):
        result_symbols = result_symbols[1:]
    return "".join(result_symbols)


def verbalize_digit(value_string: str, alt_one=False) -> str:
    """Read a digit string one digit at a time.

    Args:
        value_string: digits only.
        alt_one: when true, ``一`` is replaced by ``幺``.
    Returns:
        str: one Chinese digit per input digit.
    Raises:
        KeyError: if a character is not a key of ``DIGITS``.
    """
    result = "".join(DIGITS[digit] for digit in value_string)
    if alt_one:
        result = result.replace("一", "幺")
    return result


def num2str(value_string: str) -> str:
    """Verbalize an unsigned integer or decimal string.

    Args:
        value_string: digits with at most one ``.``; the integer part may be
            empty (``".22"``).
    Returns:
        str: the integer part as a cardinal, followed by ``点`` and the
            fractional digits when any remain after stripping trailing zeros.
    Raises:
        ValueError: if ``value_string`` contains more than one ``.``.
    """
    integer_decimal = value_string.split(".")
    if len(integer_decimal) == 1:
        integer = integer_decimal[0]
        decimal = ""
    elif len(integer_decimal) == 2:
        integer, decimal = integer_decimal
    else:
        raise ValueError(
            f"The value string: '{value_string}' has more than one point in it."
        )

    result = verbalize_cardinal(integer)

    decimal = decimal.rstrip("0")
    if decimal:
        # '.22' is verbalized as '零点二二'
        # '3.20' is verbalized as '三点二'
        result = result if result else "零"
        result += "点" + verbalize_digit(decimal)
    return result


if __name__ == "__main__":
    text = ""
    text = num2str(text)
    print(text)
DIGITS =
{'0': '零', '1': '一', '2': '二', '3': '三', '4': '四', '5': '五', '6': '六', '7': '七', '8': '八', '9': '九'}
UNITS =
OrderedDict({1: '十', 2: '百', 3: '千', 4: '万', 8: '亿'})
COM_QUANTIFIERS =
'(处|台|架|枚|趟|幅|平|方|堵|间|床|株|批|项|例|列|篇|栋|注|亩|封|艘|把|目|套|段|人|所|朵|匹|张|座|回|场|尾|条|个|首|阙|阵|网|炮|顶|丘|棵|只|支|袭|辆|挑|担|颗|壳|窠|曲|墙|群|腔|砣|座|客|贯|扎|捆|刀|令|打|手|罗|坡|山|岭|江|溪|钟|队|单|双|对|出|口|头|脚|板|跳|枝|件|贴|针|线|管|名|位|身|堂|课|本|页|家|户|层|丝|毫|厘|分|钱|两|斤|担|铢|石|钧|锱|忽|(千|毫|微)克|毫|厘|(公)分|分|寸|尺|丈|里|寻|常|铺|程|(千|分|厘|毫|微)米|米|撮|勺|合|升|斗|石|盘|碗|碟|叠|桶|笼|盆|盒|杯|钟|斛|锅|簋|篮|盘|桶|罐|瓶|壶|卮|盏|箩|箱|煲|啖|袋|钵|年|月|日|季|刻|时|周|天|秒|分|小时|旬|纪|岁|世|更|夜|春|夏|秋|冬|代|伏|辈|丸|泡|粒|颗|幢|堆|条|根|支|道|面|片|张|颗|块|元|(亿|千万|百万|万|千|百)|(亿|千万|百万|万|千|百|美|)元|(亿|千万|百万|万|千|百|十|)吨|(亿|千万|百万|万|千|百|)块|角|毛|分)'
RE_FRAC =
re.compile('(-?)(\\d+)/(\\d+)')
def
replace_frac(match) -> str:
def replace_frac(match) -> str:
    """Verbalize a fraction match as ``[负]<denominator>分之<numerator>``.

    Args:
        match (re.Match): group 1 is the optional minus sign, group 2 the
            numerator and group 3 the denominator.
    Returns:
        str
    """
    minus, top, bottom = match.group(1), match.group(2), match.group(3)
    prefix = "负" if minus else ""
    top_text = num2str(top)
    bottom_text = num2str(bottom)
    return f"{prefix}{bottom_text}分之{top_text}"
Args: match (re.Match) Returns: str
RE_PERCENTAGE =
re.compile('(-?)(\\d+(\\.\\d+)?)%')
def
replace_percentage(match) -> str:
def replace_percentage(match) -> str:
    """Verbalize a percentage match as ``[负]百分之<number>``.

    Args:
        match (re.Match): group 1 is the optional minus sign, group 2 the
            number in front of the percent sign.
    Returns:
        str
    """
    minus, amount = match.group(1), match.group(2)
    prefix = "负" if minus else ""
    spoken = num2str(amount)
    return f"{prefix}百分之{spoken}"
Args: match (re.Match) Returns: str
RE_INTEGER =
re.compile('(-)(\\d+)')
def
replace_negative_num(match) -> str:
def replace_negative_num(match) -> str:
    """Verbalize a signed integer match, prefixing ``负`` for the sign.

    Args:
        match (re.Match): group 1 is the sign, group 2 the digits.
    Returns:
        str
    """
    minus, digits = match.group(1), match.group(2)
    prefix = "负" if minus else ""
    spoken = num2str(digits)
    return f"{prefix}{spoken}"
Args: match (re.Match) Returns: str
RE_DEFAULT_NUM =
re.compile('\\d{3}\\d*')
def
replace_default_num(match):
def replace_default_num(match):
    """Read the matched digit run digit by digit, with the alternate "one".

    Args:
        match (re.Match): the whole match is the digit run.
    Returns:
        str
    """
    return verbalize_digit(match.group(0), alt_one=True)
Args: match (re.Match) Returns: str
RE_ASMD =
re.compile('((-?)((\\d+)(\\.\\d+)?[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*)|(\\.\\d+[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*)|([A-Za-z][⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*))([\\+\\-\\×÷=])((-?)((\\d+)(\\.\\d+)?[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*)|(\\.\\d+[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*)|([A-Za-z][⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*)
asmd_map =
{'+': '加', '-': '减', '×': '乘', '÷': '除', '=': '等于'}
def
replace_asmd(match) -> str:
def replace_asmd(match) -> str:
    """Swap the arithmetic operator of the match for its Chinese word.

    Args:
        match (re.Match): group 1 is the left operand, group 8 the operator
            and group 9 the right operand.
    Returns:
        str
    """
    left_operand = match.group(1)
    spoken_operator = asmd_map[match.group(8)]
    right_operand = match.group(9)
    return left_operand + spoken_operator + right_operand
Args: match (re.Match) Returns: str
RE_POWER =
re.compile('[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]+')
power_map =
{'⁰': '0', '¹': '1', '²': '2', '³': '3', '⁴': '4', '⁵': '5', '⁶': '6', '⁷': '7', '⁸': '8', '⁹': '9', 'ˣ': 'x', 'ʸ': 'y', 'ⁿ': 'n'}
def
replace_power(match) -> str:
def replace_power(match) -> str:
    """Rewrite a run of superscript characters as ``的<exponent>次方``.

    Args:
        match (re.Match): the whole match is the superscript run; every
            character is looked up in ``power_map``.
    Returns:
        str
    """
    exponent = "".join(power_map[char] for char in match.group(0))
    return f"的{exponent}次方"
Args: match (re.Match) Returns: str
RE_DECIMAL_NUM =
re.compile('(-?)((\\d+)(\\.\\d+))|(\\.(\\d+))')
RE_POSITIVE_QUANTIFIERS =
re.compile('(\\d+)([多余几\\+])?(处|台|架|枚|趟|幅|平|方|堵|间|床|株|批|项|例|列|篇|栋|注|亩|封|艘|把|目|套|段|人|所|朵|匹|张|座|回|场|尾|条|个|首|阙|阵|网|炮|顶|丘|棵|只|支|袭|辆|挑|担|颗|壳|窠|曲|墙|群|腔|砣|座|客|贯|扎|捆|刀|令|打|手|罗|坡|山|岭|江|溪|钟|队|单|双|对|出|口|头|脚|板|跳|枝|件|贴|针|线|管)
RE_NUMBER =
re.compile('(-?)((\\d+)(\\.\\d+)?)|(\\.(\\d+))')
def
replace_positive_quantifier(match) -> str:
def replace_positive_quantifier(match) -> str:
    """Verbalize the number in a "number + quantifier" match.

    Args:
        match (re.Match): group 1 is the digits, group 2 the optional
            approximation marker (``+`` is spoken as ``多``) and group 3 the
            quantifier.
    Returns:
        str
    """
    digits, marker, unit = match.group(1), match.group(2), match.group(3)
    if marker == "+":
        marker = "多"
    elif not marker:
        marker = ""
    spoken = num2str(digits)
    return f"{spoken}{marker}{unit}"
Args: match (re.Match) Returns: str
def
replace_number(match) -> str:
def replace_number(match) -> str:
    """Verbalize a plain number match.

    Args:
        match (re.Match): group 1 is the optional minus sign and group 2 the
            number; group 5 is set instead for a leading-dot decimal.
    Returns:
        str
    """
    minus, body, leading_dot = match.group(1), match.group(2), match.group(5)
    if leading_dot:
        # Leading-dot decimals carry no sign group; verbalize them directly.
        return num2str(leading_dot)
    prefix = "负" if minus else ""
    spoken = num2str(body)
    return f"{prefix}{spoken}"
Args: match (re.Match) Returns: str
RE_RANGE =
re.compile('\n (?<![\\d\\+\\-\\×÷=]) # 使用反向前瞻以确保数字范围之前没有其他数字和操作符\n ((-?)((\\d+)(\\.\\d+)?)) # 匹配范围起始的负数或正数(整数或小数)\n [-~] # 匹配范围分隔符\n ((-?)((\\d+)(\\.\\d+)?)) # 匹配范围结束的负数或正数(整, re.VERBOSE)
def
replace_range(match) -> str:
def replace_range(match) -> str:
    """Verbalize both ends of a numeric range and join them with ``到``.

    Args:
        match (re.Match): group 1 is the start of the range, group 6 the end.
    Returns:
        str
    """
    start_text = RE_NUMBER.sub(replace_number, match.group(1))
    end_text = RE_NUMBER.sub(replace_number, match.group(6))
    return "到".join((start_text, end_text))
Args: match (re.Match) Returns: str
RE_TO_RANGE =
re.compile('((-?)((\\d+)(\\.\\d+)?)|(\\.(\\d+)))(%|°C|℃|度|摄氏度|cm2|cm²|cm3|cm³|cm|db|ds|kg|km|m2|m²|m³|m3|ml|m|mm|s)[~]((-?)((\\d+)(\\.\\d+)?)|(\\.(\\d+)))(%|°C|℃|度|摄氏度|cm2|cm²|cm3|cm³|cm|db|ds|kg|km|m2|m²|m³|m3|)
def
replace_to_range(match) -> str:
def replace_to_range(match) -> str:
    """Return the matched text with every ``~`` replaced by ``至``.

    Args:
        match (re.Match): the whole match is rewritten.
    Returns:
        str
    """
    pieces = match.group(0).split("~")
    return "至".join(pieces)
Args: match (re.Match) Returns: str
def
verbalize_cardinal(value_string: str) -> str:
def verbalize_cardinal(value_string: str) -> str:
    """Verbalize a digit string as a Chinese cardinal number.

    Returns ``""`` for empty input and the zero digit when every character
    is ``0``; a reading that starts with "one ten" drops the leading "one".
    """
    if not value_string:
        return ""

    # 000 -> '零' , 0 -> '零'
    significant = value_string.lstrip("0")
    if not significant:
        return DIGITS["0"]

    symbols = _get_value(significant)
    # verbalized number starting with '一十*' is abbreviated as `十*`
    if symbols[:2] == [DIGITS["1"], UNITS[1]]:
        symbols = symbols[1:]
    return "".join(symbols)
def
verbalize_digit(value_string: str, alt_one=False) -> str:
def
num2str(value_string: str) -> str:
def num2str(value_string: str) -> str:
    """Verbalize an unsigned integer or decimal string.

    Args:
        value_string: digits with at most one ``.``; the integer part may be
            empty (``".22"``).
    Returns:
        str: the integer part verbalized by ``verbalize_cardinal``, followed
            by ``点`` and the fractional digits when any remain after
            stripping trailing zeros.
    Raises:
        ValueError: if ``value_string`` contains more than one ``.``.
    """
    integer_decimal = value_string.split(".")
    if len(integer_decimal) == 1:
        integer = integer_decimal[0]
        decimal = ""
    elif len(integer_decimal) == 2:
        integer, decimal = integer_decimal
    else:
        # The original message carried a stray "$" in front of the
        # placeholder, which leaked into the error text.
        raise ValueError(
            f"The value string: '{value_string}' has more than one point in it."
        )

    result = verbalize_cardinal(integer)

    decimal = decimal.rstrip("0")
    if decimal:
        # '.22' is verbalized as '零点二二'
        # '3.20' is verbalized as '三点二'
        result = result if result else "零"
        result += "点" + verbalize_digit(decimal)
    return result