divisor.acestep.language_segmentation.utils.num

Rules to verbalize numbers into Chinese characters. https://zh.wikipedia.org/wiki/中文数字#現代中文

View Source

  1# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
  2#
  3# Licensed under the Apache License, Version 2.0 (the "License");
  4# you may not use this file except in compliance with the License.
  5# You may obtain a copy of the License at
  6#
  7#     http://www.apache.org/licenses/LICENSE-2.0
  8#
  9# Unless required by applicable law or agreed to in writing, software
 10# distributed under the License is distributed on an "AS IS" BASIS,
 11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12# See the License for the specific language governing permissions and
 13# limitations under the License.
 14# Digital processing from GPT_SoVITS num.py （thanks）
 15"""
 16Rules to verbalize numbers into Chinese characters.
 17https://zh.wikipedia.org/wiki/中文数字#現代中文
 18"""
 19
 20import re
 21from collections import OrderedDict
 22from typing import List
 23
 24DIGITS = {str(i): tran for i, tran in enumerate("零一二三四五六七八九")}
 25UNITS = OrderedDict(
 26    {
 27        1: "十",
 28        2: "百",
 29        3: "千",
 30        4: "万",
 31        8: "亿",
 32    }
 33)
 34
# Parenthesized regex alternation of measure words and units that may follow
# a number. It is concatenated onto a digit pattern to build
# RE_POSITIVE_QUANTIFIERS, so it is a regex fragment, not a plain word list.
# NOTE(review): several alternatives are repeated and some nested groups
# contain an empty alternative (e.g. "(...|美|)元"); left untouched here.
COM_QUANTIFIERS = "(处|台|架|枚|趟|幅|平|方|堵|间|床|株|批|项|例|列|篇|栋|注|亩|封|艘|把|目|套|段|人|所|朵|匹|张|座|回|场|尾|条|个|首|阙|阵|网|炮|顶|丘|棵|只|支|袭|辆|挑|担|颗|壳|窠|曲|墙|群|腔|砣|座|客|贯|扎|捆|刀|令|打|手|罗|坡|山|岭|江|溪|钟|队|单|双|对|出|口|头|脚|板|跳|枝|件|贴|针|线|管|名|位|身|堂|课|本|页|家|户|层|丝|毫|厘|分|钱|两|斤|担|铢|石|钧|锱|忽|(千|毫|微)克|毫|厘|(公)分|分|寸|尺|丈|里|寻|常|铺|程|(千|分|厘|毫|微)米|米|撮|勺|合|升|斗|石|盘|碗|碟|叠|桶|笼|盆|盒|杯|钟|斛|锅|簋|篮|盘|桶|罐|瓶|壶|卮|盏|箩|箱|煲|啖|袋|钵|年|月|日|季|刻|时|周|天|秒|分|小时|旬|纪|岁|世|更|夜|春|夏|秋|冬|代|伏|辈|丸|泡|粒|颗|幢|堆|条|根|支|道|面|片|张|颗|块|元|(亿|千万|百万|万|千|百)|(亿|千万|百万|万|千|百|美|)元|(亿|千万|百万|万|千|百|十|)吨|(亿|千万|百万|万|千|百|)块|角|毛|分)"
 36
 37# 分数表达式
 38RE_FRAC = re.compile(r"(-?)(\d+)/(\d+)")
 39
 40
 41def replace_frac(match) -> str:
 42    """
 43    Args:
 44        match (re.Match)
 45    Returns:
 46        str
 47    """
 48    sign = match.group(1)
 49    nominator = match.group(2)
 50    denominator = match.group(3)
 51    sign: str = "负" if sign else ""
 52    nominator: str = num2str(nominator)
 53    denominator: str = num2str(denominator)
 54    result = f"{sign}{denominator}分之{nominator}"
 55    return result
 56
 57
 58# 百分数表达式
 59RE_PERCENTAGE = re.compile(r"(-?)(\d+(\.\d+)?)%")
 60
 61
 62def replace_percentage(match) -> str:
 63    """
 64    Args:
 65        match (re.Match)
 66    Returns:
 67        str
 68    """
 69    sign = match.group(1)
 70    percent = match.group(2)
 71    sign: str = "负" if sign else ""
 72    percent: str = num2str(percent)
 73    result = f"{sign}百分之{percent}"
 74    return result
 75
 76
 77# 整数表达式
 78# 带负号的整数 -10
 79RE_INTEGER = re.compile(r"(-)" r"(\d+)")
 80
 81
 82def replace_negative_num(match) -> str:
 83    """
 84    Args:
 85        match (re.Match)
 86    Returns:
 87        str
 88    """
 89    sign = match.group(1)
 90    number = match.group(2)
 91    sign: str = "负" if sign else ""
 92    number: str = num2str(number)
 93    result = f"{sign}{number}"
 94    return result
 95
 96
 97# 编号-无符号整形
 98# 00078
 99RE_DEFAULT_NUM = re.compile(r"\d{3}\d*")
100
101
102def replace_default_num(match):
103    """
104    Args:
105        match (re.Match)
106    Returns:
107        str
108    """
109    number = match.group(0)
110    return verbalize_digit(number, alt_one=True)
111
112
# Add / subtract / multiply / divide / equals.
# Each operand is a (possibly negative) number, a bare ".digits" decimal or a
# single Latin letter, optionally followed by superscript characters.
# Group 1 is the left operand, group 8 the operator, group 9 the right operand.
RE_ASMD = re.compile(
    r"((-?)((\d+)(\.\d+)?[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*)|(\.\d+[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*)|([A-Za-z][⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*))([\+\-\×÷=])((-?)((\d+)(\.\d+)?[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*)|(\.\d+[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*)|([A-Za-z][⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*))"
)

# Spoken word for each supported operator symbol.
asmd_map = {"+": "加", "-": "减", "×": "乘", "÷": "除", "=": "等于"}


def replace_asmd(match) -> str:
    """Replace the operator of a binary expression with its spoken word.

    The operands are returned untouched; only the symbol between them is
    swapped for the matching entry of ``asmd_map``.

    Args:
        match (re.Match): a match laid out like RE_ASMD (operands in groups
            1 and 9, operator in group 8).
    Returns:
        str: "<left><operator word><right>".
    """
    left, symbol, right = match.group(1, 8, 9)
    return "".join((left, asmd_map[symbol], right))
132
133
# Exponents written with superscript characters.
RE_POWER = re.compile(r"[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]+")

# Plain-text equivalent of every supported superscript character.
power_map = {
    "⁰": "0",
    "¹": "1",
    "²": "2",
    "³": "3",
    "⁴": "4",
    "⁵": "5",
    "⁶": "6",
    "⁷": "7",
    "⁸": "8",
    "⁹": "9",
    "ˣ": "x",
    "ʸ": "y",
    "ⁿ": "n",
}


def replace_power(match) -> str:
    """Turn a run of superscript characters into "的<exponent>次方".

    Args:
        match (re.Match): a match whose whole text consists of keys of
            ``power_map`` (see RE_POWER).
    Returns:
        str: the exponent spelled with plain characters, wrapped in
        "的…次方". The digits themselves are not verbalized here.
    """
    exponent = "".join(power_map[ch] for ch in match.group(0))
    return f"的{exponent}次方"
166
167
# Numeric expressions
# Decimal number: optional minus sign, digits and a mandatory fractional
# part, or a bare ".digits" form (captured as group 5).
RE_DECIMAL_NUM = re.compile(r"(-?)((\d+)(\.\d+))" r"|(\.(\d+))")
# Positive integer, an optional approximation marker (多 / 余 / 几 / +),
# then a quantifier taken from COM_QUANTIFIERS.
RE_POSITIVE_QUANTIFIERS = re.compile(r"(\d+)([多余几\+])?" + COM_QUANTIFIERS)
# General number: same layout as RE_DECIMAL_NUM, but the fractional part is
# optional. Group 1 is the sign, group 2 the number, group 5 a bare ".digits".
RE_NUMBER = re.compile(r"(-?)((\d+)(\.\d+)?)" r"|(\.(\d+))")
174
175
def replace_positive_quantifier(match) -> str:
    """Verbalize "<integer>[marker]<quantifier>".

    Args:
        match (re.Match): a match whose group 1 is the digits, group 2 an
            optional marker and group 3 the quantifier
            (see RE_POSITIVE_QUANTIFIERS).
    Returns:
        str: spoken number, then the marker (a "+" is spoken as "多", a
        missing marker contributes nothing), then the quantifier unchanged.
    """
    digits, marker, unit = match.group(1, 2, 3)
    if marker == "+":
        marker = "多"
    elif not marker:
        marker = ""
    spoken = num2str(digits)
    return f"{spoken}{marker}{unit}"
192
193
def replace_number(match) -> str:
    """Verbalize a number match laid out like RE_NUMBER.

    Args:
        match (re.Match): a match whose group 1 is the sign, group 2 the
            number and group 5 a bare ".digits" decimal.
    Returns:
        str: the spoken number. A bare decimal is passed to num2str as is;
        otherwise "负" is prepended when a sign was captured.
    """
    minus, body, bare_decimal = match.group(1, 2, 5)
    if bare_decimal:
        return num2str(bare_decimal)
    prefix = "负" if minus else ""
    return prefix + num2str(body)
211
212
# Range expression, e.g. "1-10" or "3~5".
# Group 1 is the start of the range and group 6 its end; both reuse the
# signed integer/decimal layout of RE_NUMBER. The lookbehind and lookahead
# reject ranges that touch another digit or an arithmetic operator.

RE_RANGE = re.compile(
    r"""
    (?<![\d\+\-\×÷=])      # 使用反向前瞻以确保数字范围之前没有其他数字和操作符
    ((-?)((\d+)(\.\d+)?))  # 匹配范围起始的负数或正数（整数或小数）
    [-~]                   # 匹配范围分隔符
    ((-?)((\d+)(\.\d+)?))  # 匹配范围结束的负数或正数（整数或小数）
    (?![\d\+\-\×÷=])       # 使用正向前瞻以确保数字范围之后没有其他数字和操作符
    """,
    re.VERBOSE,
)


def replace_range(match) -> str:
    """Verbalize a numeric range as "<start>到<end>".

    Args:
        match (re.Match): a match whose groups 1 and 6 hold the two
            endpoints (see RE_RANGE).
    Returns:
        str: both endpoints verbalized through RE_NUMBER / replace_number
        and joined with "到".
    """
    endpoints = match.group(1, 6)
    start, end = (RE_NUMBER.sub(replace_number, text) for text in endpoints)
    return f"{start}到{end}"
240
241
# "~" range between two quantities that each carry a unit, e.g. "10%~20%".
RE_TO_RANGE = re.compile(
    r"((-?)((\d+)(\.\d+)?)|(\.(\d+)))(%|°C|℃|度|摄氏度|cm2|cm²|cm3|cm³|cm|db|ds|kg|km|m2|m²|m³|m3|ml|m|mm|s)[~]((-?)((\d+)(\.\d+)?)|(\.(\d+)))(%|°C|℃|度|摄氏度|cm2|cm²|cm3|cm³|cm|db|ds|kg|km|m2|m²|m³|m3|ml|m|mm|s)"
)


def replace_to_range(match) -> str:
    """Replace the "~" of a unit range with "至".

    Numbers and units are left as they are; only the separator changes.

    Args:
        match (re.Match): a match whose whole text contains the "~"
            separator (see RE_TO_RANGE).
    Returns:
        str: the matched text with every "~" replaced by "至".
    """
    pieces = match.group(0).split("~")
    return "至".join(pieces)
257
258
def _get_value(value_string: str, use_zero: bool = True) -> List[str]:
    """Split a digit string into Chinese numeral and unit symbols.

    Args:
        value_string: digits to verbalize; leading zeros are allowed.
        use_zero: when the string reduces to a single non-zero digit that
            had leading zeros, emit "零" in front of it.
    Returns:
        List[str]: the symbols in reading order; empty when the string is
        empty or consists of zeros only.
    """
    significant = value_string.lstrip("0")
    if not significant:
        return []

    if len(significant) == 1:
        digit = DIGITS[significant]
        had_leading_zero = len(significant) < len(value_string)
        if use_zero and had_leading_zero:
            return [DIGITS["0"], digit]
        return [digit]

    # UNITS keys ascend, so the maximum fitting key is the one the reversed
    # scan would hit first.
    largest_unit = max(power for power in UNITS if power < len(significant))
    # Slice the unstripped string so the lower part keeps its leading zeros;
    # the recursive call uses them to decide whether "零" is needed.
    high_part = value_string[:-largest_unit]
    low_part = value_string[-largest_unit:]
    return [*_get_value(high_part), UNITS[largest_unit], *_get_value(low_part)]
275
276
def verbalize_cardinal(value_string: str) -> str:
    """Verbalize a digit string as a Chinese cardinal number.

    Args:
        value_string: digits only; may be empty or carry leading zeros.
    Returns:
        str: "" for an empty string, "零" when only zeros are given,
        otherwise the cardinal reading with a leading "一十" shortened
        to "十".
    """
    if not value_string:
        return ""

    # "000" and "0" both read as "零".
    digits = value_string.lstrip("0")
    if not digits:
        return DIGITS["0"]

    symbols = _get_value(digits)
    # A reading that starts with "一十" is abbreviated to "十".
    if symbols[:2] == [DIGITS["1"], UNITS[1]]:
        symbols = symbols[1:]
    return "".join(symbols)
295
296
def verbalize_digit(value_string: str, alt_one=False) -> str:
    """Read a digit string one character at a time.

    Args:
        value_string: digits only; every character must be a key of DIGITS.
        alt_one: when true, every "一" in the result is replaced by "幺".
    Returns:
        str: the concatenated per-digit reading.
    """
    spoken = "".join(DIGITS[ch] for ch in value_string)
    return spoken.replace("一", "幺") if alt_one else spoken
303
304
def num2str(value_string: str) -> str:
    """Verbalize an unsigned integer or decimal string in Chinese.

    The integer part is read as a cardinal number and the fractional part
    digit by digit after "点". Trailing zeros of the fractional part are
    dropped.

    Args:
        value_string: text such as "12", "3.20" or ".22"; at most one "."
            is allowed.
    Returns:
        str: the spoken form; "" for an empty string.
    Raises:
        ValueError: if ``value_string`` contains more than one ".".
    """
    integer_decimal = value_string.split(".")
    if len(integer_decimal) == 1:
        integer = integer_decimal[0]
        decimal = ""
    elif len(integer_decimal) == 2:
        integer, decimal = integer_decimal
    else:
        # The placeholder previously carried a stray "$", which leaked into
        # the rendered message.
        raise ValueError(
            f"The value string: '{value_string}' has more than one point in it."
        )

    result = verbalize_cardinal(integer)

    decimal = decimal.rstrip("0")
    if decimal:
        # '.22' is verbalized as '零点二二'
        # '3.20' is verbalized as '三点二'
        result = result if result else "零"
        result += "点" + verbalize_digit(decimal)
    return result
326
327
if __name__ == "__main__":
    # Manual smoke check: an empty input verbalizes to an empty string.
    text = num2str("")
    print(text)

DIGITS = {'0': '零', '1': '一', '2': '二', '3': '三', '4': '四', '5': '五', '6': '六', '7': '七', '8': '八', '9': '九'}

UNITS = OrderedDict({1: '十', 2: '百', 3: '千', 4: '万', 8: '亿'})

COM_QUANTIFIERS = '(处|台|架|枚|趟|幅|平|方|堵|间|床|株|批|项|例|列|篇|栋|注|亩|封|艘|把|目|套|段|人|所|朵|匹|张|座|回|场|尾|条|个|首|阙|阵|网|炮|顶|丘|棵|只|支|袭|辆|挑|担|颗|壳|窠|曲|墙|群|腔|砣|座|客|贯|扎|捆|刀|令|打|手|罗|坡|山|岭|江|溪|钟|队|单|双|对|出|口|头|脚|板|跳|枝|件|贴|针|线|管|名|位|身|堂|课|本|页|家|户|层|丝|毫|厘|分|钱|两|斤|担|铢|石|钧|锱|忽|(千|毫|微)克|毫|厘|(公)分|分|寸|尺|丈|里|寻|常|铺|程|(千|分|厘|毫|微)米|米|撮|勺|合|升|斗|石|盘|碗|碟|叠|桶|笼|盆|盒|杯|钟|斛|锅|簋|篮|盘|桶|罐|瓶|壶|卮|盏|箩|箱|煲|啖|袋|钵|年|月|日|季|刻|时|周|天|秒|分|小时|旬|纪|岁|世|更|夜|春|夏|秋|冬|代|伏|辈|丸|泡|粒|颗|幢|堆|条|根|支|道|面|片|张|颗|块|元|(亿|千万|百万|万|千|百)|(亿|千万|百万|万|千|百|美|)元|(亿|千万|百万|万|千|百|十|)吨|(亿|千万|百万|万|千|百|)块|角|毛|分)'

RE_FRAC = re.compile('(-?)(\\d+)/(\\d+)')

def replace_frac(match) -> str: View Source

def replace_frac(match) -> str:
    """Verbalize a fraction match as "<denominator>分之<numerator>".

    Args:
        match (re.Match): a match whose groups 1-3 are the sign, the
            numerator digits and the denominator digits.
    Returns:
        str: the spoken form, prefixed with "负" when a sign was captured.
    """
    minus, top, bottom = match.group(1, 2, 3)
    prefix = "负" if minus else ""
    top_text = num2str(top)
    bottom_text = num2str(bottom)
    # Chinese reads the denominator first.
    return f"{prefix}{bottom_text}分之{top_text}"