divisor.acestep.models.lyrics_utils.lyric_normalizer

 1import re
 2from opencc import OpenCC
 3
 4
 5t2s_converter = OpenCC("t2s")
 6s2t_converter = OpenCC("s2t")
 7
 8
 9EMOJI_PATTERN = re.compile(
10    "["
11    "\U0001f600-\U0001f64f"  # Emoticons
12    "]+",
13    flags=re.UNICODE,
14)
15
# Translation table for str.translate(): maps "-" and the ideographic space
# to an ASCII space and deletes common ASCII and CJK punctuation marks.
#
# Every non-ASCII key is spelled as a \u escape. Written as literal glyphs,
# the full-width forms are easily folded into their ASCII look-alikes, which
# yields duplicate dict keys and leaves the full-width marks in the text.
TRANSLATION_TABLE = str.maketrans(
    {
        "-": " ",  # hyphen becomes a space
        ",": None,
        ".": None,
        "\uff0c": None,  # full-width comma
        "\u3002": None,  # ideographic full stop
        "!": None,
        "\uff01": None,  # full-width exclamation mark
        "?": None,
        "\uff1f": None,  # full-width question mark
        "\u2026": None,  # horizontal ellipsis
        ";": None,
        "\uff1b": None,  # full-width semicolon
        ":": None,
        "\uff1a": None,  # full-width colon
        "\u3000": " ",  # ideographic (full-width) space becomes a space
    }
)
36
# Matches a parenthesised "(...)" or square-bracketed "[...]" span, taking
# the shortest match for each opening bracket. Despite its name, the pattern
# has nothing to do with backslashes.
BACKSLASH_PATTERN = re.compile(r"\(.*?\)|\[.*?\]")

# Matches a run of whitespace that does not start at the very beginning of
# the string and is not immediately followed by the end of the string.
# Raw string: "\s" must reach the regex engine as-is rather than being
# parsed as an (invalid) Python string escape.
SPACE_PATTERN = re.compile(r"(?<!^)\s+(?!$)")
41
42
def normalize_text(text, language, strip=True):
    """
    Normalize a lyric string for comparison or downstream processing.

    The text is run through TRANSLATION_TABLE, stripped of EMOJI_PATTERN
    matches, has its SPACE_PATTERN matches replaced by a single space, is
    optionally trimmed, lowercased, and finally passed through a
    language-specific OpenCC converter.

    Args:
        text: The string to normalize.
        language: Language code. "zh" sends the result through
            ``t2s_converter`` and "yue" through ``s2t_converter``; any other
            value skips the conversion step.
        strip: When true (the default), leading and trailing whitespace is
            removed.

    Returns:
        The normalized string.
    """
    # Character-level replacements/deletions first, then emoticon removal.
    cleaned = EMOJI_PATTERN.sub("", text.translate(TRANSLATION_TABLE))

    # Collapse each whitespace run matched by SPACE_PATTERN into one space.
    cleaned = SPACE_PATTERN.sub(" ", cleaned)

    if strip:
        cleaned = cleaned.strip()
    cleaned = cleaned.lower()

    # Script conversion applies only to the language codes listed here.
    for code, converter in (("zh", t2s_converter), ("yue", s2t_converter)):
        if language == code:
            cleaned = converter.convert(cleaned)
    return cleaned
t2s_converter = <opencc.OpenCC object>
s2t_converter = <opencc.OpenCC object>
EMOJI_PATTERN = re.compile('[😀-🙏]+')
TRANSLATION_TABLE = {45: ' ', 44: None, 46: None, 65292: None, 12290: None, 33: None, 65281: None, 63: None, 65311: None, 8230: None, 59: None, 65307: None, 58: None, 65306: None, 12288: ' '}
BACKSLASH_PATTERN = re.compile('\\(.*?\\)|\\[.*?\\]')
SPACE_PATTERN = re.compile('(?<!^)\\s+(?!$)')
def normalize_text(text, language, strip=True):
def normalize_text(text, language, strip=True):
    """
    Normalize a lyric string for comparison or downstream processing.

    The text is run through TRANSLATION_TABLE, stripped of EMOJI_PATTERN
    matches, has its SPACE_PATTERN matches replaced by a single space, is
    optionally trimmed, lowercased, and finally passed through a
    language-specific OpenCC converter.

    Args:
        text: The string to normalize.
        language: Language code. "zh" sends the result through
            ``t2s_converter`` and "yue" through ``s2t_converter``; any other
            value skips the conversion step.
        strip: When true (the default), leading and trailing whitespace is
            removed.

    Returns:
        The normalized string.
    """
    # Character-level replacements/deletions first, then emoticon removal.
    cleaned = EMOJI_PATTERN.sub("", text.translate(TRANSLATION_TABLE))

    # Collapse each whitespace run matched by SPACE_PATTERN into one space.
    cleaned = SPACE_PATTERN.sub(" ", cleaned)

    if strip:
        cleaned = cleaned.strip()
    cleaned = cleaned.lower()

    # Script conversion applies only to the language codes listed here.
    for code, converter in (("zh", t2s_converter), ("yue", s2t_converter)):
        if language == code:
            cleaned = converter.convert(cleaned)
    return cleaned

对文本进行标准化处理,去除标点符号,转为小写(如果适用)