divisor.acestep.models.lyrics_utils.lyric_normalizer
"""Lyric text normalization: punctuation, emoji, whitespace, case and script."""
import re

from opencc import OpenCC


# OpenCC converters built from the "t2s" and "s2t" configurations
# (Traditional -> Simplified and Simplified -> Traditional Chinese).
t2s_converter = OpenCC("t2s")
s2t_converter = OpenCC("s2t")


# Matches runs of characters in the Unicode "Emoticons" range only.
EMOJI_PATTERN = re.compile(
    "["
    "\U0001f600-\U0001f64f"  # Emoticons
    "]+",
    flags=re.UNICODE,
)

# Translation table used to replace or delete single characters.
# Non-ASCII keys are written as escapes so the ASCII and fullwidth variants
# cannot be confused with each other (or collapsed by text normalization).
TRANSLATION_TABLE = str.maketrans(
    {
        "-": " ",  # hyphen becomes a space
        ",": None,
        ".": None,
        "\uff0c": None,  # fullwidth comma
        "\u3002": None,  # ideographic full stop
        "!": None,
        "\uff01": None,  # fullwidth exclamation mark
        "?": None,
        "\uff1f": None,  # fullwidth question mark
        "\u2026": None,  # horizontal ellipsis
        ";": None,
        "\uff1b": None,  # fullwidth semicolon
        ":": None,
        "\uff1a": None,  # fullwidth colon
        "\u3000": " ",  # ideographic (fullwidth) space becomes a space
    }
)

# Matches bracketed content, both (...) and [...], non-greedily.
# NOTE(review): not applied by normalize_text below; presumably used by
# callers elsewhere -- confirm before removing.
BACKSLASH_PATTERN = re.compile(r"\(.*?\)|\[.*?\]")

# Whitespace run that does not start at the beginning of the string and is
# not followed by the end of the string. Raw string so "\s" is a regex
# escape rather than an (invalid) Python string escape.
SPACE_PATTERN = re.compile(r"(?<!^)\s+(?!$)")


def normalize_text(text: str, language: str, strip: bool = True) -> str:
    """Normalize lyric text: drop punctuation and emoticons, tidy whitespace.

    Args:
        text: The text to normalize.
        language: Language code. ``"zh"`` runs the text through the "t2s"
            converter and ``"yue"`` through the "s2t" converter; any other
            value skips script conversion.
        strip: When true (the default), leading and trailing whitespace is
            removed.

    Returns:
        The normalized, lower-cased text.
    """
    # Step 1: replace '-' with ' ' and remove punctuation.
    text = text.translate(TRANSLATION_TABLE)

    # Step 2: remove emoticons.
    text = EMOJI_PATTERN.sub("", text)

    # Step 3: collapse whitespace runs to a single space. Runs touching the
    # very start or end of the string are not fully collapsed by the pattern;
    # they are left to the optional strip step below.
    text = SPACE_PATTERN.sub(" ", text)

    # Step 4: remove leading/trailing whitespace if requested.
    if strip:
        text = text.strip()

    # Step 5: lower-case.
    text = text.lower()

    # Step 6: per-language script conversion; other languages can be added
    # here as needed.
    if language == "zh":
        text = t2s_converter.convert(text)
    elif language == "yue":
        text = s2t_converter.convert(text)
    return text
t2s_converter =
<opencc.OpenCC object>
s2t_converter =
<opencc.OpenCC object>
EMOJI_PATTERN =
re.compile('[😀-🙏]+')
TRANSLATION_TABLE =
{45: ' ', 44: None, 46: None, 65292: None, 12290: None, 33: None, 65281: None, 63: None, 65311: None, 8230: None, 59: None, 65307: None, 58: None, 65306: None, 12288: ' '}
BACKSLASH_PATTERN =
re.compile('\\(.*?\\)|\\[.*?\\]')
SPACE_PATTERN =
re.compile('(?<!^)\\s+(?!$)')
def
normalize_text(text, language, strip=True):
def normalize_text(text, language, strip=True):
    """
    Normalize text: remove punctuation and emoticons, tidy whitespace,
    lower-case, and apply the per-language script conversion.
    """
    # Steps 1-3: character translation ('-' -> ' ', punctuation removed),
    # emoticon removal, then whitespace-run collapsing (start/end excepted).
    cleaned = text.translate(TRANSLATION_TABLE)
    cleaned = EMOJI_PATTERN.sub("", cleaned)
    cleaned = SPACE_PATTERN.sub(" ", cleaned)

    # Steps 4-5: optional trimming of both ends, then lower-casing.
    cleaned = (cleaned.strip() if strip else cleaned).lower()

    # Step 6: language-specific conversion; extend the pairs below to
    # support further languages.
    for code, converter in (("zh", t2s_converter), ("yue", s2t_converter)):
        if language == code:
            cleaned = converter.convert(cleaned)
    return cleaned
对文本进行标准化处理,去除标点符号,转为小写(如果适用)