divisor.acestep.models.lyrics_utils.lyric_tokenizer
1import os 2import re 3import textwrap 4from functools import cached_property 5 6import pypinyin 7import torch 8from hangul_romanize import Transliter 9from hangul_romanize.rule import academic 10from num2words import num2words 11from spacy.lang.ar import Arabic 12from spacy.lang.en import English 13from spacy.lang.es import Spanish 14from spacy.lang.ja import Japanese 15from spacy.lang.zh import Chinese 16from tokenizers import Tokenizer 17 18from .zh_num2words import TextNorm as zh_num2words 19from typing import Dict, List, Optional, Set, Union 20 21 22# copy from https://github.com/coqui-ai/TTS/blob/dbf1a08a0d4e47fdad6172e433eeb34bc6b13b4e/TTS/tts/layers/xtts/tokenizer.py 23def get_spacy_lang(lang): 24 if lang == "zh": 25 return Chinese() 26 elif lang == "ja": 27 return Japanese() 28 elif lang == "ar": 29 return Arabic() 30 elif lang == "es": 31 return Spanish() 32 else: 33 # For most languages, Enlish does the job 34 return English() 35 36 37def split_sentence(text, lang, text_split_length=250): 38 """Preprocess the input text""" 39 text_splits = [] 40 if text_split_length is not None and len(text) >= text_split_length: 41 text_splits.append("") 42 nlp = get_spacy_lang(lang) 43 nlp.add_pipe("sentencizer") 44 doc = nlp(text) 45 for sentence in doc.sents: 46 if len(text_splits[-1]) + len(str(sentence)) <= text_split_length: 47 # if the last sentence + the current sentence is less than the text_split_length 48 # then add the current sentence to the last sentence 49 text_splits[-1] += " " + str(sentence) 50 text_splits[-1] = text_splits[-1].lstrip() 51 elif len(str(sentence)) > text_split_length: 52 # if the current sentence is greater than the text_split_length 53 for line in textwrap.wrap( 54 str(sentence), 55 width=text_split_length, 56 drop_whitespace=True, 57 break_on_hyphens=False, 58 tabsize=1, 59 ): 60 text_splits.append(str(line)) 61 else: 62 text_splits.append(str(sentence)) 63 64 if len(text_splits) > 1: 65 if text_splits[0] == "": 66 del text_splits[0] 
67 else: 68 text_splits = [text.lstrip()] 69 70 return text_splits 71 72 73_whitespace_re = re.compile(r"\s+") 74 75# List of (regular expression, replacement) pairs for abbreviations: 76_abbreviations = { 77 "en": [ 78 (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) 79 for x in [ 80 ("mrs", "misess"), 81 ("mr", "mister"), 82 ("dr", "doctor"), 83 ("st", "saint"), 84 ("co", "company"), 85 ("jr", "junior"), 86 ("maj", "major"), 87 ("gen", "general"), 88 ("drs", "doctors"), 89 ("rev", "reverend"), 90 ("lt", "lieutenant"), 91 ("hon", "honorable"), 92 ("sgt", "sergeant"), 93 ("capt", "captain"), 94 ("esq", "esquire"), 95 ("ltd", "limited"), 96 ("col", "colonel"), 97 ("ft", "fort"), 98 ] 99 ], 100 "es": [ 101 (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) 102 for x in [ 103 ("sra", "señora"), 104 ("sr", "señor"), 105 ("dr", "doctor"), 106 ("dra", "doctora"), 107 ("st", "santo"), 108 ("co", "compañía"), 109 ("jr", "junior"), 110 ("ltd", "limitada"), 111 ] 112 ], 113 "fr": [ 114 (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) 115 for x in [ 116 ("mme", "madame"), 117 ("mr", "monsieur"), 118 ("dr", "docteur"), 119 ("st", "saint"), 120 ("co", "compagnie"), 121 ("jr", "junior"), 122 ("ltd", "limitée"), 123 ] 124 ], 125 "de": [ 126 (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) 127 for x in [ 128 ("fr", "frau"), 129 ("dr", "doktor"), 130 ("st", "sankt"), 131 ("co", "firma"), 132 ("jr", "junior"), 133 ] 134 ], 135 "pt": [ 136 (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) 137 for x in [ 138 ("sra", "senhora"), 139 ("sr", "senhor"), 140 ("dr", "doutor"), 141 ("dra", "doutora"), 142 ("st", "santo"), 143 ("co", "companhia"), 144 ("jr", "júnior"), 145 ("ltd", "limitada"), 146 ] 147 ], 148 "it": [ 149 (re.compile("\\b%s\\." 
% x[0], re.IGNORECASE), x[1]) 150 for x in [ 151 # ("sig.ra", "signora"), 152 ("sig", "signore"), 153 ("dr", "dottore"), 154 ("st", "santo"), 155 ("co", "compagnia"), 156 ("jr", "junior"), 157 ("ltd", "limitata"), 158 ] 159 ], 160 "pl": [ 161 (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) 162 for x in [ 163 ("p", "pani"), 164 ("m", "pan"), 165 ("dr", "doktor"), 166 ("sw", "święty"), 167 ("jr", "junior"), 168 ] 169 ], 170 "ar": [ 171 (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) 172 for x in [ 173 # There are not many common abbreviations in Arabic as in English. 174 ] 175 ], 176 "zh": [ 177 (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) 178 for x in [ 179 # Chinese doesn't typically use abbreviations in the same way as Latin-based scripts. 180 ] 181 ], 182 "cs": [ 183 (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) 184 for x in [ 185 ("dr", "doktor"), # doctor 186 ("ing", "inženýr"), # engineer 187 ("p", "pan"), # Could also map to pani for woman but no easy way to do it 188 # Other abbreviations would be specialized and not as common. 189 ] 190 ], 191 "ru": [ 192 (re.compile("\\b%s\\b" % x[0], re.IGNORECASE), x[1]) 193 for x in [ 194 ("г-жа", "госпожа"), # Mrs. 195 ("г-н", "господин"), # Mr. 196 ("д-р", "доктор"), # doctor 197 # Other abbreviations are less common or specialized. 198 ] 199 ], 200 "nl": [ 201 (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) 202 for x in [ 203 ("dhr", "de heer"), # Mr. 204 ("mevr", "mevrouw"), # Mrs. 205 ("dr", "dokter"), # doctor 206 ("jhr", "jonkheer"), # young lord or nobleman 207 # Dutch uses more abbreviations, but these are the most common ones. 208 ] 209 ], 210 "tr": [ 211 (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) 212 for x in [ 213 ("b", "bay"), # Mr. 214 ("byk", "büyük"), # büyük 215 ("dr", "doktor"), # doctor 216 # Add other Turkish abbreviations here if needed. 217 ] 218 ], 219 "hu": [ 220 (re.compile("\\b%s\\." 
% x[0], re.IGNORECASE), x[1]) 221 for x in [ 222 ("dr", "doktor"), # doctor 223 ("b", "bácsi"), # Mr. 224 ("nőv", "nővér"), # nurse 225 # Add other Hungarian abbreviations here if needed. 226 ] 227 ], 228 "ko": [ 229 (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) 230 for x in [ 231 # Korean doesn't typically use abbreviations in the same way as Latin-based scripts. 232 ] 233 ], 234} 235 236 237def expand_abbreviations_multilingual(text, lang="en"): 238 for regex, replacement in _abbreviations[lang]: 239 text = re.sub(regex, replacement, text) 240 return text 241 242 243_symbols_multilingual = { 244 "en": [ 245 (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) 246 for x in [ 247 ("&", " and "), 248 ("@", " at "), 249 ("%", " percent "), 250 ("#", " hash "), 251 ("$", " dollar "), 252 ("£", " pound "), 253 ("°", " degree "), 254 ] 255 ], 256 "es": [ 257 (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) 258 for x in [ 259 ("&", " y "), 260 ("@", " arroba "), 261 ("%", " por ciento "), 262 ("#", " numeral "), 263 ("$", " dolar "), 264 ("£", " libra "), 265 ("°", " grados "), 266 ] 267 ], 268 "fr": [ 269 (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) 270 for x in [ 271 ("&", " et "), 272 ("@", " arobase "), 273 ("%", " pour cent "), 274 ("#", " dièse "), 275 ("$", " dollar "), 276 ("£", " livre "), 277 ("°", " degrés "), 278 ] 279 ], 280 "de": [ 281 (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) 282 for x in [ 283 ("&", " und "), 284 ("@", " at "), 285 ("%", " prozent "), 286 ("#", " raute "), 287 ("$", " dollar "), 288 ("£", " pfund "), 289 ("°", " grad "), 290 ] 291 ], 292 "pt": [ 293 (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) 294 for x in [ 295 ("&", " e "), 296 ("@", " arroba "), 297 ("%", " por cento "), 298 ("#", " cardinal "), 299 ("$", " dólar "), 300 ("£", " libra "), 301 ("°", " graus "), 302 ] 303 ], 304 "it": [ 305 (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) 306 for x in [ 307 
("&", " e "), 308 ("@", " chiocciola "), 309 ("%", " per cento "), 310 ("#", " cancelletto "), 311 ("$", " dollaro "), 312 ("£", " sterlina "), 313 ("°", " gradi "), 314 ] 315 ], 316 "pl": [ 317 (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) 318 for x in [ 319 ("&", " i "), 320 ("@", " małpa "), 321 ("%", " procent "), 322 ("#", " krzyżyk "), 323 ("$", " dolar "), 324 ("£", " funt "), 325 ("°", " stopnie "), 326 ] 327 ], 328 "ar": [ 329 # Arabic 330 (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) 331 for x in [ 332 ("&", " و "), 333 ("@", " على "), 334 ("%", " في المئة "), 335 ("#", " رقم "), 336 ("$", " دولار "), 337 ("£", " جنيه "), 338 ("°", " درجة "), 339 ] 340 ], 341 "zh": [ 342 # Chinese 343 (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) 344 for x in [ 345 ("&", " 和 "), 346 ("@", " 在 "), 347 ("%", " 百分之 "), 348 ("#", " 号 "), 349 ("$", " 美元 "), 350 ("£", " 英镑 "), 351 ("°", " 度 "), 352 ] 353 ], 354 "cs": [ 355 # Czech 356 (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) 357 for x in [ 358 ("&", " a "), 359 ("@", " na "), 360 ("%", " procento "), 361 ("#", " křížek "), 362 ("$", " dolar "), 363 ("£", " libra "), 364 ("°", " stupně "), 365 ] 366 ], 367 "ru": [ 368 # Russian 369 (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) 370 for x in [ 371 ("&", " и "), 372 ("@", " собака "), 373 ("%", " процентов "), 374 ("#", " номер "), 375 ("$", " доллар "), 376 ("£", " фунт "), 377 ("°", " градус "), 378 ] 379 ], 380 "nl": [ 381 # Dutch 382 (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) 383 for x in [ 384 ("&", " en "), 385 ("@", " bij "), 386 ("%", " procent "), 387 ("#", " hekje "), 388 ("$", " dollar "), 389 ("£", " pond "), 390 ("°", " graden "), 391 ] 392 ], 393 "tr": [ 394 (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) 395 for x in [ 396 ("&", " ve "), 397 ("@", " at "), 398 ("%", " yüzde "), 399 ("#", " diyez "), 400 ("$", " dolar "), 401 ("£", " sterlin "), 402 ("°", " derece "), 403 
] 404 ], 405 "hu": [ 406 (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) 407 for x in [ 408 ("&", " és "), 409 ("@", " kukac "), 410 ("%", " százalék "), 411 ("#", " kettőskereszt "), 412 ("$", " dollár "), 413 ("£", " font "), 414 ("°", " fok "), 415 ] 416 ], 417 "ko": [ 418 # Korean 419 (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) 420 for x in [ 421 ("&", " 그리고 "), 422 ("@", " 에 "), 423 ("%", " 퍼센트 "), 424 ("#", " 번호 "), 425 ("$", " 달러 "), 426 ("£", " 파운드 "), 427 ("°", " 도 "), 428 ] 429 ], 430} 431 432 433def expand_symbols_multilingual(text, lang="en"): 434 for regex, replacement in _symbols_multilingual[lang]: 435 text = re.sub(regex, replacement, text) 436 text = text.replace(" ", " ") # Ensure there are no double spaces 437 return text.strip() 438 439 440_ordinal_re = { 441 "en": re.compile(r"([0-9]+)(st|nd|rd|th)"), 442 "es": re.compile(r"([0-9]+)(º|ª|er|o|a|os|as)"), 443 "fr": re.compile(r"([0-9]+)(º|ª|er|re|e|ème)"), 444 "de": re.compile(r"([0-9]+)(st|nd|rd|th|º|ª|\.(?=\s|$))"), 445 "pt": re.compile(r"([0-9]+)(º|ª|o|a|os|as)"), 446 "it": re.compile(r"([0-9]+)(º|°|ª|o|a|i|e)"), 447 "pl": re.compile(r"([0-9]+)(º|ª|st|nd|rd|th)"), 448 "ar": re.compile(r"([0-9]+)(ون|ين|ث|ر|ى)"), 449 "cs": re.compile( 450 r"([0-9]+)\.(?=\s|$)" 451 ), # In Czech, a dot is often used after the number to indicate ordinals. 
452 "ru": re.compile(r"([0-9]+)(-й|-я|-е|-ое|-ье|-го)"), 453 "nl": re.compile(r"([0-9]+)(de|ste|e)"), 454 "tr": re.compile(r"([0-9]+)(\.|inci|nci|uncu|üncü|\.)"), 455 "hu": re.compile(r"([0-9]+)(\.|adik|edik|odik|edik|ödik|ödike|ik)"), 456 "ko": re.compile(r"([0-9]+)(번째|번|차|째)"), 457} 458_number_re = re.compile(r"[0-9]+") 459_currency_re = { 460 "USD": re.compile(r"((\$[0-9\.\,]*[0-9]+)|([0-9\.\,]*[0-9]+\$))"), 461 "GBP": re.compile(r"((£[0-9\.\,]*[0-9]+)|([0-9\.\,]*[0-9]+£))"), 462 "EUR": re.compile(r"(([0-9\.\,]*[0-9]+€)|((€[0-9\.\,]*[0-9]+)))"), 463} 464 465_comma_number_re = re.compile(r"\b\d{1,3}(,\d{3})*(\.\d+)?\b") 466_dot_number_re = re.compile(r"\b\d{1,3}(.\d{3})*(\,\d+)?\b") 467_decimal_number_re = re.compile(r"([0-9]+[.,][0-9]+)") 468 469 470def _remove_commas(m): 471 text = m.group(0) 472 if "," in text: 473 text = text.replace(",", "") 474 return text 475 476 477def _remove_dots(m): 478 text = m.group(0) 479 if "." in text: 480 text = text.replace(".", "") 481 return text 482 483 484def _expand_decimal_point(m, lang="en"): 485 amount = m.group(1).replace(",", ".") 486 return num2words(float(amount), lang=lang if lang != "cs" else "cz") 487 488 489def _expand_currency(m, lang="en", currency="USD"): 490 amount = float((re.sub(r"[^\d.]", "", m.group(0).replace(",", ".")))) 491 full_amount = num2words( 492 amount, to="currency", currency=currency, lang=lang if lang != "cs" else "cz" 493 ) 494 495 and_equivalents = { 496 "en": ", ", 497 "es": " con ", 498 "fr": " et ", 499 "de": " und ", 500 "pt": " e ", 501 "it": " e ", 502 "pl": ", ", 503 "cs": ", ", 504 "ru": ", ", 505 "nl": ", ", 506 "ar": ", ", 507 "tr": ", ", 508 "hu": ", ", 509 "ko": ", ", 510 } 511 512 if amount.is_integer(): 513 last_and = full_amount.rfind(and_equivalents[lang]) 514 if last_and != -1: 515 full_amount = full_amount[:last_and] 516 517 return full_amount 518 519 520def _expand_ordinal(m, lang="en"): 521 return num2words(int(m.group(1)), ordinal=True, lang=lang if lang != "cs" else 
"cz") 522 523 524def _expand_number(m, lang="en"): 525 return num2words(int(m.group(0)), lang=lang if lang != "cs" else "cz") 526 527 528def expand_numbers_multilingual(text, lang="en"): 529 if lang == "zh": 530 text = zh_num2words()(text) 531 else: 532 if lang in ["en", "ru"]: 533 text = re.sub(_comma_number_re, _remove_commas, text) 534 else: 535 text = re.sub(_dot_number_re, _remove_dots, text) 536 try: 537 text = re.sub( 538 _currency_re["GBP"], lambda m: _expand_currency(m, lang, "GBP"), text 539 ) 540 text = re.sub( 541 _currency_re["USD"], lambda m: _expand_currency(m, lang, "USD"), text 542 ) 543 text = re.sub( 544 _currency_re["EUR"], lambda m: _expand_currency(m, lang, "EUR"), text 545 ) 546 except: 547 pass 548 if lang != "tr": 549 text = re.sub( 550 _decimal_number_re, lambda m: _expand_decimal_point(m, lang), text 551 ) 552 text = re.sub(_ordinal_re[lang], lambda m: _expand_ordinal(m, lang), text) 553 text = re.sub(_number_re, lambda m: _expand_number(m, lang), text) 554 return text 555 556 557def lowercase(text): 558 return text.lower() 559 560 561def collapse_whitespace(text): 562 return re.sub(_whitespace_re, " ", text) 563 564 565def multilingual_cleaners(text, lang): 566 text = text.replace('"', "") 567 if lang == "tr": 568 text = text.replace("İ", "i") 569 text = text.replace("Ö", "ö") 570 text = text.replace("Ü", "ü") 571 text = lowercase(text) 572 try: 573 text = expand_numbers_multilingual(text, lang) 574 except: 575 pass 576 try: 577 text = expand_abbreviations_multilingual(text, lang) 578 except: 579 pass 580 try: 581 text = expand_symbols_multilingual(text, lang=lang) 582 except: 583 pass 584 text = collapse_whitespace(text) 585 return text 586 587 588def basic_cleaners(text): 589 """Basic pipeline that lowercases and collapses whitespace without transliteration.""" 590 text = lowercase(text) 591 text = collapse_whitespace(text) 592 return text 593 594 595def chinese_transliterate(text): 596 return "".join( 597 [ 598 p[0] 599 for p in 
pypinyin.pinyin( 600 text, 601 style=pypinyin.Style.TONE3, 602 heteronym=False, 603 neutral_tone_with_five=True, 604 ) 605 ] 606 ) 607 608 609def japanese_cleaners(text, katsu): 610 text = katsu.romaji(text) 611 text = lowercase(text) 612 return text 613 614 615def korean_transliterate(text): 616 r = Transliter(academic) 617 return r.translit(text) 618 619 620DEFAULT_VOCAB_FILE = os.path.join( 621 os.path.dirname(os.path.realpath(__file__)), "vocab.json" 622) 623 624 625class VoiceBpeTokenizer: 626 def __init__(self, vocab_file=DEFAULT_VOCAB_FILE): 627 self.tokenizer = None 628 if vocab_file is not None: 629 self.tokenizer = Tokenizer.from_file(vocab_file) 630 self.char_limits = { 631 "en": 10000, 632 "de": 253, 633 "fr": 273, 634 "es": 239, 635 "it": 213, 636 "pt": 203, 637 "pl": 224, 638 "zh": 82, 639 "ar": 166, 640 "cs": 186, 641 "ru": 182, 642 "nl": 251, 643 "tr": 226, 644 "ja": 71, 645 "hu": 224, 646 "ko": 95, 647 } 648 649 @cached_property 650 def katsu(self): 651 import cutlet 652 653 return cutlet.Cutlet() 654 655 def check_input_length(self, txt, lang): 656 lang = lang.split("-")[0] # remove the region 657 limit = self.char_limits.get(lang, 250) 658 # if len(txt) > limit: 659 # print( 660 # f"[!] Warning: The text length exceeds the character limit of {limit} for language '{lang}', this might cause truncated audio." 
661 # ) 662 663 def preprocess_text(self, txt, lang): 664 if lang in { 665 "ar", 666 "cs", 667 "de", 668 "en", 669 "es", 670 "fr", 671 "hu", 672 "it", 673 "nl", 674 "pl", 675 "pt", 676 "ru", 677 "tr", 678 "zh", 679 "ko", 680 }: 681 txt = multilingual_cleaners(txt, lang) 682 if lang == "zh": 683 txt = chinese_transliterate(txt) 684 if lang == "ko": 685 txt = korean_transliterate(txt) 686 elif lang == "ja": 687 txt = japanese_cleaners(txt, self.katsu) 688 elif lang == "hi": 689 # @manmay will implement this 690 txt = basic_cleaners(txt) 691 else: 692 raise NotImplementedError(f"Language '{lang}' is not supported.") 693 return txt 694 695 def encode(self, txt, lang): 696 lang = lang.split("-")[0] # remove the region 697 self.check_input_length(txt, lang) 698 txt = self.preprocess_text(txt, lang) 699 lang = "zh-cn" if lang == "zh" else lang 700 txt = f"[{lang}]{txt}" 701 txt = txt.replace(" ", "[SPACE]") 702 return self.tokenizer.encode(txt).ids 703 704 def decode(self, seq, skip_special_tokens=False): 705 if isinstance(seq, torch.Tensor): 706 seq = seq.cpu().numpy() 707 txt = self.tokenizer.decode(seq, skip_special_tokens=False).replace(" ", "") 708 txt = txt.replace("[SPACE]", " ") 709 txt = txt.replace("[STOP]", "") 710 # txt = txt.replace("[UNK]", "") 711 return txt 712 713 # copy from https://github.com/huggingface/transformers/blob/main/src/transformers/tokenization_utils_base.py#L3936 714 def batch_decode( 715 self, 716 sequences: Union[ 717 List[int], List[List[int]], "np.ndarray", "torch.Tensor", "tf.Tensor" 718 ], 719 skip_special_tokens: bool = False, 720 ) -> List[str]: 721 """ 722 Convert a list of lists of token ids into a list of strings by calling decode. 723 724 Args: 725 sequences (`Union[List[int], List[List[int]], np.ndarray, torch.Tensor, tf.Tensor]`): 726 List of tokenized input ids. Can be obtained using the `__call__` method. 
727 skip_special_tokens (`bool`, *optional*, defaults to `False`): 728 Whether or not to remove special tokens in the decoding. 729 kwargs (additional keyword arguments, *optional*): 730 Will be passed to the underlying model specific decode method. 731 732 Returns: 733 `List[str]`: The list of decoded sentences. 734 """ 735 return [self.decode(seq) for seq in sequences] 736 737 # https://github.com/coqui-ai/TTS/blob/dev/TTS/tts/layers/xtts/trainer/dataset.py#L202 738 # def pad(self): 739 740 def __len__(self): 741 return self.tokenizer.get_vocab_size() 742 743 def get_number_tokens(self): 744 return max(self.tokenizer.get_vocab().values()) + 1 745 746 747def test_expand_numbers_multilingual(): 748 test_cases = [ 749 # English 750 ("In 12.5 seconds.", "In twelve point five seconds.", "en"), 751 ("There were 50 soldiers.", "There were fifty soldiers.", "en"), 752 ("This is a 1st test", "This is a first test", "en"), 753 ("That will be $20 sir.", "That will be twenty dollars sir.", "en"), 754 ("That will be 20€ sir.", "That will be twenty euro sir.", "en"), 755 ( 756 "That will be 20.15€ sir.", 757 "That will be twenty euro, fifteen cents sir.", 758 "en", 759 ), 760 ("That's 100,000.5.", "That's one hundred thousand point five.", "en"), 761 # French 762 ("En 12,5 secondes.", "En douze virgule cinq secondes.", "fr"), 763 ("Il y avait 50 soldats.", "Il y avait cinquante soldats.", "fr"), 764 ("Ceci est un 1er test", "Ceci est un premier test", "fr"), 765 ( 766 "Cela vous fera $20 monsieur.", 767 "Cela vous fera vingt dollars monsieur.", 768 "fr", 769 ), 770 ("Cela vous fera 20€ monsieur.", "Cela vous fera vingt euros monsieur.", "fr"), 771 ( 772 "Cela vous fera 20,15€ monsieur.", 773 "Cela vous fera vingt euros et quinze centimes monsieur.", 774 "fr", 775 ), 776 ("Ce sera 100.000,5.", "Ce sera cent mille virgule cinq.", "fr"), 777 # German 778 ("In 12,5 Sekunden.", "In zwölf Komma fünf Sekunden.", "de"), 779 ("Es gab 50 Soldaten.", "Es gab fünfzig Soldaten.", "de"), 
780 ("Dies ist ein 1. Test", "Dies ist ein erste Test", "de"), # Issue with gender 781 ("Das macht $20 Herr.", "Das macht zwanzig Dollar Herr.", "de"), 782 ("Das macht 20€ Herr.", "Das macht zwanzig Euro Herr.", "de"), 783 ( 784 "Das macht 20,15€ Herr.", 785 "Das macht zwanzig Euro und fünfzehn Cent Herr.", 786 "de", 787 ), 788 # Spanish 789 ("En 12,5 segundos.", "En doce punto cinco segundos.", "es"), 790 ("Había 50 soldados.", "Había cincuenta soldados.", "es"), 791 ("Este es un 1er test", "Este es un primero test", "es"), 792 ("Eso le costará $20 señor.", "Eso le costará veinte dólares señor.", "es"), 793 ("Eso le costará 20€ señor.", "Eso le costará veinte euros señor.", "es"), 794 ( 795 "Eso le costará 20,15€ señor.", 796 "Eso le costará veinte euros con quince céntimos señor.", 797 "es", 798 ), 799 # Italian 800 ("In 12,5 secondi.", "In dodici virgola cinque secondi.", "it"), 801 ("C'erano 50 soldati.", "C'erano cinquanta soldati.", "it"), 802 ("Questo è un 1° test", "Questo è un primo test", "it"), 803 ("Ti costerà $20 signore.", "Ti costerà venti dollari signore.", "it"), 804 ("Ti costerà 20€ signore.", "Ti costerà venti euro signore.", "it"), 805 ( 806 "Ti costerà 20,15€ signore.", 807 "Ti costerà venti euro e quindici centesimi signore.", 808 "it", 809 ), 810 # Portuguese 811 ("Em 12,5 segundos.", "Em doze vírgula cinco segundos.", "pt"), 812 ("Havia 50 soldados.", "Havia cinquenta soldados.", "pt"), 813 ("Este é um 1º teste", "Este é um primeiro teste", "pt"), 814 ("Isso custará $20 senhor.", "Isso custará vinte dólares senhor.", "pt"), 815 ("Isso custará 20€ senhor.", "Isso custará vinte euros senhor.", "pt"), 816 ( 817 "Isso custará 20,15€ senhor.", 818 "Isso custará vinte euros e quinze cêntimos senhor.", 819 "pt", 820 ), # "cêntimos" should be "centavos" num2words issue 821 # Polish 822 ("W 12,5 sekundy.", "W dwanaście przecinek pięć sekundy.", "pl"), 823 ("Było 50 żołnierzy.", "Było pięćdziesiąt żołnierzy.", "pl"), 824 ( 825 "To będzie kosztować 20€ 
panie.", 826 "To będzie kosztować dwadzieścia euro panie.", 827 "pl", 828 ), 829 ( 830 "To będzie kosztować 20,15€ panie.", 831 "To będzie kosztować dwadzieścia euro, piętnaście centów panie.", 832 "pl", 833 ), 834 # Arabic 835 ("في الـ 12,5 ثانية.", "في الـ اثنا عشر , خمسون ثانية.", "ar"), 836 ("كان هناك 50 جنديًا.", "كان هناك خمسون جنديًا.", "ar"), 837 # ("ستكون النتيجة $20 يا سيد.", 'ستكون النتيجة عشرون دولار يا سيد.', 'ar'), # $ and € are mising from num2words 838 # ("ستكون النتيجة 20€ يا سيد.", 'ستكون النتيجة عشرون يورو يا سيد.', 'ar'), 839 # Czech 840 ("Za 12,5 vteřiny.", "Za dvanáct celá pět vteřiny.", "cs"), 841 ("Bylo tam 50 vojáků.", "Bylo tam padesát vojáků.", "cs"), 842 ("To bude stát 20€ pane.", "To bude stát dvacet euro pane.", "cs"), 843 ("To bude 20.15€ pane.", "To bude dvacet euro, patnáct centů pane.", "cs"), 844 # Russian 845 ("Через 12.5 секунды.", "Через двенадцать запятая пять секунды.", "ru"), 846 ("Там было 50 солдат.", "Там было пятьдесят солдат.", "ru"), 847 ( 848 "Это будет 20.15€ сэр.", 849 "Это будет двадцать евро, пятнадцать центов сэр.", 850 "ru", 851 ), 852 ( 853 "Это будет стоить 20€ господин.", 854 "Это будет стоить двадцать евро господин.", 855 "ru", 856 ), 857 # Dutch 858 ("In 12,5 seconden.", "In twaalf komma vijf seconden.", "nl"), 859 ("Er waren 50 soldaten.", "Er waren vijftig soldaten.", "nl"), 860 ("Dat wordt dan $20 meneer.", "Dat wordt dan twintig dollar meneer.", "nl"), 861 ("Dat wordt dan 20€ meneer.", "Dat wordt dan twintig euro meneer.", "nl"), 862 # Chinese (Simplified) 863 ("在12.5秒内", "在十二点五秒内", "zh"), 864 ("有50名士兵", "有五十名士兵", "zh"), 865 # ("那将是$20先生", '那将是二十美元先生', 'zh'), currency doesn't work 866 # ("那将是20€先生", '那将是二十欧元先生', 'zh'), 867 # Turkish 868 # ("12,5 saniye içinde.", 'On iki virgül beş saniye içinde.', 'tr'), # decimal doesn't work for TR 869 ("50 asker vardı.", "elli asker vardı.", "tr"), 870 ("Bu 1. 
test", "Bu birinci test", "tr"), 871 # ("Bu 100.000,5.", 'Bu yüz bin virgül beş.', 'tr'), 872 # Hungarian 873 ("12,5 másodperc alatt.", "tizenkettő egész öt tized másodperc alatt.", "hu"), 874 ("50 katona volt.", "ötven katona volt.", "hu"), 875 ("Ez az 1. teszt", "Ez az első teszt", "hu"), 876 # Korean 877 ("12.5 초 안에.", "십이 점 다섯 초 안에.", "ko"), 878 ("50 명의 병사가 있었다.", "오십 명의 병사가 있었다.", "ko"), 879 ("이것은 1 번째 테스트입니다", "이것은 첫 번째 테스트입니다", "ko"), 880 ] 881 for a, b, lang in test_cases: 882 out = expand_numbers_multilingual(a, lang=lang) 883 assert out == b, f"'{out}' vs '{b}'" 884 885 886def test_abbreviations_multilingual(): 887 test_cases = [ 888 # English 889 ("Hello Mr. Smith.", "Hello mister Smith.", "en"), 890 ("Dr. Jones is here.", "doctor Jones is here.", "en"), 891 # Spanish 892 ("Hola Sr. Garcia.", "Hola señor Garcia.", "es"), 893 ("La Dra. Martinez es muy buena.", "La doctora Martinez es muy buena.", "es"), 894 # French 895 ("Bonjour Mr. Dupond.", "Bonjour monsieur Dupond.", "fr"), 896 ( 897 "Mme. Moreau est absente aujourd'hui.", 898 "madame Moreau est absente aujourd'hui.", 899 "fr", 900 ), 901 # German 902 ("Frau Dr. Müller ist sehr klug.", "Frau doktor Müller ist sehr klug.", "de"), 903 # Portuguese 904 ("Olá Sr. Silva.", "Olá senhor Silva.", "pt"), 905 ( 906 "Dra. Costa, você está disponível?", 907 "doutora Costa, você está disponível?", 908 "pt", 909 ), 910 # Italian 911 ("Buongiorno, Sig. Rossi.", "Buongiorno, signore Rossi.", "it"), 912 # ("Sig.ra Bianchi, posso aiutarti?", 'signora Bianchi, posso aiutarti?', 'it'), # Issue with matching that pattern 913 # Polish 914 ("Dzień dobry, P. Kowalski.", "Dzień dobry, pani Kowalski.", "pl"), 915 ( 916 "M. Nowak, czy mogę zadać pytanie?", 917 "pan Nowak, czy mogę zadać pytanie?", 918 "pl", 919 ), 920 # Czech 921 ("P. Novák", "pan Novák", "cs"), 922 ("Dr. Vojtěch", "doktor Vojtěch", "cs"), 923 # Dutch 924 ("Dhr. Jansen", "de heer Jansen", "nl"), 925 ("Mevr. 
de Vries", "mevrouw de Vries", "nl"), 926 # Russian 927 ("Здравствуйте Г-н Иванов.", "Здравствуйте господин Иванов.", "ru"), 928 ( 929 "Д-р Смирнов здесь, чтобы увидеть вас.", 930 "доктор Смирнов здесь, чтобы увидеть вас.", 931 "ru", 932 ), 933 # Turkish 934 ("Merhaba B. Yılmaz.", "Merhaba bay Yılmaz.", "tr"), 935 ("Dr. Ayşe burada.", "doktor Ayşe burada.", "tr"), 936 # Hungarian 937 ("Dr. Szabó itt van.", "doktor Szabó itt van.", "hu"), 938 ] 939 940 for a, b, lang in test_cases: 941 out = expand_abbreviations_multilingual(a, lang=lang) 942 assert out == b, f"'{out}' vs '{b}'" 943 944 945def test_symbols_multilingual(): 946 test_cases = [ 947 ("I have 14% battery", "I have 14 percent battery", "en"), 948 ("Te veo @ la fiesta", "Te veo arroba la fiesta", "es"), 949 ("J'ai 14° de fièvre", "J'ai 14 degrés de fièvre", "fr"), 950 ("Die Rechnung beträgt £ 20", "Die Rechnung beträgt pfund 20", "de"), 951 ( 952 "O meu email é ana&joao@gmail.com", 953 "O meu email é ana e joao arroba gmail.com", 954 "pt", 955 ), 956 ( 957 "linguaggio di programmazione C#", 958 "linguaggio di programmazione C cancelletto", 959 "it", 960 ), 961 ("Moja temperatura to 36.6°", "Moja temperatura to 36.6 stopnie", "pl"), 962 ("Mám 14% baterie", "Mám 14 procento baterie", "cs"), 963 ("Těším se na tebe @ party", "Těším se na tebe na party", "cs"), 964 ("У меня 14% заряда", "У меня 14 процентов заряда", "ru"), 965 ("Я буду @ дома", "Я буду собака дома", "ru"), 966 ("Ik heb 14% batterij", "Ik heb 14 procent batterij", "nl"), 967 ("Ik zie je @ het feest", "Ik zie je bij het feest", "nl"), 968 ("لدي 14% في البطارية", "لدي 14 في المئة في البطارية", "ar"), 969 ("我的电量为 14%", "我的电量为 14 百分之", "zh"), 970 ("Pilim %14 dolu.", "Pilim yüzde 14 dolu.", "tr"), 971 ( 972 "Az akkumulátorom töltöttsége 14%", 973 "Az akkumulátorom töltöttsége 14 százalék", 974 "hu", 975 ), 976 ("배터리 잔량이 14%입니다.", "배터리 잔량이 14 퍼센트입니다.", "ko"), 977 ] 978 979 for a, b, lang in test_cases: 980 out = expand_symbols_multilingual(a, 
lang=lang) 981 assert out == b, f"'{out}' vs '{b}'" 982 983 984if __name__ == "__main__": 985 test_expand_numbers_multilingual() 986 test_abbreviations_multilingual() 987 test_symbols_multilingual()
def
get_spacy_lang(lang):
def
split_sentence(text, lang, text_split_length=250):
def split_sentence(text, lang, text_split_length=250):
    """Split ``text`` into chunks of at most ``text_split_length`` characters.

    Text shorter than the limit (or any text when the limit is ``None``) is
    returned as a single left-stripped chunk. Longer text is cut into
    sentences with the spaCy pipeline returned by ``get_spacy_lang(lang)``
    plus a ``sentencizer`` pipe; consecutive sentences are packed together
    while they fit, and a single sentence longer than the limit is hard
    wrapped with ``textwrap.wrap``.

    Args:
        text: The input string.
        lang: Language code handed to ``get_spacy_lang``.
        text_split_length: Maximum chunk length in characters, or ``None``
            to disable splitting.

    Returns:
        A list of text chunks.
    """
    # Short input, or splitting disabled: hand the text back as one chunk.
    if text_split_length is None or len(text) < text_split_length:
        return [text.lstrip()]

    nlp = get_spacy_lang(lang)
    nlp.add_pipe("sentencizer")

    # Start with an empty chunk so the first sentence has something to merge into.
    chunks = [""]
    for sent in nlp(text).sents:
        sentence = str(sent)
        current = chunks[-1]
        # Count the joining space as well, otherwise a merged chunk could end
        # up one character longer than the limit.
        separator = 1 if current else 0
        if len(current) + separator + len(sentence) <= text_split_length:
            chunks[-1] = (current + " " + sentence).lstrip()
        elif len(sentence) > text_split_length:
            # A single sentence that is too long gets wrapped on whitespace.
            chunks.extend(
                textwrap.wrap(
                    sentence,
                    width=text_split_length,
                    drop_whitespace=True,
                    break_on_hyphens=False,
                    tabsize=1,
                )
            )
        else:
            chunks.append(sentence)

    # Drop the placeholder if the first sentence never merged into it.
    if len(chunks) > 1 and chunks[0] == "":
        del chunks[0]

    return chunks
Preprocess the input text
def
expand_abbreviations_multilingual(text, lang='en'):
def
expand_symbols_multilingual(text, lang='en'):
def
expand_numbers_multilingual(text, lang='en'):
def expand_numbers_multilingual(text, lang="en"):
    """Spell out the numbers found in ``text`` as words for language ``lang``.

    Chinese is delegated entirely to ``zh_num2words``. For every other
    language the steps run in this order: strip digit-group separators,
    expand currency amounts (best effort), expand decimals (skipped for
    Turkish), expand ordinals, then expand any remaining bare integers.

    Args:
        text: The input string.
        lang: Language code; must be ``"zh"`` or a key of ``_ordinal_re``.

    Returns:
        The text with numbers replaced by words.

    Raises:
        KeyError: If ``lang`` is not ``"zh"`` and has no entry in
            ``_ordinal_re``.
    """
    if lang == "zh":
        return zh_num2words()(text)

    # Remove digit-group separators first: commas for en/ru, dots otherwise.
    if lang in ["en", "ru"]:
        text = re.sub(_comma_number_re, _remove_commas, text)
    else:
        text = re.sub(_dot_number_re, _remove_dots, text)

    # Currency expansion is deliberately best-effort: if it fails for any
    # currency, the remaining currencies are skipped and the amounts fall
    # through to the plain number handling below. Only ``Exception`` is
    # swallowed so KeyboardInterrupt / SystemExit still propagate.
    try:
        for currency in ("GBP", "USD", "EUR"):
            text = re.sub(
                _currency_re[currency],
                lambda m, currency=currency: _expand_currency(m, lang, currency),
                text,
            )
    except Exception:
        pass

    # NOTE(review): decimals are skipped for Turkish, presumably because the
    # decimal expansion does not work for that language; confirm.
    if lang != "tr":
        text = re.sub(
            _decimal_number_re, lambda m: _expand_decimal_point(m, lang), text
        )
    text = re.sub(_ordinal_re[lang], lambda m: _expand_ordinal(m, lang), text)
    text = re.sub(_number_re, lambda m: _expand_number(m, lang), text)
    return text
def
lowercase(text):
def
collapse_whitespace(text):
def
multilingual_cleaners(text, lang):
def multilingual_cleaners(text, lang):
    """Normalize ``text`` for tokenization in language ``lang``.

    Removes double quotes, lowercases, then expands numbers, abbreviations
    and symbols, and finally collapses whitespace runs into single spaces.
    Each expansion step is best-effort: if it raises, the text is left as
    it was before that step and processing continues.

    Args:
        text: The input string.
        lang: Language code passed to the expansion helpers.

    Returns:
        The cleaned text.
    """
    text = text.replace('"', "")
    if lang == "tr":
        # Map these capitals explicitly before the generic lowercase() call;
        # presumably because str.lower() turns "İ" into "i" plus a combining
        # dot rather than a plain "i".
        text = text.replace("İ", "i")
        text = text.replace("Ö", "ö")
        text = text.replace("Ü", "ü")
    text = lowercase(text)

    # Catch Exception rather than using a bare except, so that
    # KeyboardInterrupt / SystemExit are not silently swallowed.
    try:
        text = expand_numbers_multilingual(text, lang)
    except Exception:
        pass
    try:
        text = expand_abbreviations_multilingual(text, lang)
    except Exception:
        pass
    try:
        text = expand_symbols_multilingual(text, lang=lang)
    except Exception:
        pass

    text = collapse_whitespace(text)
    return text
def
basic_cleaners(text):
def basic_cleaners(text):
    """Lowercase ``text`` and collapse its whitespace; no transliteration is done."""
    lowered = lowercase(text)
    return collapse_whitespace(lowered)
Basic pipeline that lowercases and collapses whitespace without transliteration.
def
chinese_transliterate(text):
def
japanese_cleaners(text, katsu):
def
korean_transliterate(text):
DEFAULT_VOCAB_FILE =
'/Users/e6d64/Documents/GitHub/darkshapes/divisor/divisor/acestep/models/lyrics_utils/vocab.json'
class
VoiceBpeTokenizer:
class VoiceBpeTokenizer:
    """BPE tokenizer that cleans text per language before encoding it.

    Wraps a ``tokenizers.Tokenizer`` loaded from a vocabulary file. Text is
    cleaned with a language-specific pipeline, prefixed with a ``[lang]`` tag,
    and has its spaces replaced by the literal ``[SPACE]`` marker before it is
    encoded.
    """

    def __init__(self, vocab_file=DEFAULT_VOCAB_FILE):
        """Load the tokenizer from ``vocab_file``.

        Args:
            vocab_file: Path to the tokenizer file. If ``None`` no tokenizer
                is loaded and ``self.tokenizer`` stays ``None``.
        """
        self.tokenizer = None
        if vocab_file is not None:
            self.tokenizer = Tokenizer.from_file(vocab_file)
        # Per-language character limits. Nothing in this class enforces them.
        self.char_limits = {
            "en": 10000,
            "de": 253,
            "fr": 273,
            "es": 239,
            "it": 213,
            "pt": 203,
            "pl": 224,
            "zh": 82,
            "ar": 166,
            "cs": 186,
            "ru": 182,
            "nl": 251,
            "tr": 226,
            "ja": 71,
            "hu": 224,
            "ko": 95,
        }

    @cached_property
    def katsu(self):
        """Return a ``cutlet.Cutlet`` instance, created on first access.

        ``cutlet`` is imported lazily so it is only needed when this
        property is actually used (Japanese text).
        """
        import cutlet

        return cutlet.Cutlet()

    def check_input_length(self, txt, lang):
        """Do nothing; kept so existing callers keep working.

        The length warning this method used to print is disabled, so no
        check is performed and ``None`` is returned.
        """
        return None

    def preprocess_text(self, txt, lang):
        """Clean ``txt`` with the pipeline for ``lang``.

        Args:
            txt: The input string.
            lang: Language code without a region suffix.

        Returns:
            The cleaned (and, for zh/ko/ja, transliterated) text.

        Raises:
            NotImplementedError: If ``lang`` is not supported.
        """
        if lang in {
            "ar",
            "cs",
            "de",
            "en",
            "es",
            "fr",
            "hu",
            "it",
            "nl",
            "pl",
            "pt",
            "ru",
            "tr",
            "zh",
            "ko",
        }:
            txt = multilingual_cleaners(txt, lang)
            # Chinese and Korean are additionally transliterated after cleaning.
            if lang == "zh":
                txt = chinese_transliterate(txt)
            if lang == "ko":
                txt = korean_transliterate(txt)
        elif lang == "ja":
            txt = japanese_cleaners(txt, self.katsu)
        elif lang == "hi":
            # No dedicated Hindi pipeline yet; only basic cleaning is applied.
            txt = basic_cleaners(txt)
        else:
            raise NotImplementedError(f"Language '{lang}' is not supported.")
        return txt

    def encode(self, txt, lang):
        """Clean ``txt`` and encode it to a list of token ids.

        Args:
            txt: The input string.
            lang: Language code; any region suffix after ``-`` is dropped.

        Returns:
            The ``ids`` of the tokenizer's encoding.
        """
        lang = lang.split("-")[0]  # remove the region
        self.check_input_length(txt, lang)
        txt = self.preprocess_text(txt, lang)
        # Chinese is tagged "[zh-cn]" rather than "[zh]".
        lang = "zh-cn" if lang == "zh" else lang
        txt = f"[{lang}]{txt}"
        txt = txt.replace(" ", "[SPACE]")
        return self.tokenizer.encode(txt).ids

    def decode(self, seq, skip_special_tokens=False):
        """Decode token ids back to text.

        Args:
            seq: Token ids; a ``torch.Tensor`` is converted to a NumPy array
                first.
            skip_special_tokens: Accepted for API compatibility but ignored;
                the tokenizer is always called with
                ``skip_special_tokens=False``.

        Returns:
            The decoded text with ``[SPACE]`` turned back into spaces and
            ``[STOP]`` removed.
        """
        if isinstance(seq, torch.Tensor):
            seq = seq.cpu().numpy()
        # Spaces in the raw decode output are dropped; real spaces are
        # restored from the [SPACE] marker on the next line.
        txt = self.tokenizer.decode(seq, skip_special_tokens=False).replace(" ", "")
        txt = txt.replace("[SPACE]", " ")
        txt = txt.replace("[STOP]", "")
        return txt

    # copy from https://github.com/huggingface/transformers/blob/main/src/transformers/tokenization_utils_base.py#L3936
    def batch_decode(
        self,
        sequences: Union[
            List[int], List[List[int]], "np.ndarray", "torch.Tensor", "tf.Tensor"
        ],
        skip_special_tokens: bool = False,
    ) -> List[str]:
        """
        Convert a list of lists of token ids into a list of strings by calling decode.

        Args:
            sequences (`Union[List[int], List[List[int]], np.ndarray, torch.Tensor, tf.Tensor]`):
                List of tokenized input ids.
            skip_special_tokens (`bool`, *optional*, defaults to `False`):
                Forwarded to `decode`, which currently ignores it.

        Returns:
            `List[str]`: The list of decoded sentences.
        """
        return [
            self.decode(seq, skip_special_tokens=skip_special_tokens)
            for seq in sequences
        ]

    def __len__(self):
        """Return the tokenizer's vocabulary size."""
        return self.tokenizer.get_vocab_size()

    def get_number_tokens(self):
        """Return the highest token id in the vocabulary plus one."""
        return max(self.tokenizer.get_vocab().values()) + 1
VoiceBpeTokenizer( vocab_file='/Users/e6d64/Documents/GitHub/darkshapes/divisor/divisor/acestep/models/lyrics_utils/vocab.json')
def __init__(self, vocab_file=DEFAULT_VOCAB_FILE):
    """Load the vocabulary (if a path is given) and set the character limits.

    Args:
        vocab_file: Path handed to `Tokenizer.from_file`. When `None`, no
            tokenizer is loaded and `self.tokenizer` stays `None`.
    """
    self.tokenizer = None
    if vocab_file is not None:
        self.tokenizer = Tokenizer.from_file(vocab_file)

    # Character limits keyed by base language code; `check_input_length`
    # looks these up and falls back to 250 for codes that are not listed.
    self.char_limits = dict(
        en=10000,
        de=253,
        fr=273,
        es=239,
        it=213,
        pt=203,
        pl=224,
        zh=82,
        ar=166,
        cs=186,
        ru=182,
        nl=251,
        tr=226,
        ja=71,
        hu=224,
        ko=95,
    )
def
check_input_length(self, txt, lang):
def check_input_length(self, txt, lang):
    """Look up the character limit for `lang`; currently emits nothing.

    The over-length warning that used this limit is disabled, so the method
    returns `None` without any observable effect. `txt` is not inspected.

    Args:
        txt: Text that would be measured against the limit.
        lang: Language code; anything after the first "-" is dropped.
    """
    base_lang = lang.partition("-")[0]  # remove the region
    # Lookup kept for the disabled warning: limits default to 250 characters.
    _limit = self.char_limits.get(base_lang, 250)
def
preprocess_text(self, txt, lang):
def preprocess_text(self, txt, lang):
    """Clean `txt` with the cleaner that matches `lang`.

    Args:
        txt: Text to clean.
        lang: Base language code (no region suffix).

    Returns:
        The cleaned text.

    Raises:
        NotImplementedError: If `lang` has no cleaner branch here.
    """
    multilingual_langs = {
        "ar",
        "cs",
        "de",
        "en",
        "es",
        "fr",
        "hu",
        "it",
        "nl",
        "pl",
        "pt",
        "ru",
        "tr",
        "zh",
        "ko",
    }

    if lang in multilingual_langs:
        cleaned = multilingual_cleaners(txt, lang)
        # Chinese and Korean get an extra transliteration pass.
        if lang == "zh":
            return chinese_transliterate(cleaned)
        if lang == "ko":
            return korean_transliterate(cleaned)
        return cleaned

    if lang == "ja":
        return japanese_cleaners(txt, self.katsu)

    if lang == "hi":
        # @manmay will implement this
        return basic_cleaners(txt)

    raise NotImplementedError(f"Language '{lang}' is not supported.")
def
encode(self, txt, lang):
def encode(self, txt, lang):
    """Clean `txt`, tag it with its language and return the token ids.

    Args:
        txt: Text to encode.
        lang: Language code; anything after the first "-" is dropped.

    Returns:
        The `.ids` of the tokenizer's encoding of the tagged text.
    """
    base_lang = lang.split("-")[0]  # remove the region
    self.check_input_length(txt, base_lang)
    cleaned = self.preprocess_text(txt, base_lang)

    # Chinese is tagged as "[zh-cn]" rather than "[zh]".
    if base_lang == "zh":
        tag = "zh-cn"
    else:
        tag = base_lang

    # Spaces are carried through the vocabulary as a literal [SPACE] marker.
    tagged = f"[{tag}]{cleaned}".replace(" ", "[SPACE]")
    return self.tokenizer.encode(tagged).ids
def
decode(self, seq, skip_special_tokens=False):
def decode(self, seq, skip_special_tokens=False):
    """Turn a sequence of token ids back into text.

    Args:
        seq: Token ids; a `torch.Tensor` is moved to CPU and converted to a
            NumPy array before it reaches the tokenizer.
        skip_special_tokens: Accepted but currently unused; the tokenizer is
            always called with `skip_special_tokens=False`.

    Returns:
        The decoded string with literal spaces removed, `[SPACE]` mapped back
        to a space and `[STOP]` removed.
    """
    token_ids = seq.cpu().numpy() if isinstance(seq, torch.Tensor) else seq
    raw = self.tokenizer.decode(token_ids, skip_special_tokens=False)

    # Drop every literal space first: real spaces travel as [SPACE] markers
    # and are restored by the substitution below.
    text = raw.replace(" ", "")
    for marker, replacement in (("[SPACE]", " "), ("[STOP]", "")):
        text = text.replace(marker, replacement)
    # "[UNK]" is intentionally left in the output.
    return text
def
batch_decode( self, sequences: Union[List[int], List[List[int]], ForwardRef('np.ndarray'), ForwardRef('torch.Tensor'), ForwardRef('tf.Tensor')], skip_special_tokens: bool = False) -> List[str]:
def batch_decode(
    self,
    sequences: Union[
        List[int], List[List[int]], "np.ndarray", "torch.Tensor", "tf.Tensor"
    ],
    skip_special_tokens: bool = False,
) -> List[str]:
    """
    Convert a list of lists of token ids into a list of strings by calling decode.

    Args:
        sequences (`Union[List[int], List[List[int]], np.ndarray, torch.Tensor, tf.Tensor]`):
            Tokenized input ids; each element is passed to `decode`.
        skip_special_tokens (`bool`, *optional*, defaults to `False`):
            Forwarded to `decode` for every sequence, so the result follows
            whatever `decode` does with the flag.

    Returns:
        `List[str]`: The list of decoded sentences, in input order.
    """
    # Forward the flag instead of silently dropping it.
    return [
        self.decode(seq, skip_special_tokens=skip_special_tokens)
        for seq in sequences
    ]
Convert a list of lists of token ids into a list of strings by calling decode.
Args:
sequences (Union[List[int], List[List[int]], np.ndarray, torch.Tensor, tf.Tensor]):
List of tokenized input ids. Can be obtained using the __call__ method.
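skip_special_tokens (bool, optional, defaults to False):
Intended to control removal of special tokens. It currently has no effect: decode always calls the tokenizer with skip_special_tokens=False, then only maps [SPACE] back to a space and strips [STOP].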
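kwargs (additional keyword arguments, optional):
Not accepted by this method. This entry is left over from the Hugging Face docstring the method was copied from; in that original they are passed to the underlying model specific decode method.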
Returns:
List[str]: The list of decoded sentences.
def
test_expand_numbers_multilingual():
def test_expand_numbers_multilingual():
    """Check `expand_numbers_multilingual` against fixed per-language examples.

    Each case is `(input text, expected expansion, language code)`. The first
    mismatch fails the test with both strings in the assertion message.
    """
    cases = [
        # English
        ("In 12.5 seconds.", "In twelve point five seconds.", "en"),
        ("There were 50 soldiers.", "There were fifty soldiers.", "en"),
        ("This is a 1st test", "This is a first test", "en"),
        ("That will be $20 sir.", "That will be twenty dollars sir.", "en"),
        ("That will be 20€ sir.", "That will be twenty euro sir.", "en"),
        (
            "That will be 20.15€ sir.",
            "That will be twenty euro, fifteen cents sir.",
            "en",
        ),
        ("That's 100,000.5.", "That's one hundred thousand point five.", "en"),
        # French
        ("En 12,5 secondes.", "En douze virgule cinq secondes.", "fr"),
        ("Il y avait 50 soldats.", "Il y avait cinquante soldats.", "fr"),
        ("Ceci est un 1er test", "Ceci est un premier test", "fr"),
        (
            "Cela vous fera $20 monsieur.",
            "Cela vous fera vingt dollars monsieur.",
            "fr",
        ),
        ("Cela vous fera 20€ monsieur.", "Cela vous fera vingt euros monsieur.", "fr"),
        (
            "Cela vous fera 20,15€ monsieur.",
            "Cela vous fera vingt euros et quinze centimes monsieur.",
            "fr",
        ),
        ("Ce sera 100.000,5.", "Ce sera cent mille virgule cinq.", "fr"),
        # German
        ("In 12,5 Sekunden.", "In zwölf Komma fünf Sekunden.", "de"),
        ("Es gab 50 Soldaten.", "Es gab fünfzig Soldaten.", "de"),
        ("Dies ist ein 1. Test", "Dies ist ein erste Test", "de"),  # Issue with gender
        ("Das macht $20 Herr.", "Das macht zwanzig Dollar Herr.", "de"),
        ("Das macht 20€ Herr.", "Das macht zwanzig Euro Herr.", "de"),
        (
            "Das macht 20,15€ Herr.",
            "Das macht zwanzig Euro und fünfzehn Cent Herr.",
            "de",
        ),
        # Spanish
        ("En 12,5 segundos.", "En doce punto cinco segundos.", "es"),
        ("Había 50 soldados.", "Había cincuenta soldados.", "es"),
        ("Este es un 1er test", "Este es un primero test", "es"),
        ("Eso le costará $20 señor.", "Eso le costará veinte dólares señor.", "es"),
        ("Eso le costará 20€ señor.", "Eso le costará veinte euros señor.", "es"),
        (
            "Eso le costará 20,15€ señor.",
            "Eso le costará veinte euros con quince céntimos señor.",
            "es",
        ),
        # Italian
        ("In 12,5 secondi.", "In dodici virgola cinque secondi.", "it"),
        ("C'erano 50 soldati.", "C'erano cinquanta soldati.", "it"),
        ("Questo è un 1° test", "Questo è un primo test", "it"),
        ("Ti costerà $20 signore.", "Ti costerà venti dollari signore.", "it"),
        ("Ti costerà 20€ signore.", "Ti costerà venti euro signore.", "it"),
        (
            "Ti costerà 20,15€ signore.",
            "Ti costerà venti euro e quindici centesimi signore.",
            "it",
        ),
        # Portuguese
        ("Em 12,5 segundos.", "Em doze vírgula cinco segundos.", "pt"),
        ("Havia 50 soldados.", "Havia cinquenta soldados.", "pt"),
        ("Este é um 1º teste", "Este é um primeiro teste", "pt"),
        ("Isso custará $20 senhor.", "Isso custará vinte dólares senhor.", "pt"),
        ("Isso custará 20€ senhor.", "Isso custará vinte euros senhor.", "pt"),
        (
            "Isso custará 20,15€ senhor.",
            "Isso custará vinte euros e quinze cêntimos senhor.",
            "pt",
        ),  # "cêntimos" should be "centavos" num2words issue
        # Polish
        ("W 12,5 sekundy.", "W dwanaście przecinek pięć sekundy.", "pl"),
        ("Było 50 żołnierzy.", "Było pięćdziesiąt żołnierzy.", "pl"),
        (
            "To będzie kosztować 20€ panie.",
            "To będzie kosztować dwadzieścia euro panie.",
            "pl",
        ),
        (
            "To będzie kosztować 20,15€ panie.",
            "To będzie kosztować dwadzieścia euro, piętnaście centów panie.",
            "pl",
        ),
        # Arabic
        ("في الـ 12,5 ثانية.", "في الـ اثنا عشر , خمسون ثانية.", "ar"),
        ("كان هناك 50 جنديًا.", "كان هناك خمسون جنديًا.", "ar"),
        # ("ستكون النتيجة $20 يا سيد.", 'ستكون النتيجة عشرون دولار يا سيد.', 'ar'), # $ and € are missing from num2words
        # ("ستكون النتيجة 20€ يا سيد.", 'ستكون النتيجة عشرون يورو يا سيد.', 'ar'),
        # Czech
        ("Za 12,5 vteřiny.", "Za dvanáct celá pět vteřiny.", "cs"),
        ("Bylo tam 50 vojáků.", "Bylo tam padesát vojáků.", "cs"),
        ("To bude stát 20€ pane.", "To bude stát dvacet euro pane.", "cs"),
        ("To bude 20.15€ pane.", "To bude dvacet euro, patnáct centů pane.", "cs"),
        # Russian
        ("Через 12.5 секунды.", "Через двенадцать запятая пять секунды.", "ru"),
        ("Там было 50 солдат.", "Там было пятьдесят солдат.", "ru"),
        (
            "Это будет 20.15€ сэр.",
            "Это будет двадцать евро, пятнадцать центов сэр.",
            "ru",
        ),
        (
            "Это будет стоить 20€ господин.",
            "Это будет стоить двадцать евро господин.",
            "ru",
        ),
        # Dutch
        ("In 12,5 seconden.", "In twaalf komma vijf seconden.", "nl"),
        ("Er waren 50 soldaten.", "Er waren vijftig soldaten.", "nl"),
        ("Dat wordt dan $20 meneer.", "Dat wordt dan twintig dollar meneer.", "nl"),
        ("Dat wordt dan 20€ meneer.", "Dat wordt dan twintig euro meneer.", "nl"),
        # Chinese (Simplified)
        ("在12.5秒内", "在十二点五秒内", "zh"),
        ("有50名士兵", "有五十名士兵", "zh"),
        # ("那将是$20先生", '那将是二十美元先生', 'zh'), currency doesn't work
        # ("那将是20€先生", '那将是二十欧元先生', 'zh'),
        # Turkish
        # ("12,5 saniye içinde.", 'On iki virgül beş saniye içinde.', 'tr'), # decimal doesn't work for TR
        ("50 asker vardı.", "elli asker vardı.", "tr"),
        ("Bu 1. test", "Bu birinci test", "tr"),
        # ("Bu 100.000,5.", 'Bu yüz bin virgül beş.', 'tr'),
        # Hungarian
        ("12,5 másodperc alatt.", "tizenkettő egész öt tized másodperc alatt.", "hu"),
        ("50 katona volt.", "ötven katona volt.", "hu"),
        ("Ez az 1. teszt", "Ez az első teszt", "hu"),
        # Korean
        ("12.5 초 안에.", "십이 점 다섯 초 안에.", "ko"),
        ("50 명의 병사가 있었다.", "오십 명의 병사가 있었다.", "ko"),
        ("이것은 1 번째 테스트입니다", "이것은 첫 번째 테스트입니다", "ko"),
    ]

    for source, expected, lang in cases:
        actual = expand_numbers_multilingual(source, lang=lang)
        assert actual == expected, f"'{actual}' vs '{expected}'"
def
test_abbreviations_multilingual():
def test_abbreviations_multilingual():
    """Check `expand_abbreviations_multilingual` against fixed examples.

    Each case is `(input text, expected expansion, language code)`. The first
    mismatch fails the test with both strings in the assertion message.
    """
    cases = [
        # English
        ("Hello Mr. Smith.", "Hello mister Smith.", "en"),
        ("Dr. Jones is here.", "doctor Jones is here.", "en"),
        # Spanish
        ("Hola Sr. Garcia.", "Hola señor Garcia.", "es"),
        ("La Dra. Martinez es muy buena.", "La doctora Martinez es muy buena.", "es"),
        # French
        ("Bonjour Mr. Dupond.", "Bonjour monsieur Dupond.", "fr"),
        (
            "Mme. Moreau est absente aujourd'hui.",
            "madame Moreau est absente aujourd'hui.",
            "fr",
        ),
        # German
        ("Frau Dr. Müller ist sehr klug.", "Frau doktor Müller ist sehr klug.", "de"),
        # Portuguese
        ("Olá Sr. Silva.", "Olá senhor Silva.", "pt"),
        (
            "Dra. Costa, você está disponível?",
            "doutora Costa, você está disponível?",
            "pt",
        ),
        # Italian
        ("Buongiorno, Sig. Rossi.", "Buongiorno, signore Rossi.", "it"),
        # ("Sig.ra Bianchi, posso aiutarti?", 'signora Bianchi, posso aiutarti?', 'it'), # Issue with matching that pattern
        # Polish
        ("Dzień dobry, P. Kowalski.", "Dzień dobry, pani Kowalski.", "pl"),
        (
            "M. Nowak, czy mogę zadać pytanie?",
            "pan Nowak, czy mogę zadać pytanie?",
            "pl",
        ),
        # Czech
        ("P. Novák", "pan Novák", "cs"),
        ("Dr. Vojtěch", "doktor Vojtěch", "cs"),
        # Dutch
        ("Dhr. Jansen", "de heer Jansen", "nl"),
        ("Mevr. de Vries", "mevrouw de Vries", "nl"),
        # Russian
        ("Здравствуйте Г-н Иванов.", "Здравствуйте господин Иванов.", "ru"),
        (
            "Д-р Смирнов здесь, чтобы увидеть вас.",
            "доктор Смирнов здесь, чтобы увидеть вас.",
            "ru",
        ),
        # Turkish
        ("Merhaba B. Yılmaz.", "Merhaba bay Yılmaz.", "tr"),
        ("Dr. Ayşe burada.", "doktor Ayşe burada.", "tr"),
        # Hungarian
        ("Dr. Szabó itt van.", "doktor Szabó itt van.", "hu"),
    ]

    for source, expected, lang in cases:
        actual = expand_abbreviations_multilingual(source, lang=lang)
        assert actual == expected, f"'{actual}' vs '{expected}'"
def
test_symbols_multilingual():
def test_symbols_multilingual():
    """Check `expand_symbols_multilingual` against fixed per-language examples.

    Each case is `(input text, expected expansion, language code)`. The first
    mismatch fails the test with both strings in the assertion message.
    """
    cases = [
        ("I have 14% battery", "I have 14 percent battery", "en"),
        ("Te veo @ la fiesta", "Te veo arroba la fiesta", "es"),
        ("J'ai 14° de fièvre", "J'ai 14 degrés de fièvre", "fr"),
        ("Die Rechnung beträgt £ 20", "Die Rechnung beträgt pfund 20", "de"),
        (
            "O meu email é ana&joao@gmail.com",
            "O meu email é ana e joao arroba gmail.com",
            "pt",
        ),
        (
            "linguaggio di programmazione C#",
            "linguaggio di programmazione C cancelletto",
            "it",
        ),
        ("Moja temperatura to 36.6°", "Moja temperatura to 36.6 stopnie", "pl"),
        ("Mám 14% baterie", "Mám 14 procento baterie", "cs"),
        ("Těším se na tebe @ party", "Těším se na tebe na party", "cs"),
        ("У меня 14% заряда", "У меня 14 процентов заряда", "ru"),
        ("Я буду @ дома", "Я буду собака дома", "ru"),
        ("Ik heb 14% batterij", "Ik heb 14 procent batterij", "nl"),
        ("Ik zie je @ het feest", "Ik zie je bij het feest", "nl"),
        ("لدي 14% في البطارية", "لدي 14 في المئة في البطارية", "ar"),
        ("我的电量为 14%", "我的电量为 14 百分之", "zh"),
        ("Pilim %14 dolu.", "Pilim yüzde 14 dolu.", "tr"),
        (
            "Az akkumulátorom töltöttsége 14%",
            "Az akkumulátorom töltöttsége 14 százalék",
            "hu",
        ),
        ("배터리 잔량이 14%입니다.", "배터리 잔량이 14 퍼센트입니다.", "ko"),
    ]

    for source, expected, lang in cases:
        actual = expand_symbols_multilingual(source, lang=lang)
        assert actual == expected, f"'{actual}' vs '{expected}'"