divisor.acestep.models.lyrics_utils.lyric_tokenizer

View Source

  1import os
  2import re
  3import textwrap
  4from functools import cached_property
  5
  6import pypinyin
  7import torch
  8from hangul_romanize import Transliter
  9from hangul_romanize.rule import academic
 10from num2words import num2words
 11from spacy.lang.ar import Arabic
 12from spacy.lang.en import English
 13from spacy.lang.es import Spanish
 14from spacy.lang.ja import Japanese
 15from spacy.lang.zh import Chinese
 16from tokenizers import Tokenizer
 17
 18from .zh_num2words import TextNorm as zh_num2words
 19from typing import Dict, List, Optional, Set, Union
 20
 21
 22# copy from https://github.com/coqui-ai/TTS/blob/dbf1a08a0d4e47fdad6172e433eeb34bc6b13b4e/TTS/tts/layers/xtts/tokenizer.py
 23def get_spacy_lang(lang):
 24    if lang == "zh":
 25        return Chinese()
 26    elif lang == "ja":
 27        return Japanese()
 28    elif lang == "ar":
 29        return Arabic()
 30    elif lang == "es":
 31        return Spanish()
 32    else:
 33        # For most languages, Enlish does the job
 34        return English()
 35
 36
 37def split_sentence(text, lang, text_split_length=250):
 38    """Preprocess the input text"""
 39    text_splits = []
 40    if text_split_length is not None and len(text) >= text_split_length:
 41        text_splits.append("")
 42        nlp = get_spacy_lang(lang)
 43        nlp.add_pipe("sentencizer")
 44        doc = nlp(text)
 45        for sentence in doc.sents:
 46            if len(text_splits[-1]) + len(str(sentence)) <= text_split_length:
 47                # if the last sentence + the current sentence is less than the text_split_length
 48                # then add the current sentence to the last sentence
 49                text_splits[-1] += " " + str(sentence)
 50                text_splits[-1] = text_splits[-1].lstrip()
 51            elif len(str(sentence)) > text_split_length:
 52                # if the current sentence is greater than the text_split_length
 53                for line in textwrap.wrap(
 54                    str(sentence),
 55                    width=text_split_length,
 56                    drop_whitespace=True,
 57                    break_on_hyphens=False,
 58                    tabsize=1,
 59                ):
 60                    text_splits.append(str(line))
 61            else:
 62                text_splits.append(str(sentence))
 63
 64        if len(text_splits) > 1:
 65            if text_splits[0] == "":
 66                del text_splits[0]
 67    else:
 68        text_splits = [text.lstrip()]
 69
 70    return text_splits
 71
 72
 73_whitespace_re = re.compile(r"\s+")
 74
 75# List of (regular expression, replacement) pairs for abbreviations:
 76_abbreviations = {
 77    "en": [
 78        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
 79        for x in [
 80            ("mrs", "misess"),
 81            ("mr", "mister"),
 82            ("dr", "doctor"),
 83            ("st", "saint"),
 84            ("co", "company"),
 85            ("jr", "junior"),
 86            ("maj", "major"),
 87            ("gen", "general"),
 88            ("drs", "doctors"),
 89            ("rev", "reverend"),
 90            ("lt", "lieutenant"),
 91            ("hon", "honorable"),
 92            ("sgt", "sergeant"),
 93            ("capt", "captain"),
 94            ("esq", "esquire"),
 95            ("ltd", "limited"),
 96            ("col", "colonel"),
 97            ("ft", "fort"),
 98        ]
 99    ],
100    "es": [
101        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
102        for x in [
103            ("sra", "señora"),
104            ("sr", "señor"),
105            ("dr", "doctor"),
106            ("dra", "doctora"),
107            ("st", "santo"),
108            ("co", "compañía"),
109            ("jr", "junior"),
110            ("ltd", "limitada"),
111        ]
112    ],
113    "fr": [
114        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
115        for x in [
116            ("mme", "madame"),
117            ("mr", "monsieur"),
118            ("dr", "docteur"),
119            ("st", "saint"),
120            ("co", "compagnie"),
121            ("jr", "junior"),
122            ("ltd", "limitée"),
123        ]
124    ],
125    "de": [
126        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
127        for x in [
128            ("fr", "frau"),
129            ("dr", "doktor"),
130            ("st", "sankt"),
131            ("co", "firma"),
132            ("jr", "junior"),
133        ]
134    ],
135    "pt": [
136        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
137        for x in [
138            ("sra", "senhora"),
139            ("sr", "senhor"),
140            ("dr", "doutor"),
141            ("dra", "doutora"),
142            ("st", "santo"),
143            ("co", "companhia"),
144            ("jr", "júnior"),
145            ("ltd", "limitada"),
146        ]
147    ],
148    "it": [
149        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
150        for x in [
151            # ("sig.ra", "signora"),
152            ("sig", "signore"),
153            ("dr", "dottore"),
154            ("st", "santo"),
155            ("co", "compagnia"),
156            ("jr", "junior"),
157            ("ltd", "limitata"),
158        ]
159    ],
160    "pl": [
161        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
162        for x in [
163            ("p", "pani"),
164            ("m", "pan"),
165            ("dr", "doktor"),
166            ("sw", "święty"),
167            ("jr", "junior"),
168        ]
169    ],
170    "ar": [
171        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
172        for x in [
173            # There are not many common abbreviations in Arabic as in English.
174        ]
175    ],
176    "zh": [
177        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
178        for x in [
179            # Chinese doesn't typically use abbreviations in the same way as Latin-based scripts.
180        ]
181    ],
182    "cs": [
183        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
184        for x in [
185            ("dr", "doktor"),  # doctor
186            ("ing", "inženýr"),  # engineer
187            ("p", "pan"),  # Could also map to pani for woman but no easy way to do it
188            # Other abbreviations would be specialized and not as common.
189        ]
190    ],
191    "ru": [
192        (re.compile("\\b%s\\b" % x[0], re.IGNORECASE), x[1])
193        for x in [
194            ("г-жа", "госпожа"),  # Mrs.
195            ("г-н", "господин"),  # Mr.
196            ("д-р", "доктор"),  # doctor
197            # Other abbreviations are less common or specialized.
198        ]
199    ],
200    "nl": [
201        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
202        for x in [
203            ("dhr", "de heer"),  # Mr.
204            ("mevr", "mevrouw"),  # Mrs.
205            ("dr", "dokter"),  # doctor
206            ("jhr", "jonkheer"),  # young lord or nobleman
207            # Dutch uses more abbreviations, but these are the most common ones.
208        ]
209    ],
210    "tr": [
211        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
212        for x in [
213            ("b", "bay"),  # Mr.
214            ("byk", "büyük"),  # büyük
215            ("dr", "doktor"),  # doctor
216            # Add other Turkish abbreviations here if needed.
217        ]
218    ],
219    "hu": [
220        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
221        for x in [
222            ("dr", "doktor"),  # doctor
223            ("b", "bácsi"),  # Mr.
224            ("nőv", "nővér"),  # nurse
225            # Add other Hungarian abbreviations here if needed.
226        ]
227    ],
228    "ko": [
229        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
230        for x in [
231            # Korean doesn't typically use abbreviations in the same way as Latin-based scripts.
232        ]
233    ],
234}
235
236
237def expand_abbreviations_multilingual(text, lang="en"):
238    for regex, replacement in _abbreviations[lang]:
239        text = re.sub(regex, replacement, text)
240    return text
241
242
243_symbols_multilingual = {
244    "en": [
245        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
246        for x in [
247            ("&", " and "),
248            ("@", " at "),
249            ("%", " percent "),
250            ("#", " hash "),
251            ("$", " dollar "),
252            ("£", " pound "),
253            ("°", " degree "),
254        ]
255    ],
256    "es": [
257        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
258        for x in [
259            ("&", " y "),
260            ("@", " arroba "),
261            ("%", " por ciento "),
262            ("#", " numeral "),
263            ("$", " dolar "),
264            ("£", " libra "),
265            ("°", " grados "),
266        ]
267    ],
268    "fr": [
269        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
270        for x in [
271            ("&", " et "),
272            ("@", " arobase "),
273            ("%", " pour cent "),
274            ("#", " dièse "),
275            ("$", " dollar "),
276            ("£", " livre "),
277            ("°", " degrés "),
278        ]
279    ],
280    "de": [
281        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
282        for x in [
283            ("&", " und "),
284            ("@", " at "),
285            ("%", " prozent "),
286            ("#", " raute "),
287            ("$", " dollar "),
288            ("£", " pfund "),
289            ("°", " grad "),
290        ]
291    ],
292    "pt": [
293        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
294        for x in [
295            ("&", " e "),
296            ("@", " arroba "),
297            ("%", " por cento "),
298            ("#", " cardinal "),
299            ("$", " dólar "),
300            ("£", " libra "),
301            ("°", " graus "),
302        ]
303    ],
304    "it": [
305        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
306        for x in [
307            ("&", " e "),
308            ("@", " chiocciola "),
309            ("%", " per cento "),
310            ("#", " cancelletto "),
311            ("$", " dollaro "),
312            ("£", " sterlina "),
313            ("°", " gradi "),
314        ]
315    ],
316    "pl": [
317        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
318        for x in [
319            ("&", " i "),
320            ("@", " małpa "),
321            ("%", " procent "),
322            ("#", " krzyżyk "),
323            ("$", " dolar "),
324            ("£", " funt "),
325            ("°", " stopnie "),
326        ]
327    ],
328    "ar": [
329        # Arabic
330        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
331        for x in [
332            ("&", " و "),
333            ("@", " على "),
334            ("%", " في المئة "),
335            ("#", " رقم "),
336            ("$", " دولار "),
337            ("£", " جنيه "),
338            ("°", " درجة "),
339        ]
340    ],
341    "zh": [
342        # Chinese
343        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
344        for x in [
345            ("&", " 和 "),
346            ("@", " 在 "),
347            ("%", " 百分之 "),
348            ("#", " 号 "),
349            ("$", " 美元 "),
350            ("£", " 英镑 "),
351            ("°", " 度 "),
352        ]
353    ],
354    "cs": [
355        # Czech
356        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
357        for x in [
358            ("&", " a "),
359            ("@", " na "),
360            ("%", " procento "),
361            ("#", " křížek "),
362            ("$", " dolar "),
363            ("£", " libra "),
364            ("°", " stupně "),
365        ]
366    ],
367    "ru": [
368        # Russian
369        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
370        for x in [
371            ("&", " и "),
372            ("@", " собака "),
373            ("%", " процентов "),
374            ("#", " номер "),
375            ("$", " доллар "),
376            ("£", " фунт "),
377            ("°", " градус "),
378        ]
379    ],
380    "nl": [
381        # Dutch
382        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
383        for x in [
384            ("&", " en "),
385            ("@", " bij "),
386            ("%", " procent "),
387            ("#", " hekje "),
388            ("$", " dollar "),
389            ("£", " pond "),
390            ("°", " graden "),
391        ]
392    ],
393    "tr": [
394        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
395        for x in [
396            ("&", " ve "),
397            ("@", " at "),
398            ("%", " yüzde "),
399            ("#", " diyez "),
400            ("$", " dolar "),
401            ("£", " sterlin "),
402            ("°", " derece "),
403        ]
404    ],
405    "hu": [
406        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
407        for x in [
408            ("&", " és "),
409            ("@", " kukac "),
410            ("%", " százalék "),
411            ("#", " kettőskereszt "),
412            ("$", " dollár "),
413            ("£", " font "),
414            ("°", " fok "),
415        ]
416    ],
417    "ko": [
418        # Korean
419        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
420        for x in [
421            ("&", " 그리고 "),
422            ("@", " 에 "),
423            ("%", " 퍼센트 "),
424            ("#", " 번호 "),
425            ("$", " 달러 "),
426            ("£", " 파운드 "),
427            ("°", " 도 "),
428        ]
429    ],
430}
431
432
433def expand_symbols_multilingual(text, lang="en"):
434    for regex, replacement in _symbols_multilingual[lang]:
435        text = re.sub(regex, replacement, text)
436        text = text.replace("  ", " ")  # Ensure there are no double spaces
437    return text.strip()
438
439
# Ordinal markers per language: group 1 captures the digits, the remainder is
# the suffix (or trailing dot) that marks the number as an ordinal.
_ordinal_re = {
    "en": re.compile(r"([0-9]+)(st|nd|rd|th)"),
    "es": re.compile(r"([0-9]+)(º|ª|er|o|a|os|as)"),
    "fr": re.compile(r"([0-9]+)(º|ª|er|re|e|ème)"),
    "de": re.compile(r"([0-9]+)(st|nd|rd|th|º|ª|\.(?=\s|$))"),
    "pt": re.compile(r"([0-9]+)(º|ª|o|a|os|as)"),
    "it": re.compile(r"([0-9]+)(º|°|ª|o|a|i|e)"),
    "pl": re.compile(r"([0-9]+)(º|ª|st|nd|rd|th)"),
    "ar": re.compile(r"([0-9]+)(ون|ين|ث|ر|ى)"),
    "cs": re.compile(
        r"([0-9]+)\.(?=\s|$)"
    ),  # In Czech, a dot is often used after the number to indicate ordinals.
    "ru": re.compile(r"([0-9]+)(-й|-я|-е|-ое|-ье|-го)"),
    "nl": re.compile(r"([0-9]+)(de|ste|e)"),
    # The duplicated alternatives ("\." in tr, "edik" in hu) were removed;
    # a repeated alternative can never match anything the first one did not.
    "tr": re.compile(r"([0-9]+)(\.|inci|nci|uncu|üncü)"),
    "hu": re.compile(r"([0-9]+)(\.|adik|edik|odik|ödik|ödike|ik)"),
    "ko": re.compile(r"([0-9]+)(번째|번|차|째)"),
}
# Any run of ASCII digits.
_number_re = re.compile(r"[0-9]+")
# Amounts with the currency sign directly before or after the digits.
_currency_re = {
    "USD": re.compile(r"((\$[0-9\.\,]*[0-9]+)|([0-9\.\,]*[0-9]+\$))"),
    "GBP": re.compile(r"((£[0-9\.\,]*[0-9]+)|([0-9\.\,]*[0-9]+£))"),
    "EUR": re.compile(r"(([0-9\.\,]*[0-9]+€)|((€[0-9\.\,]*[0-9]+)))"),
}

# Numbers grouped by thousands with "," and an optional "." decimal part.
_comma_number_re = re.compile(r"\b\d{1,3}(,\d{3})*(\.\d+)?\b")
# Numbers grouped by thousands with "." and an optional "," decimal part.
# The separator dot is escaped: unescaped it matched ANY character, so spans
# such as "1 234" or "1x234" were treated as one dot-grouped number.
_dot_number_re = re.compile(r"\b\d{1,3}(\.\d{3})*(\,\d+)?\b")
# A decimal number written with either "." or "," as the decimal mark.
_decimal_number_re = re.compile(r"([0-9]+[.,][0-9]+)")
468
469
def _remove_commas(m):
    """Return the full text matched by ``m`` with every comma removed."""
    return m.group(0).replace(",", "")
475
476
def _remove_dots(m):
    """Return the full text matched by ``m`` with every dot removed."""
    return m.group(0).replace(".", "")
482
483
def _expand_decimal_point(m, lang="en"):
    """Spell out the decimal number captured in group 1 of ``m`` via ``num2words``."""
    # A decimal comma is turned into a dot so float() can parse the number.
    value = float(m.group(1).replace(",", "."))
    # The language code "cs" is handed to num2words as "cz".
    n2w_lang = "cz" if lang == "cs" else lang
    return num2words(value, lang=n2w_lang)
487
488
def _expand_currency(m, lang="en", currency="USD"):
    """Spell out the currency amount matched by ``m`` via ``num2words``.

    For whole amounts, everything from the last language-specific separator
    onwards is cut from the ``num2words`` output. Raises ``KeyError`` for a
    whole amount when ``lang`` has no separator listed below.
    """
    # Commas become dots, then everything that is neither a digit nor a dot
    # (the currency sign) is dropped before parsing.
    numeric_text = re.sub(r"[^\d.]", "", m.group(0).replace(",", "."))
    amount = float(numeric_text)

    # The language code "cs" is handed to num2words as "cz".
    n2w_lang = "cz" if lang == "cs" else lang
    spoken = num2words(amount, to="currency", currency=currency, lang=n2w_lang)

    if not amount.is_integer():
        return spoken

    # Text that separates the main unit from the fractional unit, per language.
    separators = {
        "en": ", ",
        "es": " con ",
        "fr": " et ",
        "de": " und ",
        "pt": " e ",
        "it": " e ",
        "pl": ", ",
        "cs": ", ",
        "ru": ", ",
        "nl": ", ",
        "ar": ", ",
        "tr": ", ",
        "hu": ", ",
        "ko": ", ",
    }
    cut = spoken.rfind(separators[lang])
    return spoken if cut == -1 else spoken[:cut]
518
519
def _expand_ordinal(m, lang="en"):
    """Spell out the integer in group 1 of ``m`` as an ordinal via ``num2words``."""
    # The language code "cs" is handed to num2words as "cz".
    n2w_lang = "cz" if lang == "cs" else lang
    return num2words(int(m.group(1)), ordinal=True, lang=n2w_lang)
522
523
def _expand_number(m, lang="en"):
    """Spell out the integer matched by ``m`` (the whole match) via ``num2words``."""
    # The language code "cs" is handed to num2words as "cz".
    n2w_lang = "cz" if lang == "cs" else lang
    return num2words(int(m.group(0)), lang=n2w_lang)
526
527
def expand_numbers_multilingual(text, lang="en"):
    """Spell out the numbers found in ``text`` for language ``lang``.

    "zh" is delegated entirely to ``zh_num2words``. For every other language
    the steps are, in order: strip thousands separators, expand GBP/USD/EUR
    amounts (best effort), expand decimals (skipped for "tr"), expand
    ordinals, expand the remaining plain integers.

    Raises ``KeyError`` when ``lang`` has no entry in ``_ordinal_re``.
    """
    if lang == "zh":
        return zh_num2words()(text)

    # Thousands separators: "," for en/ru, "." for everything else.
    if lang in ("en", "ru"):
        text = _comma_number_re.sub(_remove_commas, text)
    else:
        text = _dot_number_re.sub(_remove_dots, text)

    try:
        for code in ("GBP", "USD", "EUR"):
            text = _currency_re[code].sub(
                lambda m, code=code: _expand_currency(m, lang, code), text
            )
    except Exception:
        # Best effort: when one currency expansion fails, the remaining
        # currencies are skipped and the text continues as it stands.
        # ``Exception`` (not a bare ``except``) lets KeyboardInterrupt and
        # SystemExit propagate.
        pass

    if lang != "tr":
        text = _decimal_number_re.sub(lambda m: _expand_decimal_point(m, lang), text)
    text = _ordinal_re[lang].sub(lambda m: _expand_ordinal(m, lang), text)
    return _number_re.sub(lambda m: _expand_number(m, lang), text)
555
556
def lowercase(text):
    """Return ``text`` converted to lower case."""
    lowered = text.lower()
    return lowered
559
560
def collapse_whitespace(text):
    """Replace every run of whitespace in ``text`` with a single space."""
    return _whitespace_re.sub(" ", text)
563
564
def multilingual_cleaners(text, lang):
    """Clean ``text`` for language ``lang`` before tokenization.

    Steps: remove double quotes, lowercase, expand numbers, expand
    abbreviations, expand symbols, collapse whitespace. Each expansion step
    is best effort: if it raises (e.g. ``KeyError`` for a language missing
    from a table), the text is carried on unchanged by that step.
    """
    text = text.replace('"', "")
    if lang == "tr":
        # Turkish capitals İ, Ö and Ü are mapped by hand before the generic
        # lowercasing below.
        for upper, lower in (("İ", "i"), ("Ö", "ö"), ("Ü", "ü")):
            text = text.replace(upper, lower)
    text = lowercase(text)

    expansion_steps = (
        expand_numbers_multilingual,
        expand_abbreviations_multilingual,
        expand_symbols_multilingual,
    )
    for expand in expansion_steps:
        try:
            text = expand(text, lang)
        except Exception:
            # Deliberately ignored so one failing step cannot block the
            # others; ``Exception`` (not a bare ``except``) lets
            # KeyboardInterrupt and SystemExit propagate.
            pass

    return collapse_whitespace(text)
586
587
def basic_cleaners(text):
    """Lowercase ``text`` and collapse its whitespace; no transliteration is done."""
    return collapse_whitespace(lowercase(text))
593
594
def chinese_transliterate(text):
    """Convert ``text`` to pinyin with ``pypinyin`` and return it as one string.

    ``pypinyin.pinyin`` is called with the TONE3 style, ``heteronym=False``
    and ``neutral_tone_with_five=True``; the first entry of each returned
    item is kept and the entries are joined without a separator.
    """
    readings = pypinyin.pinyin(
        text,
        style=pypinyin.Style.TONE3,
        heteronym=False,
        neutral_tone_with_five=True,
    )
    return "".join(candidates[0] for candidates in readings)
607
608
def japanese_cleaners(text, katsu):
    """Romanize ``text`` with ``katsu.romaji`` and return the result lowercased."""
    return lowercase(katsu.romaji(text))
613
614
def korean_transliterate(text):
    """Transliterate ``text`` with a ``Transliter`` built on the ``academic`` rule."""
    return Transliter(academic).translit(text)
618
619
# Default vocabulary file: "vocab.json" in the directory of this module.
DEFAULT_VOCAB_FILE = os.path.join(os.path.dirname(os.path.realpath(__file__)), "vocab.json")


class VoiceBpeTokenizer:
    """Language-aware wrapper around a ``tokenizers.Tokenizer``.

    Text is cleaned per language, prefixed with a ``[lang]`` tag and has its
    spaces replaced by ``[SPACE]`` before it is encoded; ``decode`` reverses
    the ``[SPACE]`` substitution and removes ``[STOP]``.
    """

    # Languages whose text goes through ``multilingual_cleaners``.
    _CLEANER_LANGS = frozenset(
        {
            "ar",
            "cs",
            "de",
            "en",
            "es",
            "fr",
            "hu",
            "it",
            "nl",
            "pl",
            "pt",
            "ru",
            "tr",
            "zh",
            "ko",
        }
    )

    def __init__(self, vocab_file=DEFAULT_VOCAB_FILE):
        """Load the tokenizer from ``vocab_file``; ``None`` leaves ``self.tokenizer`` unset (``None``)."""
        self.tokenizer = (
            Tokenizer.from_file(vocab_file) if vocab_file is not None else None
        )
        # Per-language character limits looked up by check_input_length.
        self.char_limits = {
            "en": 10000,
            "de": 253,
            "fr": 273,
            "es": 239,
            "it": 213,
            "pt": 203,
            "pl": 224,
            "zh": 82,
            "ar": 166,
            "cs": 186,
            "ru": 182,
            "nl": 251,
            "tr": 226,
            "ja": 71,
            "hu": 224,
            "ko": 95,
        }

    @cached_property
    def katsu(self):
        """``cutlet.Cutlet`` instance, imported and created on first access, then cached."""
        from cutlet import Cutlet

        return Cutlet()

    def check_input_length(self, txt, lang):
        """Look up the character limit for ``lang``; currently reports nothing.

        The region suffix of ``lang`` (after "-") is ignored and unlisted
        languages default to 250. The warning that used to be printed when
        ``len(txt)`` exceeded the limit is disabled, so ``limit`` is unused.
        """
        base_lang = lang.split("-")[0]  # remove the region
        limit = self.char_limits.get(base_lang, 250)

    def preprocess_text(self, txt, lang):
        """Return ``txt`` cleaned (and, where applicable, romanized) for ``lang``.

        Raises ``NotImplementedError`` for a language that is not handled.
        """
        if lang in self._CLEANER_LANGS:
            cleaned = multilingual_cleaners(txt, lang)
            if lang == "zh":
                return chinese_transliterate(cleaned)
            if lang == "ko":
                return korean_transliterate(cleaned)
            return cleaned
        if lang == "ja":
            return japanese_cleaners(txt, self.katsu)
        if lang == "hi":
            # @manmay will implement this
            return basic_cleaners(txt)
        raise NotImplementedError(f"Language '{lang}' is not supported.")

    def encode(self, txt, lang):
        """Preprocess ``txt`` for ``lang`` and return its token ids."""
        base_lang = lang.split("-")[0]  # remove the region
        self.check_input_length(txt, base_lang)
        cleaned = self.preprocess_text(txt, base_lang)
        # Chinese is tagged as "zh-cn"; other languages use their base code.
        tag = "zh-cn" if base_lang == "zh" else base_lang
        marked = f"[{tag}]{cleaned}".replace(" ", "[SPACE]")
        return self.tokenizer.encode(marked).ids

    def decode(self, seq, skip_special_tokens=False):
        """Turn the token ids ``seq`` back into text.

        ``skip_special_tokens`` is accepted but not used: the underlying
        tokenizer is always called with ``skip_special_tokens=False``.
        "[UNK]" markers are left in the output.
        """
        if isinstance(seq, torch.Tensor):
            seq = seq.cpu().numpy()
        raw = self.tokenizer.decode(seq, skip_special_tokens=False)
        # Drop the spaces of the raw output, then restore the real spaces
        # from their "[SPACE]" markers and remove the stop marker.
        return raw.replace(" ", "").replace("[SPACE]", " ").replace("[STOP]", "")

    # copy from https://github.com/huggingface/transformers/blob/main/src/transformers/tokenization_utils_base.py#L3936
    def batch_decode(
        self,
        sequences: Union[
            List[int], List[List[int]], "np.ndarray", "torch.Tensor", "tf.Tensor"
        ],
        skip_special_tokens: bool = False,
    ) -> List[str]:
        """
        Convert a list of lists of token ids into a list of strings by calling decode.

        Args:
            sequences (`Union[List[int], List[List[int]], np.ndarray, torch.Tensor, tf.Tensor]`):
                List of tokenized input ids. Each element is passed to `decode` on its own.
            skip_special_tokens (`bool`, *optional*, defaults to `False`):
                Accepted for interface compatibility; it is not forwarded to `decode`.

        Returns:
            `List[str]`: The list of decoded sentences.
        """
        return list(map(self.decode, sequences))

    # https://github.com/coqui-ai/TTS/blob/dev/TTS/tts/layers/xtts/trainer/dataset.py#L202
    # def pad(self):

    def __len__(self):
        """Return the vocabulary size reported by the tokenizer."""
        return self.tokenizer.get_vocab_size()

    def get_number_tokens(self):
        """Return the highest token id in the vocabulary plus one."""
        token_ids = self.tokenizer.get_vocab().values()
        return max(token_ids) + 1
745
746
def test_expand_numbers_multilingual():
    """Check ``expand_numbers_multilingual`` against known input/output pairs per language."""
    cases_by_lang = {
        # English
        "en": [
            ("In 12.5 seconds.", "In twelve point five seconds."),
            ("There were 50 soldiers.", "There were fifty soldiers."),
            ("This is a 1st test", "This is a first test"),
            ("That will be $20 sir.", "That will be twenty dollars sir."),
            ("That will be 20€ sir.", "That will be twenty euro sir."),
            (
                "That will be 20.15€ sir.",
                "That will be twenty euro, fifteen cents sir.",
            ),
            ("That's 100,000.5.", "That's one hundred thousand point five."),
        ],
        # French
        "fr": [
            ("En 12,5 secondes.", "En douze virgule cinq secondes."),
            ("Il y avait 50 soldats.", "Il y avait cinquante soldats."),
            ("Ceci est un 1er test", "Ceci est un premier test"),
            (
                "Cela vous fera $20 monsieur.",
                "Cela vous fera vingt dollars monsieur.",
            ),
            ("Cela vous fera 20€ monsieur.", "Cela vous fera vingt euros monsieur."),
            (
                "Cela vous fera 20,15€ monsieur.",
                "Cela vous fera vingt euros et quinze centimes monsieur.",
            ),
            ("Ce sera 100.000,5.", "Ce sera cent mille virgule cinq."),
        ],
        # German
        "de": [
            ("In 12,5 Sekunden.", "In zwölf Komma fünf Sekunden."),
            ("Es gab 50 Soldaten.", "Es gab fünfzig Soldaten."),
            ("Dies ist ein 1. Test", "Dies ist ein erste Test"),  # Issue with gender
            ("Das macht $20 Herr.", "Das macht zwanzig Dollar Herr."),
            ("Das macht 20€ Herr.", "Das macht zwanzig Euro Herr."),
            (
                "Das macht 20,15€ Herr.",
                "Das macht zwanzig Euro und fünfzehn Cent Herr.",
            ),
        ],
        # Spanish
        "es": [
            ("En 12,5 segundos.", "En doce punto cinco segundos."),
            ("Había 50 soldados.", "Había cincuenta soldados."),
            ("Este es un 1er test", "Este es un primero test"),
            ("Eso le costará $20 señor.", "Eso le costará veinte dólares señor."),
            ("Eso le costará 20€ señor.", "Eso le costará veinte euros señor."),
            (
                "Eso le costará 20,15€ señor.",
                "Eso le costará veinte euros con quince céntimos señor.",
            ),
        ],
        # Italian
        "it": [
            ("In 12,5 secondi.", "In dodici virgola cinque secondi."),
            ("C'erano 50 soldati.", "C'erano cinquanta soldati."),
            ("Questo è un 1° test", "Questo è un primo test"),
            ("Ti costerà $20 signore.", "Ti costerà venti dollari signore."),
            ("Ti costerà 20€ signore.", "Ti costerà venti euro signore."),
            (
                "Ti costerà 20,15€ signore.",
                "Ti costerà venti euro e quindici centesimi signore.",
            ),
        ],
        # Portuguese
        "pt": [
            ("Em 12,5 segundos.", "Em doze vírgula cinco segundos."),
            ("Havia 50 soldados.", "Havia cinquenta soldados."),
            ("Este é um 1º teste", "Este é um primeiro teste"),
            ("Isso custará $20 senhor.", "Isso custará vinte dólares senhor."),
            ("Isso custará 20€ senhor.", "Isso custará vinte euros senhor."),
            (
                "Isso custará 20,15€ senhor.",
                "Isso custará vinte euros e quinze cêntimos senhor.",
            ),  # "cêntimos" should be "centavos" num2words issue
        ],
        # Polish
        "pl": [
            ("W 12,5 sekundy.", "W dwanaście przecinek pięć sekundy."),
            ("Było 50 żołnierzy.", "Było pięćdziesiąt żołnierzy."),
            (
                "To będzie kosztować 20€ panie.",
                "To będzie kosztować dwadzieścia euro panie.",
            ),
            (
                "To będzie kosztować 20,15€ panie.",
                "To będzie kosztować dwadzieścia euro, piętnaście centów panie.",
            ),
        ],
        # Arabic
        "ar": [
            ("في الـ 12,5 ثانية.", "في الـ اثنا عشر  , خمسون ثانية."),
            ("كان هناك 50 جنديًا.", "كان هناك خمسون جنديًا."),
            # ("ستكون النتيجة $20 يا سيد.", 'ستكون النتيجة عشرون دولار يا سيد.', 'ar'), # $ and € are missing from num2words
            # ("ستكون النتيجة 20€ يا سيد.", 'ستكون النتيجة عشرون يورو يا سيد.', 'ar'),
        ],
        # Czech
        "cs": [
            ("Za 12,5 vteřiny.", "Za dvanáct celá pět vteřiny."),
            ("Bylo tam 50 vojáků.", "Bylo tam padesát vojáků."),
            ("To bude stát 20€ pane.", "To bude stát dvacet euro pane."),
            ("To bude 20.15€ pane.", "To bude dvacet euro, patnáct centů pane."),
        ],
        # Russian
        "ru": [
            ("Через 12.5 секунды.", "Через двенадцать запятая пять секунды."),
            ("Там было 50 солдат.", "Там было пятьдесят солдат."),
            (
                "Это будет 20.15€ сэр.",
                "Это будет двадцать евро, пятнадцать центов сэр.",
            ),
            (
                "Это будет стоить 20€ господин.",
                "Это будет стоить двадцать евро господин.",
            ),
        ],
        # Dutch
        "nl": [
            ("In 12,5 seconden.", "In twaalf komma vijf seconden."),
            ("Er waren 50 soldaten.", "Er waren vijftig soldaten."),
            ("Dat wordt dan $20 meneer.", "Dat wordt dan twintig dollar meneer."),
            ("Dat wordt dan 20€ meneer.", "Dat wordt dan twintig euro meneer."),
        ],
        # Chinese (Simplified)
        "zh": [
            ("在12.5秒内", "在十二点五秒内"),
            ("有50名士兵", "有五十名士兵"),
            # ("那将是$20先生", '那将是二十美元先生', 'zh'), currency doesn't work
            # ("那将是20€先生", '那将是二十欧元先生', 'zh'),
        ],
        # Turkish
        "tr": [
            # ("12,5 saniye içinde.", 'On iki virgül beş saniye içinde.', 'tr'), # decimal doesn't work for TR
            ("50 asker vardı.", "elli asker vardı."),
            ("Bu 1. test", "Bu birinci test"),
            # ("Bu 100.000,5.", 'Bu yüz bin virgül beş.', 'tr'),
        ],
        # Hungarian
        "hu": [
            ("12,5 másodperc alatt.", "tizenkettő egész öt tized másodperc alatt."),
            ("50 katona volt.", "ötven katona volt."),
            ("Ez az 1. teszt", "Ez az első teszt"),
        ],
        # Korean
        "ko": [
            ("12.5 초 안에.", "십이 점 다섯 초 안에."),
            ("50 명의 병사가 있었다.", "오십 명의 병사가 있었다."),
            ("이것은 1 번째 테스트입니다", "이것은 첫 번째 테스트입니다"),
        ],
    }
    for lang, cases in cases_by_lang.items():
        for source, expected in cases:
            actual = expand_numbers_multilingual(source, lang=lang)
            assert actual == expected, f"'{actual}' vs '{expected}'"
884
885
def test_abbreviations_multilingual():
    """Check ``expand_abbreviations_multilingual`` against known input/output pairs per language."""
    cases_by_lang = {
        # English
        "en": [
            ("Hello Mr. Smith.", "Hello mister Smith."),
            ("Dr. Jones is here.", "doctor Jones is here."),
        ],
        # Spanish
        "es": [
            ("Hola Sr. Garcia.", "Hola señor Garcia."),
            ("La Dra. Martinez es muy buena.", "La doctora Martinez es muy buena."),
        ],
        # French
        "fr": [
            ("Bonjour Mr. Dupond.", "Bonjour monsieur Dupond."),
            (
                "Mme. Moreau est absente aujourd'hui.",
                "madame Moreau est absente aujourd'hui.",
            ),
        ],
        # German
        "de": [
            ("Frau Dr. Müller ist sehr klug.", "Frau doktor Müller ist sehr klug."),
        ],
        # Portuguese
        "pt": [
            ("Olá Sr. Silva.", "Olá senhor Silva."),
            (
                "Dra. Costa, você está disponível?",
                "doutora Costa, você está disponível?",
            ),
        ],
        # Italian
        "it": [
            ("Buongiorno, Sig. Rossi.", "Buongiorno, signore Rossi."),
            # ("Sig.ra Bianchi, posso aiutarti?", 'signora Bianchi, posso aiutarti?', 'it'), # Issue with matching that pattern
        ],
        # Polish
        "pl": [
            ("Dzień dobry, P. Kowalski.", "Dzień dobry, pani Kowalski."),
            (
                "M. Nowak, czy mogę zadać pytanie?",
                "pan Nowak, czy mogę zadać pytanie?",
            ),
        ],
        # Czech
        "cs": [
            ("P. Novák", "pan Novák"),
            ("Dr. Vojtěch", "doktor Vojtěch"),
        ],
        # Dutch
        "nl": [
            ("Dhr. Jansen", "de heer Jansen"),
            ("Mevr. de Vries", "mevrouw de Vries"),
        ],
        # Russian
        "ru": [
            ("Здравствуйте Г-н Иванов.", "Здравствуйте господин Иванов."),
            (
                "Д-р Смирнов здесь, чтобы увидеть вас.",
                "доктор Смирнов здесь, чтобы увидеть вас.",
            ),
        ],
        # Turkish
        "tr": [
            ("Merhaba B. Yılmaz.", "Merhaba bay Yılmaz."),
            ("Dr. Ayşe burada.", "doktor Ayşe burada."),
        ],
        # Hungarian
        "hu": [
            ("Dr. Szabó itt van.", "doktor Szabó itt van."),
        ],
    }

    for lang, cases in cases_by_lang.items():
        for source, expected in cases:
            actual = expand_abbreviations_multilingual(source, lang=lang)
            assert actual == expected, f"'{actual}' vs '{expected}'"
943
944
def test_symbols_multilingual():
    """Check ``expand_symbols_multilingual`` against known input/output pairs per language."""
    cases_by_lang = {
        "en": [("I have 14% battery", "I have 14 percent battery")],
        "es": [("Te veo @ la fiesta", "Te veo arroba la fiesta")],
        "fr": [("J'ai 14° de fièvre", "J'ai 14 degrés de fièvre")],
        "de": [("Die Rechnung beträgt £ 20", "Die Rechnung beträgt pfund 20")],
        "pt": [
            (
                "O meu email é ana&joao@gmail.com",
                "O meu email é ana e joao arroba gmail.com",
            ),
        ],
        "it": [
            (
                "linguaggio di programmazione C#",
                "linguaggio di programmazione C cancelletto",
            ),
        ],
        "pl": [("Moja temperatura to 36.6°", "Moja temperatura to 36.6 stopnie")],
        "cs": [
            ("Mám 14% baterie", "Mám 14 procento baterie"),
            ("Těším se na tebe @ party", "Těším se na tebe na party"),
        ],
        "ru": [
            ("У меня 14% заряда", "У меня 14 процентов заряда"),
            ("Я буду @ дома", "Я буду собака дома"),
        ],
        "nl": [
            ("Ik heb 14% batterij", "Ik heb 14 procent batterij"),
            ("Ik zie je @ het feest", "Ik zie je bij het feest"),
        ],
        "ar": [("لدي 14% في البطارية", "لدي 14 في المئة في البطارية")],
        "zh": [("我的电量为 14%", "我的电量为 14 百分之")],
        "tr": [("Pilim %14 dolu.", "Pilim yüzde 14 dolu.")],
        "hu": [
            (
                "Az akkumulátorom töltöttsége 14%",
                "Az akkumulátorom töltöttsége 14 százalék",
            ),
        ],
        "ko": [("배터리 잔량이 14%입니다.", "배터리 잔량이 14 퍼센트입니다.")],
    }

    for lang, cases in cases_by_lang.items():
        for source, expected in cases:
            actual = expand_symbols_multilingual(source, lang=lang)
            assert actual == expected, f"'{actual}' vs '{expected}'"
982
983
if __name__ == "__main__":
    # Run the module's self-checks, in order, when executed as a script.
    for _self_check in (
        test_expand_numbers_multilingual,
        test_abbreviations_multilingual,
        test_symbols_multilingual,
    ):
        _self_check()

def get_spacy_lang(lang): View Source

def get_spacy_lang(lang):
    """Return a new spaCy language object for the language code *lang*.

    Chinese ("zh"), Japanese ("ja"), Arabic ("ar") and Spanish ("es") use
    their dedicated spaCy language class; any other code falls back to
    ``English``.
    """
    dedicated = (
        ("zh", Chinese),
        ("ja", Japanese),
        ("ar", Arabic),
        ("es", Spanish),
    )
    for code, language_cls in dedicated:
        if lang == code:
            return language_cls()
    # For most languages, English does the job.
    return English()

def split_sentence(text, lang, text_split_length=250): View Source

def split_sentence(text, lang, text_split_length=250):
    """Split *text* into a list of chunks sized around *text_split_length*.

    When *text_split_length* is ``None`` or *text* is shorter than it, the
    result is a single-element list holding *text* with leading whitespace
    removed.  Otherwise the text is cut into sentences with spaCy's
    "sentencizer" for *lang*; consecutive sentences are packed into the same
    chunk while they fit, and a sentence that is itself longer than the limit
    is hard-wrapped with ``textwrap.wrap``.
    """
    if text_split_length is None or len(text) < text_split_length:
        return [text.lstrip()]

    sentence_splitter = get_spacy_lang(lang)
    sentence_splitter.add_pipe("sentencizer")

    # Start with an empty chunk so the first sentence has something to join.
    chunks = [""]
    for span in sentence_splitter(text).sents:
        piece = str(span)
        if len(chunks[-1]) + len(piece) <= text_split_length:
            # The sentence still fits: glue it onto the current chunk.
            # (The joining space is not counted in the length check.)
            chunks[-1] = (chunks[-1] + " " + piece).lstrip()
        elif len(piece) > text_split_length:
            # The sentence alone is over the limit: wrap it into several chunks.
            chunks.extend(
                textwrap.wrap(
                    piece,
                    width=text_split_length,
                    drop_whitespace=True,
                    break_on_hyphens=False,
                    tabsize=1,
                )
            )
        else:
            chunks.append(piece)

    # Drop the seed chunk if nothing was ever joined onto it.
    if len(chunks) > 1 and chunks[0] == "":
        del chunks[0]
    return chunks

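Split the input text into chunks of roughly `text_split_length` characters, breaking on sentence boundaries where possible. Text shorter than the limit is returned as a single chunk.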

def expand_abbreviations_multilingual(text, lang='en'): View Source

def expand_abbreviations_multilingual(text, lang="en"):
    """Apply every abbreviation rule registered for *lang* to *text*, in order.

    Each rule in ``_abbreviations[lang]`` is a ``(regex, replacement)`` pair
    passed to ``re.sub``.  A language without an entry raises ``KeyError``.
    """
    expanded = text
    for pattern, spelled_out in _abbreviations[lang]:
        expanded = re.sub(pattern, spelled_out, expanded)
    return expanded

def expand_symbols_multilingual(text, lang='en'): View Source

def expand_symbols_multilingual(text, lang="en"):
    """Apply every symbol rule registered for *lang* to *text*, in order.

    Each rule in ``_symbols_multilingual[lang]`` is a ``(regex, replacement)``
    pair passed to ``re.sub``.  After each rule one pass of double-space
    squeezing is applied, and the final result is stripped of surrounding
    whitespace.  A language without an entry raises ``KeyError``.
    """
    expanded = text
    for pattern, spoken_form in _symbols_multilingual[lang]:
        substituted = re.sub(pattern, spoken_form, expanded)
        # Ensure there are no double spaces left behind by the replacement.
        expanded = substituted.replace("  ", " ")
    return expanded.strip()

def expand_numbers_multilingual(text, lang='en'): View Source

def expand_numbers_multilingual(text, lang="en"):
    """Spell out the numbers in *text* as words for the language *lang*.

    Chinese ("zh") is delegated entirely to the ``zh_num2words`` normaliser.
    For every other language the text goes through, in order: separator
    removal, currency expansion (GBP, USD, EUR), decimal expansion (skipped
    for Turkish), ordinal expansion and finally plain-number expansion.

    Args:
        text: The text to normalise.
        lang: Language code; must have an entry in ``_ordinal_re`` unless it
            is "zh".

    Returns:
        The text with numbers replaced by their spelled-out form.

    Raises:
        KeyError: If *lang* has no entry in ``_ordinal_re``.
    """
    if lang == "zh":
        return zh_num2words()(text)

    # English and Russian strip commas inside numbers; every other language
    # strips dots instead (presumably the thousands separator in each case).
    if lang in ["en", "ru"]:
        text = re.sub(_comma_number_re, _remove_commas, text)
    else:
        text = re.sub(_dot_number_re, _remove_dots, text)

    try:
        for code in ("GBP", "USD", "EUR"):
            text = re.sub(
                _currency_re[code],
                # Bind the code per iteration so the callback never sees a
                # later loop value.
                lambda m, code=code: _expand_currency(m, lang, code),
                text,
            )
    except Exception:
        # Best effort: currency expansion is not available for every
        # language, so leave any remaining amounts for the generic number
        # pass below.  Only ordinary errors are swallowed; KeyboardInterrupt
        # and SystemExit still propagate.
        pass

    if lang != "tr":
        text = re.sub(
            _decimal_number_re, lambda m: _expand_decimal_point(m, lang), text
        )
    text = re.sub(_ordinal_re[lang], lambda m: _expand_ordinal(m, lang), text)
    text = re.sub(_number_re, lambda m: _expand_number(m, lang), text)
    return text

def lowercase(text): View Source

def lowercase(text):
    """Return *text* converted to lower case."""
    lowered = text.lower()
    return lowered

def collapse_whitespace(text): View Source

def collapse_whitespace(text):
    """Replace each match of the module's whitespace pattern in *text* with one space."""
    collapsed = re.sub(pattern=_whitespace_re, repl=" ", string=text)
    return collapsed

def multilingual_cleaners(text, lang): View Source

def multilingual_cleaners(text, lang):
    """Normalise *text* for tokenisation in the language *lang*.

    Steps, in order: remove double quotes, pre-map three Turkish capitals
    (Turkish only), lowercase, expand numbers, expand abbreviations, expand
    symbols, collapse whitespace.  The three expansion steps are best effort:
    if one fails, the text is carried forward unchanged to the next step.

    Args:
        text: The raw text.
        lang: Language code handed to each expansion step.

    Returns:
        The cleaned text.
    """
    text = text.replace('"', "")
    if lang == "tr":
        # Map these capitals by hand before lowercasing; in particular
        # "İ".lower() yields "i" plus a combining dot rather than a plain "i".
        for capital, small in (("İ", "i"), ("Ö", "ö"), ("Ü", "ü")):
            text = text.replace(capital, small)
    text = lowercase(text)

    expansion_steps = (
        expand_numbers_multilingual,
        expand_abbreviations_multilingual,
        expand_symbols_multilingual,
    )
    for expand in expansion_steps:
        try:
            text = expand(text, lang)
        except Exception:
            # Deliberately tolerant: a language without rules for this step
            # (or a failing rule) must not abort cleaning.  Only ordinary
            # errors are swallowed; KeyboardInterrupt and SystemExit still
            # propagate.
            pass

    return collapse_whitespace(text)

def basic_cleaners(text): View Source

def basic_cleaners(text):
    """Lowercase *text* and collapse its whitespace; no transliteration is done."""
    return collapse_whitespace(lowercase(text))

Basic pipeline that lowercases and collapses whitespace without transliteration.

def chinese_transliterate(text): View Source

def chinese_transliterate(text):
    """Transliterate *text* with pypinyin and return the result as one string.

    pypinyin is called with the TONE3 style, heteronyms disabled and
    ``neutral_tone_with_five`` enabled; the first reading of every returned
    item is concatenated without separators.
    """
    readings = pypinyin.pinyin(
        text,
        style=pypinyin.Style.TONE3,
        heteronym=False,
        neutral_tone_with_five=True,
    )
    return "".join(candidates[0] for candidates in readings)

def japanese_cleaners(text, katsu): View Source

def japanese_cleaners(text, katsu):
    """Romanise *text* with ``katsu.romaji`` and return the lowercased result."""
    romanised = katsu.romaji(text)
    return lowercase(romanised)

def korean_transliterate(text): View Source

def korean_transliterate(text):
    """Transliterate *text* using a ``Transliter`` built with the ``academic`` rule."""
    transliterator = Transliter(academic)
    romanised = transliterator.translit(text)
    return romanised

DEFAULT_VOCAB_FILE = '/Users/e6d64/Documents/GitHub/darkshapes/divisor/divisor/acestep/models/lyrics_utils/vocab.json'

class VoiceBpeTokenizer: View Source

class VoiceBpeTokenizer:
    """Tokenizer wrapper that cleans text per language before BPE encoding.

    ``encode`` normalises the text for the given language, prefixes a
    ``[lang]`` tag, replaces spaces with the literal ``[SPACE]`` marker and
    returns the token ids.  ``decode`` reverses the marker substitution.
    """

    def __init__(self, vocab_file=DEFAULT_VOCAB_FILE):
        """Load the tokenizer from *vocab_file*.

        Passing ``None`` skips loading and leaves ``self.tokenizer`` as
        ``None``; the encode/decode/size methods then cannot be used.
        """
        self.tokenizer = None
        if vocab_file is not None:
            self.tokenizer = Tokenizer.from_file(vocab_file)
        # Per-language character limits.  Nothing in this class enforces them
        # at present (see check_input_length).
        self.char_limits = {
            "en": 10000,
            "de": 253,
            "fr": 273,
            "es": 239,
            "it": 213,
            "pt": 203,
            "pl": 224,
            "zh": 82,
            "ar": 166,
            "cs": 186,
            "ru": 182,
            "nl": 251,
            "tr": 226,
            "ja": 71,
            "hu": 224,
            "ko": 95,
        }

    @cached_property
    def katsu(self):
        """Return a ``cutlet.Cutlet`` instance, created on first access.

        ``cutlet`` is imported here so that it is only needed when this
        property is actually used.
        """
        import cutlet

        return cutlet.Cutlet()

    def check_input_length(self, txt, lang):
        """Hook for validating the length of *txt*; currently checks nothing.

        The over-limit warning that used ``self.char_limits`` (fallback 250)
        is disabled, so this method has no effect and returns ``None``.
        It is kept because ``encode`` calls it.
        """
        return None

    def preprocess_text(self, txt, lang):
        """Clean *txt* with the pipeline that matches *lang*.

        Raises:
            NotImplementedError: If *lang* is not one of the handled codes.
        """
        if lang in {
            "ar",
            "cs",
            "de",
            "en",
            "es",
            "fr",
            "hu",
            "it",
            "nl",
            "pl",
            "pt",
            "ru",
            "tr",
            "zh",
            "ko",
        }:
            txt = multilingual_cleaners(txt, lang)
            # Chinese and Korean are additionally transliterated after cleaning.
            if lang == "zh":
                txt = chinese_transliterate(txt)
            if lang == "ko":
                txt = korean_transliterate(txt)
        elif lang == "ja":
            txt = japanese_cleaners(txt, self.katsu)
        elif lang == "hi":
            # @manmay will implement this
            txt = basic_cleaners(txt)
        else:
            raise NotImplementedError(f"Language '{lang}' is not supported.")
        return txt

    def encode(self, txt, lang):
        """Clean *txt* for *lang* and return its token ids.

        Any region suffix in *lang* (the part after the first "-") is dropped
        before cleaning; "zh" is tagged as "zh-cn" in the encoded text.
        """
        lang = lang.split("-")[0]  # remove the region
        self.check_input_length(txt, lang)
        txt = self.preprocess_text(txt, lang)
        lang = "zh-cn" if lang == "zh" else lang
        txt = f"[{lang}]{txt}"
        txt = txt.replace(" ", "[SPACE]")
        return self.tokenizer.encode(txt).ids

    def decode(self, seq, skip_special_tokens=False):
        """Decode the token ids in *seq* back into text.

        A ``torch.Tensor`` is moved to the CPU and converted to a NumPy array
        first.  Spaces in the raw decoded string are removed, ``[SPACE]``
        markers become real spaces and ``[STOP]`` markers are dropped.

        *skip_special_tokens* is accepted but not used: the underlying
        tokenizer is always called with ``skip_special_tokens=False``.
        """
        if isinstance(seq, torch.Tensor):
            seq = seq.cpu().numpy()
        txt = self.tokenizer.decode(seq, skip_special_tokens=False).replace(" ", "")
        txt = txt.replace("[SPACE]", " ")
        txt = txt.replace("[STOP]", "")
        # "[UNK]" markers are intentionally left in the output.
        return txt

    # copy from https://github.com/huggingface/transformers/blob/main/src/transformers/tokenization_utils_base.py#L3936
    def batch_decode(
        self,
        sequences: Union[
            List[int], List[List[int]], "np.ndarray", "torch.Tensor", "tf.Tensor"
        ],
        skip_special_tokens: bool = False,
    ) -> List[str]:
        """
        Convert a batch of token-id sequences into strings by calling `decode` on each.

        Args:
            sequences (`Union[List[int], List[List[int]], np.ndarray, torch.Tensor, tf.Tensor]`):
                The batch to decode; it is iterated once and every item is
                passed to `decode`.
            skip_special_tokens (`bool`, *optional*, defaults to `False`):
                Forwarded to `decode`, which currently does not use it.

        Returns:
            `List[str]`: The list of decoded sentences.
        """
        return [
            self.decode(seq, skip_special_tokens=skip_special_tokens)
            for seq in sequences
        ]

    # https://github.com/coqui-ai/TTS/blob/dev/TTS/tts/layers/xtts/trainer/dataset.py#L202
    # def pad(self):

    def __len__(self):
        """Return the vocabulary size reported by the tokenizer."""
        return self.tokenizer.get_vocab_size()

    def get_number_tokens(self):
        """Return one more than the largest token id in the vocabulary."""
        return max(self.tokenizer.get_vocab().values()) + 1

VoiceBpeTokenizer( vocab_file='/Users/e6d64/Documents/GitHub/darkshapes/divisor/divisor/acestep/models/lyrics_utils/vocab.json') View Source

627    def __init__(self, vocab_file=DEFAULT_VOCAB_FILE):
628        self.tokenizer = None
629        if vocab_file is not None:
630            self.tokenizer = Tokenizer.from_file(vocab_file)
631        self.char_limits = {
632            "en": 10000,
633            "de": 253,
634            "fr": 273,
635            "es": 239,
636            "it": 213,
637            "pt": 203,
638            "pl": 224,
639            "zh": 82,
640            "ar": 166,
641            "cs": 186,
642            "ru": 182,
643            "nl": 251,
644            "tr": 226,
645            "ja": 71,
646            "hu": 224,
647            "ko": 95,
648        }

tokenizer

char_limits

katsu View Source

650    @cached_property
651    def katsu(self):
652        import cutlet
653
654        return cutlet.Cutlet()

def check_input_length(self, txt, lang): View Source

656    def check_input_length(self, txt, lang):
657        lang = lang.split("-")[0]  # remove the region
658        limit = self.char_limits.get(lang, 250)
659        # if len(txt) > limit:
660        #     print(
661        #         f"[!] Warning: The text length exceeds the character limit of {limit} for language '{lang}', this might cause truncated audio."
662        #     )

def preprocess_text(self, txt, lang): View Source

664    def preprocess_text(self, txt, lang):
665        if lang in {
666            "ar",
667            "cs",
668            "de",
669            "en",
670            "es",
671            "fr",
672            "hu",
673            "it",
674            "nl",
675            "pl",
676            "pt",
677            "ru",
678            "tr",
679            "zh",
680            "ko",
681        }:
682            txt = multilingual_cleaners(txt, lang)
683            if lang == "zh":
684                txt = chinese_transliterate(txt)
685            if lang == "ko":
686                txt = korean_transliterate(txt)
687        elif lang == "ja":
688            txt = japanese_cleaners(txt, self.katsu)
689        elif lang == "hi":
690            # @manmay will implement this
691            txt = basic_cleaners(txt)
692        else:
693            raise NotImplementedError(f"Language '{lang}' is not supported.")
694        return txt

def encode(self, txt, lang): View Source

696    def encode(self, txt, lang):
697        lang = lang.split("-")[0]  # remove the region
698        self.check_input_length(txt, lang)
699        txt = self.preprocess_text(txt, lang)
700        lang = "zh-cn" if lang == "zh" else lang
701        txt = f"[{lang}]{txt}"
702        txt = txt.replace(" ", "[SPACE]")
703        return self.tokenizer.encode(txt).ids

def decode(self, seq, skip_special_tokens=False): View Source

705    def decode(self, seq, skip_special_tokens=False):
706        if isinstance(seq, torch.Tensor):
707            seq = seq.cpu().numpy()
708        txt = self.tokenizer.decode(seq, skip_special_tokens=False).replace(" ", "")
709        txt = txt.replace("[SPACE]", " ")
710        txt = txt.replace("[STOP]", "")
711        # txt = txt.replace("[UNK]", "")
712        return txt

def batch_decode( self, sequences: Union[List[int], List[List[int]], ForwardRef('np.ndarray'), ForwardRef('torch.Tensor'), ForwardRef('tf.Tensor')], skip_special_tokens: bool = False) -> List[str]: View Source

715    def batch_decode(
716        self,
717        sequences: Union[
718            List[int], List[List[int]], "np.ndarray", "torch.Tensor", "tf.Tensor"
719        ],
720        skip_special_tokens: bool = False,
721    ) -> List[str]:
722        """
723        Convert a list of lists of token ids into a list of strings by calling decode.
724
725        Args:
726            sequences (`Union[List[int], List[List[int]], np.ndarray, torch.Tensor, tf.Tensor]`):
727                List of tokenized input ids. Can be obtained using the `__call__` method.
728            skip_special_tokens (`bool`, *optional*, defaults to `False`):
729                Whether or not to remove special tokens in the decoding.
730            kwargs (additional keyword arguments, *optional*):
731                Will be passed to the underlying model specific decode method.
732
733        Returns:
734            `List[str]`: The list of decoded sentences.
735        """
736        return [self.decode(seq) for seq in sequences]

Convert a list of lists of token ids into a list of strings by calling decode.

Args: sequences (Union[List[int], List[List[int]], np.ndarray, torch.Tensor, tf.Tensor]): Batch of token-id sequences; each item is passed to decode. skip_special_tokens (bool, optional, defaults to False): Whether or not to remove special tokens in the decoding. Note that the method takes no additional keyword arguments.

Returns: List[str]: The list of decoded sentences.

def get_number_tokens(self): View Source

744    def get_number_tokens(self):
745        return max(self.tokenizer.get_vocab().values()) + 1

def test_expand_numbers_multilingual(): View Source

def test_expand_numbers_multilingual():
    """Check number expansion against fixtures for each covered language.

    Every fixture is a ``(raw_text, expected_text, language_code)`` triple; the
    raw text is passed through ``expand_numbers_multilingual`` and compared
    with the expected text.  Commented-out fixtures record known gaps.
    """
    fixtures = [
        # English
        ("In 12.5 seconds.", "In twelve point five seconds.", "en"),
        ("There were 50 soldiers.", "There were fifty soldiers.", "en"),
        ("This is a 1st test", "This is a first test", "en"),
        ("That will be $20 sir.", "That will be twenty dollars sir.", "en"),
        ("That will be 20€ sir.", "That will be twenty euro sir.", "en"),
        (
            "That will be 20.15€ sir.",
            "That will be twenty euro, fifteen cents sir.",
            "en",
        ),
        ("That's 100,000.5.", "That's one hundred thousand point five.", "en"),
        # French
        ("En 12,5 secondes.", "En douze virgule cinq secondes.", "fr"),
        ("Il y avait 50 soldats.", "Il y avait cinquante soldats.", "fr"),
        ("Ceci est un 1er test", "Ceci est un premier test", "fr"),
        (
            "Cela vous fera $20 monsieur.",
            "Cela vous fera vingt dollars monsieur.",
            "fr",
        ),
        ("Cela vous fera 20€ monsieur.", "Cela vous fera vingt euros monsieur.", "fr"),
        (
            "Cela vous fera 20,15€ monsieur.",
            "Cela vous fera vingt euros et quinze centimes monsieur.",
            "fr",
        ),
        ("Ce sera 100.000,5.", "Ce sera cent mille virgule cinq.", "fr"),
        # German
        ("In 12,5 Sekunden.", "In zwölf Komma fünf Sekunden.", "de"),
        ("Es gab 50 Soldaten.", "Es gab fünfzig Soldaten.", "de"),
        ("Dies ist ein 1. Test", "Dies ist ein erste Test", "de"),  # Issue with gender
        ("Das macht $20 Herr.", "Das macht zwanzig Dollar Herr.", "de"),
        ("Das macht 20€ Herr.", "Das macht zwanzig Euro Herr.", "de"),
        (
            "Das macht 20,15€ Herr.",
            "Das macht zwanzig Euro und fünfzehn Cent Herr.",
            "de",
        ),
        # Spanish
        ("En 12,5 segundos.", "En doce punto cinco segundos.", "es"),
        ("Había 50 soldados.", "Había cincuenta soldados.", "es"),
        ("Este es un 1er test", "Este es un primero test", "es"),
        ("Eso le costará $20 señor.", "Eso le costará veinte dólares señor.", "es"),
        ("Eso le costará 20€ señor.", "Eso le costará veinte euros señor.", "es"),
        (
            "Eso le costará 20,15€ señor.",
            "Eso le costará veinte euros con quince céntimos señor.",
            "es",
        ),
        # Italian
        ("In 12,5 secondi.", "In dodici virgola cinque secondi.", "it"),
        ("C'erano 50 soldati.", "C'erano cinquanta soldati.", "it"),
        ("Questo è un 1° test", "Questo è un primo test", "it"),
        ("Ti costerà $20 signore.", "Ti costerà venti dollari signore.", "it"),
        ("Ti costerà 20€ signore.", "Ti costerà venti euro signore.", "it"),
        (
            "Ti costerà 20,15€ signore.",
            "Ti costerà venti euro e quindici centesimi signore.",
            "it",
        ),
        # Portuguese
        ("Em 12,5 segundos.", "Em doze vírgula cinco segundos.", "pt"),
        ("Havia 50 soldados.", "Havia cinquenta soldados.", "pt"),
        ("Este é um 1º teste", "Este é um primeiro teste", "pt"),
        ("Isso custará $20 senhor.", "Isso custará vinte dólares senhor.", "pt"),
        ("Isso custará 20€ senhor.", "Isso custará vinte euros senhor.", "pt"),
        (
            "Isso custará 20,15€ senhor.",
            "Isso custará vinte euros e quinze cêntimos senhor.",
            "pt",
        ),  # "cêntimos" should be "centavos" num2words issue
        # Polish
        ("W 12,5 sekundy.", "W dwanaście przecinek pięć sekundy.", "pl"),
        ("Było 50 żołnierzy.", "Było pięćdziesiąt żołnierzy.", "pl"),
        (
            "To będzie kosztować 20€ panie.",
            "To będzie kosztować dwadzieścia euro panie.",
            "pl",
        ),
        (
            "To będzie kosztować 20,15€ panie.",
            "To będzie kosztować dwadzieścia euro, piętnaście centów panie.",
            "pl",
        ),
        # Arabic
        ("في الـ 12,5 ثانية.", "في الـ اثنا عشر  , خمسون ثانية.", "ar"),
        ("كان هناك 50 جنديًا.", "كان هناك خمسون جنديًا.", "ar"),
        # ("ستكون النتيجة $20 يا سيد.", 'ستكون النتيجة عشرون دولار يا سيد.', 'ar'), # $ and € are missing from num2words
        # ("ستكون النتيجة 20€ يا سيد.", 'ستكون النتيجة عشرون يورو يا سيد.', 'ar'),
        # Czech
        ("Za 12,5 vteřiny.", "Za dvanáct celá pět vteřiny.", "cs"),
        ("Bylo tam 50 vojáků.", "Bylo tam padesát vojáků.", "cs"),
        ("To bude stát 20€ pane.", "To bude stát dvacet euro pane.", "cs"),
        ("To bude 20.15€ pane.", "To bude dvacet euro, patnáct centů pane.", "cs"),
        # Russian
        ("Через 12.5 секунды.", "Через двенадцать запятая пять секунды.", "ru"),
        ("Там было 50 солдат.", "Там было пятьдесят солдат.", "ru"),
        (
            "Это будет 20.15€ сэр.",
            "Это будет двадцать евро, пятнадцать центов сэр.",
            "ru",
        ),
        (
            "Это будет стоить 20€ господин.",
            "Это будет стоить двадцать евро господин.",
            "ru",
        ),
        # Dutch
        ("In 12,5 seconden.", "In twaalf komma vijf seconden.", "nl"),
        ("Er waren 50 soldaten.", "Er waren vijftig soldaten.", "nl"),
        ("Dat wordt dan $20 meneer.", "Dat wordt dan twintig dollar meneer.", "nl"),
        ("Dat wordt dan 20€ meneer.", "Dat wordt dan twintig euro meneer.", "nl"),
        # Chinese (Simplified)
        ("在12.5秒内", "在十二点五秒内", "zh"),
        ("有50名士兵", "有五十名士兵", "zh"),
        # ("那将是$20先生", '那将是二十美元先生', 'zh'), currency doesn't work
        # ("那将是20€先生", '那将是二十欧元先生', 'zh'),
        # Turkish
        # ("12,5 saniye içinde.", 'On iki virgül beş saniye içinde.', 'tr'), # decimal doesn't work for TR
        ("50 asker vardı.", "elli asker vardı.", "tr"),
        ("Bu 1. test", "Bu birinci test", "tr"),
        # ("Bu 100.000,5.", 'Bu yüz bin virgül beş.', 'tr'),
        # Hungarian
        ("12,5 másodperc alatt.", "tizenkettő egész öt tized másodperc alatt.", "hu"),
        ("50 katona volt.", "ötven katona volt.", "hu"),
        ("Ez az 1. teszt", "Ez az első teszt", "hu"),
        # Korean
        ("12.5 초 안에.", "십이 점 다섯 초 안에.", "ko"),
        ("50 명의 병사가 있었다.", "오십 명의 병사가 있었다.", "ko"),
        ("이것은 1 번째 테스트입니다", "이것은 첫 번째 테스트입니다", "ko"),
    ]
    for raw_text, expected, lang_code in fixtures:
        actual = expand_numbers_multilingual(raw_text, lang=lang_code)
        assert actual == expected, f"'{actual}' vs '{expected}'"

def test_abbreviations_multilingual(): View Source

def test_abbreviations_multilingual():
    """Check abbreviation expansion against fixtures for each covered language.

    Every fixture is a ``(raw_text, expected_text, language_code)`` triple; the
    raw text is passed through ``expand_abbreviations_multilingual`` and
    compared with the expected text.
    """
    fixtures = [
        # English
        ("Hello Mr. Smith.", "Hello mister Smith.", "en"),
        ("Dr. Jones is here.", "doctor Jones is here.", "en"),
        # Spanish
        ("Hola Sr. Garcia.", "Hola señor Garcia.", "es"),
        ("La Dra. Martinez es muy buena.", "La doctora Martinez es muy buena.", "es"),
        # French
        ("Bonjour Mr. Dupond.", "Bonjour monsieur Dupond.", "fr"),
        (
            "Mme. Moreau est absente aujourd'hui.",
            "madame Moreau est absente aujourd'hui.",
            "fr",
        ),
        # German
        ("Frau Dr. Müller ist sehr klug.", "Frau doktor Müller ist sehr klug.", "de"),
        # Portuguese
        ("Olá Sr. Silva.", "Olá senhor Silva.", "pt"),
        (
            "Dra. Costa, você está disponível?",
            "doutora Costa, você está disponível?",
            "pt",
        ),
        # Italian
        ("Buongiorno, Sig. Rossi.", "Buongiorno, signore Rossi.", "it"),
        # ("Sig.ra Bianchi, posso aiutarti?", 'signora Bianchi, posso aiutarti?', 'it'), # Issue with matching that pattern
        # Polish
        ("Dzień dobry, P. Kowalski.", "Dzień dobry, pani Kowalski.", "pl"),
        (
            "M. Nowak, czy mogę zadać pytanie?",
            "pan Nowak, czy mogę zadać pytanie?",
            "pl",
        ),
        # Czech
        ("P. Novák", "pan Novák", "cs"),
        ("Dr. Vojtěch", "doktor Vojtěch", "cs"),
        # Dutch
        ("Dhr. Jansen", "de heer Jansen", "nl"),
        ("Mevr. de Vries", "mevrouw de Vries", "nl"),
        # Russian
        ("Здравствуйте Г-н Иванов.", "Здравствуйте господин Иванов.", "ru"),
        (
            "Д-р Смирнов здесь, чтобы увидеть вас.",
            "доктор Смирнов здесь, чтобы увидеть вас.",
            "ru",
        ),
        # Turkish
        ("Merhaba B. Yılmaz.", "Merhaba bay Yılmaz.", "tr"),
        ("Dr. Ayşe burada.", "doktor Ayşe burada.", "tr"),
        # Hungarian
        ("Dr. Szabó itt van.", "doktor Szabó itt van.", "hu"),
    ]

    for raw_text, expected, lang_code in fixtures:
        actual = expand_abbreviations_multilingual(raw_text, lang=lang_code)
        assert actual == expected, f"'{actual}' vs '{expected}'"

def test_symbols_multilingual(): View Source

def test_symbols_multilingual():
    """Check symbol expansion against fixtures for each covered language.

    Every fixture is a ``(raw_text, expected_text, language_code)`` triple; the
    raw text is passed through ``expand_symbols_multilingual`` and compared
    with the expected text.
    """
    fixtures = [
        ("I have 14% battery", "I have 14 percent battery", "en"),
        ("Te veo @ la fiesta", "Te veo arroba la fiesta", "es"),
        ("J'ai 14° de fièvre", "J'ai 14 degrés de fièvre", "fr"),
        ("Die Rechnung beträgt £ 20", "Die Rechnung beträgt pfund 20", "de"),
        (
            "O meu email é ana&joao@gmail.com",
            "O meu email é ana e joao arroba gmail.com",
            "pt",
        ),
        (
            "linguaggio di programmazione C#",
            "linguaggio di programmazione C cancelletto",
            "it",
        ),
        ("Moja temperatura to 36.6°", "Moja temperatura to 36.6 stopnie", "pl"),
        ("Mám 14% baterie", "Mám 14 procento baterie", "cs"),
        ("Těším se na tebe @ party", "Těším se na tebe na party", "cs"),
        ("У меня 14% заряда", "У меня 14 процентов заряда", "ru"),
        ("Я буду @ дома", "Я буду собака дома", "ru"),
        ("Ik heb 14% batterij", "Ik heb 14 procent batterij", "nl"),
        ("Ik zie je @ het feest", "Ik zie je bij het feest", "nl"),
        ("لدي 14% في البطارية", "لدي 14 في المئة في البطارية", "ar"),
        ("我的电量为 14%", "我的电量为 14 百分之", "zh"),
        ("Pilim %14 dolu.", "Pilim yüzde 14 dolu.", "tr"),
        (
            "Az akkumulátorom töltöttsége 14%",
            "Az akkumulátorom töltöttsége 14 százalék",
            "hu",
        ),
        ("배터리 잔량이 14%입니다.", "배터리 잔량이 14 퍼센트입니다.", "ko"),
    ]

    for raw_text, expected, lang_code in fixtures:
        actual = expand_symbols_multilingual(raw_text, lang=lang_code)
        assert actual == expected, f"'{actual}' vs '{expected}'"