idiap · ghost · Jul 18, 2024 · Jul 18, 2024 · Jul 18, 2024 · Jul 18, 2024
diff --git a/TTS/tts/layers/xtts/tokenizer.py b/TTS/tts/layers/xtts/tokenizer.py
@@ -1,10 +1,12 @@
-import logging
 import os
 import re
 import textwrap
 from functools import cached_property
 
+import pypinyin
 import torch
+from hangul_romanize import Transliter
+from hangul_romanize.rule import academic
 from num2words import num2words
 from spacy.lang.ar import Arabic
 from spacy.lang.en import English
@@ -16,11 +18,8 @@
 
 from TTS.tts.layers.xtts.zh_num2words import TextNorm as zh_num2words
 
-logger = logging.getLogger(__name__)
-
 
 def get_spacy_lang(lang):
-    """Return Spacy language used for sentence splitting."""
     if lang == "zh":
         return Chinese()
     elif lang == "ja":
@@ -32,7 +31,7 @@ def get_spacy_lang(lang):
     elif lang == "hi":
         return Hindi()
     else:
-        # For most languages, English does the job
+        # For most languages, Enlish does the job
         return English()
 
 
@@ -233,6 +232,12 @@ def split_sentence(text, lang, text_split_length=250):
             # Korean doesn't typically use abbreviations in the same way as Latin-based scripts.
         ]
     ],
+    "hi": [
+        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
+        for x in [
+            # Hindi doesn't typically use abbreviations in the same way as Latin-based scripts.
+        ]
+    ],
 }
 
 
@@ -429,6 +434,18 @@ def expand_abbreviations_multilingual(text, lang="en"):
             ("°", " 도 "),
         ]
     ],
+    "hi": [
+        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
+        for x in [
+            ("&", " और "),
+            ("@", " ऐट दी रेट "),
+            ("%", " प्रतिशत "),
+            ("#", " हैश "),
+            ("$", " डॉलर "),
+            ("£", " पाउंड "),
+            ("°", " डिग्री "),
+        ]
+    ],
 }
 
 
@@ -454,6 +471,7 @@ def expand_symbols_multilingual(text, lang="en"):
     "tr": re.compile(r"([0-9]+)(\.|inci|nci|uncu|üncü|\.)"),
     "hu": re.compile(r"([0-9]+)(\.|adik|edik|odik|edik|ödik|ödike|ik)"),
     "ko": re.compile(r"([0-9]+)(번째|번|차|째)"),
+        "hi": re.compile(r"([0-9]+)(st|nd|rd|th)") # To check
 }
 _number_re = re.compile(r"[0-9]+")
 _currency_re = {
@@ -478,6 +496,9 @@ def _remove_dots(m):
     text = m.group(0)
     if "." in text:
         text = text.replace(".", "")
+    # For Hindi
+    elif "।" in text:
+        text = text.replace("।", "")
     return text
 
 
@@ -505,6 +526,7 @@ def _expand_currency(m, lang="en", currency="USD"):
         "tr": ", ",
         "hu": ", ",
         "ko": ", ",
+        "hi": ", ",
     }
 
     if amount.is_integer():
@@ -574,10 +596,6 @@ def basic_cleaners(text):
 
 
 def chinese_transliterate(text):
-    try:
-        import pypinyin
-    except ImportError as e:
-        raise ImportError("Chinese requires: pypinyin") from e
     return "".join(
         [p[0] for p in pypinyin.pinyin(text, style=pypinyin.Style.TONE3, heteronym=False, neutral_tone_with_five=True)]
     )
@@ -590,11 +608,6 @@ def japanese_cleaners(text, katsu):
 
 
 def korean_transliterate(text):
-    try:
-        from hangul_romanize import Transliter
-        from hangul_romanize.rule import academic
-    except ImportError as e:
-        raise ImportError("Korean requires: hangul_romanize") from e
     r = Transliter(academic)
     return r.translit(text)
 
@@ -637,14 +650,12 @@ def check_input_length(self, txt, lang):
         lang = lang.split("-")[0]  # remove the region
         limit = self.char_limits.get(lang, 250)
         if len(txt) > limit:
-            logger.warning(
-                "The text length exceeds the character limit of %d for language '%s', this might cause truncated audio.",
-                limit,
-                lang,
+            print(
+                f"[!] Warning: The text length exceeds the character limit of {limit} for language '{lang}', this might cause truncated audio."
             )
 
     def preprocess_text(self, txt, lang):
-        if lang in {"ar", "cs", "de", "en", "es", "fr", "hu", "it", "nl", "pl", "pt", "ru", "tr", "zh", "ko"}:
+        if lang in {"ar", "cs", "de", "en", "es", "fr", "hu", "it", "nl", "pl", "pt", "ru", "tr", "zh", "ko", "hi"}:
             txt = multilingual_cleaners(txt, lang)
             if lang == "zh":
                 txt = chinese_transliterate(txt)
@@ -777,6 +788,9 @@ def test_expand_numbers_multilingual():
         ("12.5 초 안에.", "십이 점 다섯 초 안에.", "ko"),
         ("50 명의 병사가 있었다.", "오십 명의 병사가 있었다.", "ko"),
         ("이것은 1 번째 테스트입니다", "이것은 첫 번째 테스트입니다", "ko"),
+        # Hindi
+        ("12.5 सेकंड में।", "साढ़े बारह सेकंड में।", "hi"),
+        ("50 सैनिक थे।", "पचास सैनिक थे।", "hi"),
     ]
     for a, b, lang in test_cases:
         out = expand_numbers_multilingual(a, lang=lang)
@@ -846,6 +860,7 @@ def test_symbols_multilingual():
         ("Pilim %14 dolu.", "Pilim yüzde 14 dolu.", "tr"),
         ("Az akkumulátorom töltöttsége 14%", "Az akkumulátorom töltöttsége 14 százalék", "hu"),
         ("배터리 잔량이 14%입니다.", "배터리 잔량이 14 퍼센트입니다.", "ko"),
+        ("मेरे पास 14% बैटरी है।", "मेरे पास चौदह प्रतिशत बैटरी है।",  "hi"),
     ]
 
     for a, b, lang in test_cases: