-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtts.py
94 lines (82 loc) · 3.92 KB
/
tts.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
#file -- TTSService.py --
import os
import nltk
import torch
import warnings
import numpy as np
from transformers import AutoProcessor, BarkModel
os.environ["SUNO_OFFLOAD_CPU"] = "True"
os.environ["SUNO_USE_SMALL_MODELS"] = "True"
warnings.filterwarnings(
"ignore",
message="torch.nn.utils.weight_norm is deprecated in favor of torch.nn.utils.parametrizations.weight_norm.",
)
# Print iterations progress
def printProgressBar (iteration, total, prefix = '', suffix = '', decimals = 1, length = 100, fill = '█', printEnd = "\r"):
"""
Call in a loop to create terminal progress bar
@params:
iteration - Required : current iteration (Int)
total - Required : total iterations (Int)
prefix - Optional : prefix string (Str)
suffix - Optional : suffix string (Str)
decimals - Optional : positive number of decimals in percent complete (Int)
length - Optional : character length of bar (Int)
fill - Optional : bar fill character (Str)
printEnd - Optional : end character (e.g. "\r", "\r\n") (Str)
"""
percent = ("{0:." + str(decimals) + "f}").format(100 * (iteration / float(total)))
filledLength = int(length * iteration // total)
bar = fill * filledLength + '-' * (length - filledLength)
print(f'\r{prefix} |{bar}| {percent}% {suffix}', end = printEnd)
# Print New Line on Complete
if iteration == total:
print()
class TextToSpeechService:
def __init__(self, device: str = "cuda" if torch.cuda.is_available() else "cpu"):
"""
Initializes the TextToSpeechService class.
Args:
device (str, optional): The device to be used for the model, either "cuda" if a GPU is available or "cpu".
Defaults to "cuda" if available, otherwise "cpu".
"""
self.device = device
self.processor = AutoProcessor.from_pretrained("suno/bark-small")
self.model = BarkModel.from_pretrained("suno/bark-small")
self.model.to(self.device)
def synthesize(self, text: str, voice_preset: str = "v2/en_speaker_9"):
"""
Synthesizes audio from the given text using the specified voice preset.
Args:
text (str): The input text to be synthesized.
voice_preset (str, optional): The voice preset to be used for the synthesis. Defaults to "v2/en_speaker_9".
Returns:
tuple: A tuple containing the sample rate and the generated audio array.
"""
inputs = self.processor(text, voice_preset=voice_preset, return_tensors="pt")
inputs = {k: v.to(self.device) for k, v in inputs.items()}
with torch.no_grad():
audio_array = self.model.generate(**inputs, pad_token_id=10000)
audio_array = audio_array.cpu().numpy().squeeze()
sample_rate = self.model.generation_config.sample_rate
return sample_rate, audio_array
def long_form_synthesize(self, text: str, voice_preset: str = "v2/en_speaker_9"):
"""
Synthesizes audio from the given long-form text using the specified voice preset.
Args:
text (str): The input text to be synthesized.
voice_preset (str, optional): The voice preset to be used for the synthesis. Defaults to "v2/en_speaker_9".
Returns:
tuple: A tuple containing the sample rate and the generated audio array.
"""
pieces = []
sentences = nltk.sent_tokenize(text)
silence = np.zeros(int(0.25 * self.model.generation_config.sample_rate))
for sent in sentences:
sample_rate, audio_array = self.synthesize(sent, voice_preset)
pieces += [audio_array, silence.copy()]
return self.model.generation_config.sample_rate, np.concatenate(pieces)
##TEST, but returns NumPy array so it needs to connect to something else
#speech = TextToSpeechService()
#result = speech.synthesize("Hello world.")
#print(result)