Spaces:
Runtime error
Runtime error
| import os, logging, datetime, json, random | |
| import gradio as gr | |
| import numpy as np | |
| import torch | |
| import re_matching | |
| import utils | |
| from infer import infer, latest_version, get_net_g, infer_multilang | |
| import gradio as gr | |
| from config import config | |
| from tools.webui import reload_javascript, get_character_html | |
| from tools.sentence import split_by_language | |
# Log records are rendered as "[LEVEL|YYYY-mm-dd HH:MM:SS]message".
logging.basicConfig(
    format="[%(levelname)s|%(asctime)s]%(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=logging.INFO,
)

# Inference device comes from the web UI configuration.
device = config.webui_config.device
if device == "mps":
    # Must be exported before the model is built below.
    os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"

# Hyper-parameters and generator network shared by every request.
hps = utils.get_hparams_from_file(config.webui_config.config_path)
# Configs without an explicit version are treated as the latest one.
version = getattr(hps, "version", latest_version)
net_g = get_net_g(
    model_path=config.webui_config.model,
    version=version,
    device=device,
    hps=hps,
)

# Static assets: the page stylesheet and the table of scripted lines.
with open("./css/style.css", "r", encoding="utf-8") as f:
    customCSS = f.read()
with open("./assets/lines.json", "r", encoding="utf-8") as f:
    full_lines = json.load(f)
def speak_fn(
    text: str,
    exceed_flag,
    speaker="TalkFlower_CNzh",
    sdp_ratio=0.2,  # SDP/DP mix ratio
    noise_scale=0.6,  # emotion
    noise_scale_w=0.6,  # phoneme length
    length_scale=0.9,  # speaking rate
    language="ZH",
    reference_audio=None,
    emotion=4,
    interval_between_para=0.2,  # pause between paragraphs
    interval_between_sent=1,  # pause between sentences
):
    """Synthesize speech for ``text`` and build the four Gradio outputs.

    Runs of blank lines in ``text`` are collapsed first. If the result is
    longer than 512 characters nothing is synthesized: ``text`` is replaced
    by a canned "too long" reply in the selected voice's language, the
    matching pre-recorded clip is returned, and ``exceed_flag`` is toggled.
    Otherwise ``text`` is split on ``"|"``; each non-empty segment is split
    by language (zh / ja / en) and synthesized with ``infer_multilang``, and
    the resulting 16-bit clips are concatenated in order.

    Args:
        text: Text to speak; ``"|"`` separates independently synthesized
            segments.
        exceed_flag: Boolean state value, inverted whenever the text is
            rejected as too long and passed through unchanged otherwise.
        speaker: ``"Chinese"``, ``"English"`` or ``"Japanese"``. Any other
            value (including the default) selects the Chinese voice.
        sdp_ratio, noise_scale, noise_scale_w, length_scale,
        reference_audio, emotion: Forwarded unchanged to
            ``infer_multilang``.
        language, interval_between_para, interval_between_sent: Not read by
            the current multi-language path; kept so existing callers that
            pass them keep working.

    Returns:
        A 4-tuple ``(audio_update, character_html, exceed_flag,
        unlock_update)``. ``audio_update`` carries either a
        ``(sampling_rate, samples)`` pair, the path of a pre-recorded clip,
        or ``None`` when there was nothing to synthesize. ``unlock_update``
        sets ``interactive=True`` on its target component (presumably the
        control disabled by ``submit_lock_fn`` -- confirm against the event
        wiring).
    """
    # Map the UI label to a model speaker id; everything that is not
    # explicitly English or Japanese uses the Chinese voice.
    if speaker == "English":
        speaker = "TalkFlower_USen"
    elif speaker == "Japanese":
        speaker = "TalkFlower_JPja"
    else:
        speaker = "TalkFlower_CNzh"

    # Collapse any run of consecutive newlines down to a single newline.
    while "\n\n" in text:
        text = text.replace("\n\n", "\n")

    if len(text) > 512:
        logging.info(f"Too Long Text: {text}")
        # (reply text, pre-recorded clip) per speaker id.
        overlength_replies = {
            "TalkFlower_CNzh": (
                "这句太长了,憋坏我啦!",
                "./assets/audios/overlength.wav",
            ),
            "TalkFlower_USen": (
                "This sentence is too long!",
                "./assets/audios/overlength_en.wav",
            ),
            "TalkFlower_JPja": (
                "この文は長すぎます!",
                "./assets/audios/overlength_ja.wav",
            ),
        }
        text, audio_value = overlength_replies[speaker]
        exceed_flag = not exceed_flag
    else:
        audio_list = []
        segments = text.split("|")
        last_segment_idx = len(segments) - 1
        for segment_idx, segment in enumerate(segments):
            if segment == "":
                continue
            skip_start = segment_idx != 0
            skip_end = segment_idx != last_segment_idx
            sentences_list = split_by_language(
                segment, target_languages=["zh", "ja", "en"]
            )
            sentence_idx = 0
            while sentence_idx < len(sentences_list):
                text_to_generate = []
                lang_to_generate = []
                # NOTE(review): as written, this inner loop always consumes
                # the rest of sentences_list and folds it into ONE piece
                # (one list of texts plus one parallel list of language
                # codes). Consequently both skip flags end up False for
                # every infer_multilang call. Kept as-is so the generated
                # audio does not change.
                while True:
                    content, lang = sentences_list[sentence_idx]
                    temp_text = [content]
                    lang = lang.upper()
                    if lang == "JA":
                        # The splitter's "JA" is passed to the model as "JP".
                        lang = "JP"
                    if text_to_generate:
                        text_to_generate[-1] += [temp_text.pop(0)]
                        lang_to_generate[-1] += [lang]
                    if temp_text:
                        text_to_generate += [[i] for i in temp_text]
                        lang_to_generate += [[lang]] * len(temp_text)
                    if sentence_idx + 1 < len(sentences_list):
                        sentence_idx += 1
                    else:
                        break
                skip_start = (sentence_idx != 0) and skip_start
                skip_end = (sentence_idx != len(sentences_list) - 1) and skip_end
                logging.info(f"{speaker[-4:]}: {text_to_generate}{lang_to_generate}")
                with torch.no_grad():
                    for i, piece in enumerate(text_to_generate):
                        skip_start = (i != 0) and skip_start
                        skip_end = (i != len(text_to_generate) - 1) and skip_end
                        audio = infer_multilang(
                            piece,
                            reference_audio=reference_audio,
                            emotion=emotion,
                            sdp_ratio=sdp_ratio,
                            noise_scale=noise_scale,
                            noise_scale_w=noise_scale_w,
                            length_scale=length_scale,
                            sid=speaker,
                            language=lang_to_generate[i],
                            hps=hps,
                            net_g=net_g,
                            device=device,
                            skip_start=skip_start,
                            skip_end=skip_end,
                        )
                        audio16bit = gr.processing_utils.convert_to_16_bit_wav(audio)
                        audio_list.append(audio16bit)
                sentence_idx += 1

        if audio_list:
            audio_value = (hps.data.sampling_rate, np.concatenate(audio_list))
        else:
            # Empty text or only "|" separators: np.concatenate([]) would
            # raise ValueError and abort the callback before the final
            # interactive=True update is returned. Clear the player instead.
            audio_value = None

    return (
        gr.update(value=audio_value, autoplay=True),
        get_character_html(text),
        exceed_flag,
        gr.update(interactive=True),
    )
def submit_lock_fn():
    """Return a Gradio update that makes its target component non-interactive."""
    locked = gr.update(interactive=False)
    return locked
def init_fn():
    """Show the release notice and return the HTML for a random welcome line.

    The welcome line is drawn from entries 1 through 7 of the "Welcome"
    category.
    """
    gr.Info("2023-11-27: 支持多语言(中、英、日);支持更换音色! Support Chinese, English, Japanese; Support changing voices!")
    welcome_index = random.randint(1, 7)
    greeting = get_sentence("Welcome", welcome_index)
    return get_character_html(greeting)
def get_sentence(category, index=-1):
    """Look up one scripted line of ``category`` in ``full_lines``.

    Entries are keyed by the string form of a 1-based number. With the
    default ``index=-1`` an entry number between 1 and the category size is
    chosen at random; otherwise the given ``index`` is used as-is.
    """
    entries = full_lines[category]
    chosen = random.randint(1, len(entries)) if index == -1 else index
    return entries[f"{chosen}"]