Files
Fay/tts/ms_tts_sdk.py
xszyou 0df7a26e8e fay进化
1. 内置RAG知识库(请把docx、pptx、txt文件存放到llm/data目录);
2. 流式回复逻辑优化;
3. 语音交互逻辑优化;
4. 线程安全增强;
5. 数字人驱动接口增加流式输出开始结束标记;
6. 修复因记忆反思而导致的记忆混乱,无法多轮对话问题;
7. 修复mcp工具获取与调用的线程同步问题;
8. 修复funasr依赖版本问题。
2025-06-27 23:33:24 +08:00

133 lines
5.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import time
import asyncio
import azure.cognitiveservices.speech as speechsdk
import asyncio
from tts import tts_voice
from tts.tts_voice import EnumVoice
from utils import util, config_util
from utils import config_util as cfg
import edge_tts
from pydub import AudioSegment
class Speech:
    """Text-to-speech front end backed by Azure Speech or edge-tts.

    When an Azure Speech key is configured (``config_util.key_ms_tts_key``)
    synthesis goes through the Azure SDK synthesizer; otherwise it goes
    through ``edge_tts`` and the resulting MP3 is converted to WAV.
    Results are cached per ``(voice_name, style, text)`` for the lifetime
    of the instance, so repeated requests return the same file path.
    """

    def __init__(self):
        # True only when the Azure synthesizer was created successfully.
        self.ms_tts = False
        ms_key = config_util.key_ms_tts_key
        if ms_key and ms_key.strip():
            self.__speech_config = speechsdk.SpeechConfig(
                subscription=config_util.key_ms_tts_key,
                region=config_util.key_ms_tts_region,
            )
            self.__speech_config.speech_recognition_language = "zh-CN"
            self.__speech_config.speech_synthesis_voice_name = self.__resolve_voice_name()
            self.__speech_config.set_speech_synthesis_output_format(
                speechsdk.SpeechSynthesisOutputFormat.Riff16Khz16BitMonoPcm
            )
            # audio_config=None: audio is pulled from the result object
            # instead of being played on an output device.
            self.__synthesizer = speechsdk.SpeechSynthesizer(
                speech_config=self.__speech_config, audio_config=None
            )
            self.ms_tts = True
        self.__connection = None
        # Cache: (voice_name, style, text) -> path of the generated WAV file.
        self.__history_data = {}

    def __resolve_voice_name(self):
        """Return the voice name for the voice selected in the configuration.

        A missing or blank configured voice is treated as "晓晓(edge)"; an
        unknown voice falls back to ``EnumVoice.XIAO_XIAO``.
        """
        configured = config_util.config["attribute"]["voice"]
        if configured is None or configured.strip() == "":
            configured = "晓晓(edge)"
        voice_type = tts_voice.get_voice_of(configured)
        if voice_type is None:
            return EnumVoice.XIAO_XIAO.value["voiceName"]
        return voice_type.value["voiceName"]

    def __get_history(self, voice_name, style, text):
        """Return the cached WAV path for this request, or None if not cached."""
        return self.__history_data.get((voice_name, style, text))

    def connect(self):
        """Open the Azure synthesizer connection (if Azure is in use) and log readiness."""
        if self.ms_tts:
            self.__connection = speechsdk.Connection.from_speech_synthesizer(self.__synthesizer)
            self.__connection.open(True)
        util.log(1, "TTS 服务已经连接!")

    def close(self):
        """Close the Azure connection if one is open; safe to call repeatedly."""
        if self.__connection is not None:
            self.__connection.close()
            # Drop the handle so a second close() is a no-op.
            self.__connection = None

    # Generate MP3 audio.
    async def get_edge_tts(self, text, voice, file_url) -> None:
        """Synthesize ``text`` with edge-tts voice ``voice`` and save it to ``file_url``."""
        communicate = edge_tts.Communicate(text, voice)
        await communicate.save(file_url)

    @staticmethod
    def _wav_path_for(mp3_filepath):
        """Return ``mp3_filepath`` with its extension replaced by ``.wav``.

        ``os.path.splitext`` only looks at the final path component, so a
        path such as ``./samples/foo`` (no extension, dot in the directory
        part) becomes ``./samples/foo.wav`` rather than ``.wav``.
        """
        import os

        return os.path.splitext(mp3_filepath)[0] + ".wav"

    def convert_mp3_to_wav(self, mp3_filepath):
        """Convert an MP3 file to a 44.1 kHz WAV file next to it.

        :param mp3_filepath: path of the source MP3 file
        :returns: path of the written WAV file
        """
        audio = AudioSegment.from_mp3(mp3_filepath)
        # Resample with set_frame_rate before exporting.
        audio = audio.set_frame_rate(44100)
        wav_filepath = self._wav_path_for(mp3_filepath)
        audio.export(wav_filepath, format="wav")
        return wav_filepath

    def __synthesize_azure(self, text):
        """Synthesize ``text`` via the Azure SDK; return the WAV path or None on failure."""
        # Plain-text synthesis is used instead of SSML: the original author
        # noted speak_text_async felt much faster than speak_ssml.
        result = self.__synthesizer.speak_text_async(text).get()
        # Check the outcome first so a failed request does not leave a
        # junk file on disk.
        if result.reason != speechsdk.ResultReason.SynthesizingAudioCompleted:
            util.log(1, "[x] 语音转换失败!")
            util.log(1, "[x] 原因: " + str(result.reason))
            return None
        file_url = './samples/sample-' + str(int(time.time() * 1000)) + '.wav'
        speechsdk.AudioDataStream(result).save_to_wav_file(file_url)
        return file_url

    def __synthesize_edge(self, text, voice_name):
        """Synthesize ``text`` via edge-tts; return the WAV path or None on failure."""
        try:
            file_url = './samples/sample-' + str(int(time.time() * 1000)) + '.mp3'
            loop = asyncio.new_event_loop()
            try:
                loop.run_until_complete(self.get_edge_tts(text, voice_name, file_url))
            finally:
                # Each call owns a private loop; close it so its resources
                # are released instead of accumulating per request.
                loop.close()
            return self.convert_mp3_to_wav(file_url)
        except Exception as e:
            # Best-effort boundary: report the failure and signal it with None.
            util.log(1, "[x] 语音转换失败!")
            util.log(1, "[x] 原因: " + str(e))
            return None

    def to_sample(self, text, style):
        """Convert text to speech.

        Note: ``style`` is currently only part of the cache key; neither
        backend applies it to the synthesized audio.

        :param text: text to speak
        :param style: speaking style / tone
        :returns: path of the audio (WAV) file, or None if synthesis failed
        """
        voice_name = self.__resolve_voice_name()
        cached = self.__get_history(voice_name, style, text)
        if cached is not None:
            return cached
        if self.ms_tts:
            wav_url = self.__synthesize_azure(text)
        else:
            wav_url = self.__synthesize_edge(text, voice_name)
        if wav_url is not None:
            self.__history_data[(voice_name, style, text)] = wav_url
        return wav_url
if __name__ == '__main__':
    # Manual smoke test: load the configuration, synthesize one sentence
    # and print the resulting audio file path.
    cfg.load_config()
    sp = Speech()
    try:
        sp.connect()
        text = "我叫Fay,我今年18岁很年青。"
        s = sp.to_sample(text, "cheerful")
        print(s)
    finally:
        # Release the connection even if connecting or synthesis raised.
        sp.close()