add voice clone
This commit is contained in:
parent
2e1a768d03
commit
83a9acb371
95
app.py
95
app.py
|
@ -22,10 +22,15 @@ from nerfreal import NeRFReal
|
||||||
import shutil
|
import shutil
|
||||||
import asyncio
|
import asyncio
|
||||||
import edge_tts
|
import edge_tts
|
||||||
|
from typing import Iterator
|
||||||
|
|
||||||
|
import requests
|
||||||
|
|
||||||
app = Flask(__name__)
|
app = Flask(__name__)
|
||||||
sockets = Sockets(app)
|
sockets = Sockets(app)
|
||||||
global nerfreal
|
global nerfreal
|
||||||
|
global tts_type
|
||||||
|
global gspeaker
|
||||||
|
|
||||||
|
|
||||||
async def main(voicename: str, text: str, render):
|
async def main(voicename: str, text: str, render):
|
||||||
|
@ -39,22 +44,63 @@ async def main(voicename: str, text: str, render):
|
||||||
elif chunk["type"] == "WordBoundary":
|
elif chunk["type"] == "WordBoundary":
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def llm_response(message):
|
def get_speaker(ref_audio,server_url):
|
||||||
from llm.LLM import LLM
|
files = {"wav_file": ("reference.wav", open(ref_audio, "rb"))}
|
||||||
# llm = LLM().init_model('Gemini', model_path= 'gemini-pro',api_key='Your API Key', proxy_url=None)
|
response = requests.post(f"{server_url}/clone_speaker", files=files)
|
||||||
llm = LLM().init_model('ChatGPT', model_path= 'gpt-3.5-turbo',api_key='Your API Key')
|
return response.json()
|
||||||
response = llm.chat(message)
|
|
||||||
print(response)
|
def xtts(text, speaker, language, server_url, stream_chunk_size) -> Iterator[bytes]:
|
||||||
return response
|
start = time.perf_counter()
|
||||||
|
speaker["text"] = text
|
||||||
|
speaker["language"] = language
|
||||||
|
speaker["stream_chunk_size"] = stream_chunk_size # you can reduce it to get faster response, but degrade quality
|
||||||
|
res = requests.post(
|
||||||
|
f"{server_url}/tts_stream",
|
||||||
|
json=speaker,
|
||||||
|
stream=True,
|
||||||
|
)
|
||||||
|
end = time.perf_counter()
|
||||||
|
print(f"xtts Time to make POST: {end-start}s")
|
||||||
|
|
||||||
|
if res.status_code != 200:
|
||||||
|
print("Error:", res.text)
|
||||||
|
return
|
||||||
|
|
||||||
|
first = True
|
||||||
|
for chunk in res.iter_content(chunk_size=960):
|
||||||
|
if first:
|
||||||
|
end = time.perf_counter()
|
||||||
|
print(f"xtts Time to first chunk: {end-start}s")
|
||||||
|
first = False
|
||||||
|
if chunk:
|
||||||
|
yield chunk
|
||||||
|
|
||||||
|
print("xtts response.elapsed:", res.elapsed)
|
||||||
|
|
||||||
|
def stream_xtts(audio_stream,render):
|
||||||
|
for chunk in audio_stream:
|
||||||
|
if chunk is not None:
|
||||||
|
render.push_audio(chunk)
|
||||||
|
|
||||||
def txt_to_audio(text_):
|
def txt_to_audio(text_):
|
||||||
audio_list = []
|
if tts_type == "edgetts":
|
||||||
#audio_path = 'data/audio/aud_0.wav'
|
|
||||||
voicename = "zh-CN-YunxiaNeural"
|
voicename = "zh-CN-YunxiaNeural"
|
||||||
text = text_
|
text = text_
|
||||||
t = time.time()
|
t = time.time()
|
||||||
asyncio.get_event_loop().run_until_complete(main(voicename,text,nerfreal))
|
asyncio.get_event_loop().run_until_complete(main(voicename,text,nerfreal))
|
||||||
print(f'-------tts time:{time.time()-t:.4f}s')
|
print(f'-------edge tts time:{time.time()-t:.4f}s')
|
||||||
|
else: #xtts
|
||||||
|
stream_xtts(
|
||||||
|
xtts(
|
||||||
|
text_,
|
||||||
|
gspeaker,
|
||||||
|
"zh-cn", #en args.language,
|
||||||
|
"http://localhost:9000", #args.server_url,
|
||||||
|
"20" #args.stream_chunk_size
|
||||||
|
),
|
||||||
|
nerfreal
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
@sockets.route('/humanecho')
|
@sockets.route('/humanecho')
|
||||||
def echo_socket(ws):
|
def echo_socket(ws):
|
||||||
|
@ -75,6 +121,15 @@ def echo_socket(ws):
|
||||||
else:
|
else:
|
||||||
txt_to_audio(message)
|
txt_to_audio(message)
|
||||||
|
|
||||||
|
|
||||||
|
def llm_response(message):
|
||||||
|
from llm.LLM import LLM
|
||||||
|
# llm = LLM().init_model('Gemini', model_path= 'gemini-pro',api_key='Your API Key', proxy_url=None)
|
||||||
|
llm = LLM().init_model('ChatGPT', model_path= 'gpt-3.5-turbo',api_key='Your API Key')
|
||||||
|
response = llm.chat(message)
|
||||||
|
print(response)
|
||||||
|
return response
|
||||||
|
|
||||||
@sockets.route('/humanchat')
|
@sockets.route('/humanchat')
|
||||||
def chat_socket(ws):
|
def chat_socket(ws):
|
||||||
# 获取WebSocket对象
|
# 获取WebSocket对象
|
||||||
|
@ -103,6 +158,7 @@ if __name__ == '__main__':
|
||||||
|
|
||||||
parser = argparse.ArgumentParser()
|
parser = argparse.ArgumentParser()
|
||||||
parser.add_argument('--pose', type=str, default="data/data_kf.json", help="transforms.json, pose source")
|
parser.add_argument('--pose', type=str, default="data/data_kf.json", help="transforms.json, pose source")
|
||||||
|
parser.add_argument('--au', type=str, default="data/au.csv", help="eye blink area")
|
||||||
|
|
||||||
parser.add_argument('-O', action='store_true', help="equals --fp16 --cuda_ray --exp_eye")
|
parser.add_argument('-O', action='store_true', help="equals --fp16 --cuda_ray --exp_eye")
|
||||||
|
|
||||||
|
@ -204,7 +260,18 @@ if __name__ == '__main__':
|
||||||
parser.add_argument('-m', type=int, default=50)
|
parser.add_argument('-m', type=int, default=50)
|
||||||
parser.add_argument('-r', type=int, default=10)
|
parser.add_argument('-r', type=int, default=10)
|
||||||
|
|
||||||
|
parser.add_argument('--tts', type=str, default='edgetts') #xtts
|
||||||
|
parser.add_argument('--ref_file', type=str, default=None)
|
||||||
|
parser.add_argument('--xtts_server', type=str, default='http://localhost:9000')
|
||||||
|
|
||||||
opt = parser.parse_args()
|
opt = parser.parse_args()
|
||||||
|
app.config.from_object(opt)
|
||||||
|
#print(app.config['xtts_server'])
|
||||||
|
|
||||||
|
tts_type = opt.tts
|
||||||
|
if tts_type == "xtts":
|
||||||
|
print("Computing the latents for a new reference...")
|
||||||
|
gspeaker = get_speaker(opt.ref_file, opt.xtts_server)
|
||||||
|
|
||||||
# assert test mode
|
# assert test mode
|
||||||
opt.test = True
|
opt.test = True
|
||||||
|
@ -212,17 +279,18 @@ if __name__ == '__main__':
|
||||||
#opt.train_camera =True
|
#opt.train_camera =True
|
||||||
# explicit smoothing
|
# explicit smoothing
|
||||||
opt.smooth_path = True
|
opt.smooth_path = True
|
||||||
opt.smooth_eye = True
|
|
||||||
opt.smooth_lips = True
|
opt.smooth_lips = True
|
||||||
|
|
||||||
assert opt.pose != '', 'Must provide a pose source'
|
assert opt.pose != '', 'Must provide a pose source'
|
||||||
|
|
||||||
# if opt.O:
|
# if opt.O:
|
||||||
opt.fp16 = True
|
opt.fp16 = True
|
||||||
opt.exp_eye = True
|
|
||||||
|
|
||||||
opt.cuda_ray = True
|
opt.cuda_ray = True
|
||||||
|
opt.exp_eye = True
|
||||||
|
opt.smooth_eye = True
|
||||||
|
|
||||||
opt.torso = True
|
opt.torso = True
|
||||||
|
|
||||||
# assert opt.cuda_ray, "Only support CUDA ray mode."
|
# assert opt.cuda_ray, "Only support CUDA ray mode."
|
||||||
opt.asr = True
|
opt.asr = True
|
||||||
|
|
||||||
|
@ -251,6 +319,7 @@ if __name__ == '__main__':
|
||||||
|
|
||||||
#############################################################################
|
#############################################################################
|
||||||
print('start websocket server')
|
print('start websocket server')
|
||||||
|
|
||||||
server = pywsgi.WSGIServer(('0.0.0.0', 8000), app, handler_class=WebSocketHandler)
|
server = pywsgi.WSGIServer(('0.0.0.0', 8000), app, handler_class=WebSocketHandler)
|
||||||
server.serve_forever()
|
server.serve_forever()
|
||||||
|
|
||||||
|
|
43
asrreal.py
43
asrreal.py
|
@ -377,7 +377,7 @@ class ASR:
|
||||||
print(f'[WARN] audio has {stream.shape[1]} channels, only use the first.')
|
print(f'[WARN] audio has {stream.shape[1]} channels, only use the first.')
|
||||||
stream = stream[:, 0]
|
stream = stream[:, 0]
|
||||||
|
|
||||||
if sample_rate != self.sample_rate:
|
if sample_rate != self.sample_rate and stream.shape[0]>0:
|
||||||
print(f'[WARN] audio sample rate is {sample_rate}, resampling into {self.sample_rate}.')
|
print(f'[WARN] audio sample rate is {sample_rate}, resampling into {self.sample_rate}.')
|
||||||
stream = resampy.resample(x=stream, sr_orig=sample_rate, sr_new=self.sample_rate)
|
stream = resampy.resample(x=stream, sr_orig=sample_rate, sr_new=self.sample_rate)
|
||||||
|
|
||||||
|
@ -385,31 +385,36 @@ class ASR:
|
||||||
|
|
||||||
def push_audio(self,buffer):
|
def push_audio(self,buffer):
|
||||||
print(f'[INFO] push_audio {len(buffer)}')
|
print(f'[INFO] push_audio {len(buffer)}')
|
||||||
# if len(buffer)>0:
|
if len(buffer)>0:
|
||||||
# byte_stream=BytesIO(buffer)
|
if self.opt.tts == "xtts":
|
||||||
# stream = self.create_bytes_stream(byte_stream)
|
stream = np.frombuffer(buffer, dtype=np.int16).astype(np.float32) / 32767
|
||||||
# streamlen = stream.shape[0]
|
#stream = buffer.astype(np.float32)
|
||||||
# idx=0
|
stream = resampy.resample(x=stream, sr_orig=24000, sr_new=self.sample_rate)
|
||||||
# while streamlen >= self.chunk:
|
else:
|
||||||
# self.queue.put(stream[idx:idx+self.chunk])
|
byte_stream=BytesIO(buffer)
|
||||||
# streamlen -= self.chunk
|
stream = self.create_bytes_stream(byte_stream)
|
||||||
# idx += self.chunk
|
|
||||||
# if streamlen>0:
|
|
||||||
# self.queue.put(stream[idx:])
|
|
||||||
self.input_stream.write(buffer)
|
|
||||||
if len(buffer)<=0:
|
|
||||||
self.input_stream.seek(0)
|
|
||||||
stream = self.create_bytes_stream(self.input_stream)
|
|
||||||
streamlen = stream.shape[0]
|
streamlen = stream.shape[0]
|
||||||
idx=0
|
idx=0
|
||||||
while streamlen >= self.chunk:
|
while streamlen >= self.chunk:
|
||||||
self.queue.put(stream[idx:idx+self.chunk])
|
self.queue.put(stream[idx:idx+self.chunk])
|
||||||
streamlen -= self.chunk
|
streamlen -= self.chunk
|
||||||
idx += self.chunk
|
idx += self.chunk
|
||||||
#if streamlen>0: #skip last frame(not 20ms)
|
# if streamlen>0: #skip last frame(not 20ms)
|
||||||
# self.queue.put(stream[idx:])
|
# self.queue.put(stream[idx:])
|
||||||
self.input_stream.seek(0)
|
# self.input_stream.write(buffer)
|
||||||
self.input_stream.truncate()
|
# if len(buffer)<=0:
|
||||||
|
# self.input_stream.seek(0)
|
||||||
|
# stream = self.create_bytes_stream(self.input_stream)
|
||||||
|
# streamlen = stream.shape[0]
|
||||||
|
# idx=0
|
||||||
|
# while streamlen >= self.chunk:
|
||||||
|
# self.queue.put(stream[idx:idx+self.chunk])
|
||||||
|
# streamlen -= self.chunk
|
||||||
|
# idx += self.chunk
|
||||||
|
# #if streamlen>0: #skip last frame(not 20ms)
|
||||||
|
# # self.queue.put(stream[idx:])
|
||||||
|
# self.input_stream.seek(0)
|
||||||
|
# self.input_stream.truncate()
|
||||||
|
|
||||||
def get_audio_out(self):
|
def get_audio_out(self):
|
||||||
return self.output_queue.get()
|
return self.output_queue.get()
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
Binary file not shown.
Loading…
Reference in New Issue