mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-10-31 08:51:55 +00:00 
			
		
		
		
	examples : add embd_to_audio to tts-outetts.py [no ci] (#11235)
This commit contains a suggestion for adding the missing embd_to_audio function from tts.cpp to tts-outetts.py. This introduces a depencency numpy which I was not sure if that is acceptable or not (only PyTorch was mentioned in referened PR). Also the README has been updated with instructions to run the example with llama-server and the python script. Refs: https://github.com/ggerganov/llama.cpp/pull/10784#issuecomment-2548377734
This commit is contained in:
		| @@ -78,3 +78,40 @@ play the audio: | ||||
| $ aplay output.wav | ||||
| ``` | ||||
|  | ||||
| ### Running the example with llama-server | ||||
| Running this example with `llama-server` is also possible and requires two | ||||
| server instances to be started. One will serve the LLM model and the other | ||||
| will serve the voice decoder model. | ||||
|  | ||||
| The LLM model server can be started with the following command: | ||||
| ```console | ||||
| $ ./build/bin/llama-server -m ./models/outetts-0.2-0.5B-q8_0.gguf --port 8020 | ||||
| ``` | ||||
|  | ||||
| And the voice decoder model server can be started using: | ||||
| ```console | ||||
| ./build/bin/llama-server -m ./models/wavtokenizer-large-75-f16.gguf --port 8021 --embeddings --pooling none | ||||
| ``` | ||||
|  | ||||
| Then we can run [tts-outetts.py](tts-outetts.py) to generate the audio. | ||||
|  | ||||
| First create a virtual environment for python and install the required | ||||
| dependencies (this in only required to be done once): | ||||
| ```console | ||||
| $ python3 -m venv venv | ||||
| $ source venv/bin/activate | ||||
| (venv) pip install requests numpy | ||||
| ``` | ||||
|  | ||||
| And then run the python script using: | ||||
| ```conole | ||||
| (venv) python ./examples/tts/tts-outetts.py http://localhost:8020 http://localhost:8021 "Hello world" | ||||
| spectrogram generated: n_codes: 90, n_embd: 1282 | ||||
| converting to audio ... | ||||
| audio generated: 28800 samples | ||||
| audio written to file "output.wav" | ||||
| ``` | ||||
| And to play the audio we can again use aplay or any other media player: | ||||
| ```console | ||||
| $ aplay output.wav | ||||
| ``` | ||||
|   | ||||
| @@ -3,6 +3,121 @@ import sys | ||||
| #import struct | ||||
| import requests | ||||
| import re | ||||
| import struct | ||||
| import numpy as np | ||||
| from concurrent.futures import ThreadPoolExecutor | ||||
|  | ||||
|  | ||||
| def fill_hann_window(size, periodic=True): | ||||
|     if periodic: | ||||
|         return np.hanning(size + 1)[:-1] | ||||
|     return np.hanning(size) | ||||
|  | ||||
|  | ||||
| def irfft(n_fft, complex_input): | ||||
|     return np.fft.irfft(complex_input, n=n_fft) | ||||
|  | ||||
|  | ||||
| def fold(buffer, n_out, n_win, n_hop, n_pad): | ||||
|     result = np.zeros(n_out) | ||||
|     n_frames = len(buffer) // n_win | ||||
|  | ||||
|     for i in range(n_frames): | ||||
|         start = i * n_hop | ||||
|         end = start + n_win | ||||
|         result[start:end] += buffer[i * n_win:(i + 1) * n_win] | ||||
|  | ||||
|     return result[n_pad:-n_pad] if n_pad > 0 else result | ||||
|  | ||||
|  | ||||
| def process_frame(args): | ||||
|     l, n_fft, ST, hann = args | ||||
|     frame = irfft(n_fft, ST[l]) | ||||
|     frame = frame * hann | ||||
|     hann2 = hann * hann | ||||
|     return frame, hann2 | ||||
|  | ||||
|  | ||||
| def embd_to_audio(embd, n_codes, n_embd, n_thread=4): | ||||
|     embd = np.asarray(embd, dtype=np.float32).reshape(n_codes, n_embd) | ||||
|  | ||||
|     n_fft = 1280 | ||||
|     n_hop = 320 | ||||
|     n_win = 1280 | ||||
|     n_pad = (n_win - n_hop) // 2 | ||||
|     n_out = (n_codes - 1) * n_hop + n_win | ||||
|  | ||||
|     hann = fill_hann_window(n_fft, True) | ||||
|  | ||||
|     E = np.zeros((n_embd, n_codes), dtype=np.float32) | ||||
|     for l in range(n_codes): | ||||
|         for k in range(n_embd): | ||||
|             E[k, l] = embd[l, k] | ||||
|  | ||||
|     half_embd = n_embd // 2 | ||||
|     S = np.zeros((n_codes, half_embd + 1), dtype=np.complex64) | ||||
|  | ||||
|     for k in range(half_embd): | ||||
|         for l in range(n_codes): | ||||
|             mag = E[k, l] | ||||
|             phi = E[k + half_embd, l] | ||||
|  | ||||
|             mag = np.clip(np.exp(mag), 0, 1e2) | ||||
|             S[l, k] = mag * np.exp(1j * phi) | ||||
|  | ||||
|     res = np.zeros(n_codes * n_fft) | ||||
|     hann2_buffer = np.zeros(n_codes * n_fft) | ||||
|  | ||||
|     with ThreadPoolExecutor(max_workers=n_thread) as executor: | ||||
|         args = [(l, n_fft, S, hann) for l in range(n_codes)] | ||||
|         results = list(executor.map(process_frame, args)) | ||||
|  | ||||
|         for l, (frame, hann2) in enumerate(results): | ||||
|             res[l*n_fft:(l+1)*n_fft] = frame | ||||
|             hann2_buffer[l*n_fft:(l+1)*n_fft] = hann2 | ||||
|  | ||||
|     audio = fold(res, n_out, n_win, n_hop, n_pad) | ||||
|     env = fold(hann2_buffer, n_out, n_win, n_hop, n_pad) | ||||
|  | ||||
|     mask = env > 1e-10 | ||||
|     audio[mask] /= env[mask] | ||||
|  | ||||
|     return audio | ||||
|  | ||||
|  | ||||
| def save_wav(filename, audio_data, sample_rate): | ||||
|     num_channels = 1 | ||||
|     bits_per_sample = 16 | ||||
|     bytes_per_sample = bits_per_sample // 8 | ||||
|     data_size = len(audio_data) * bytes_per_sample | ||||
|     byte_rate = sample_rate * num_channels * bytes_per_sample | ||||
|     block_align = num_channels * bytes_per_sample | ||||
|     chunk_size = 36 + data_size  # 36 = size of header minus first 8 bytes | ||||
|  | ||||
|     header = struct.pack( | ||||
|         '<4sI4s4sIHHIIHH4sI', | ||||
|         b'RIFF', | ||||
|         chunk_size, | ||||
|         b'WAVE', | ||||
|         b'fmt ', | ||||
|         16,                # fmt chunk size | ||||
|         1,                 # audio format (PCM) | ||||
|         num_channels, | ||||
|         sample_rate, | ||||
|         byte_rate, | ||||
|         block_align, | ||||
|         bits_per_sample, | ||||
|         b'data', | ||||
|         data_size | ||||
|     ) | ||||
|  | ||||
|     audio_data = np.clip(audio_data * 32767, -32768, 32767) | ||||
|     pcm_data = audio_data.astype(np.int16) | ||||
|  | ||||
|     with open(filename, 'wb') as f: | ||||
|         f.write(header) | ||||
|         f.write(pcm_data.tobytes()) | ||||
|  | ||||
|  | ||||
| def process_text(text: str): | ||||
|     text = re.sub(r'\d+(\.\d+)?', lambda x: x.group(), text.lower()) # TODO this needs to be fixed | ||||
| @@ -170,6 +285,15 @@ n_embd = len(embd[0]) | ||||
| print('spectrogram generated: n_codes: %d, n_embd: %d' % (n_codes, n_embd)) | ||||
|  | ||||
| # post-process the spectrogram to convert to audio | ||||
| # TODO: see the tts.cpp:embd_to_audio() and implement it in Python | ||||
| print('converting to audio ...') | ||||
| print('TODO: see the tts.cpp:embd_to_audio() and implement it in Python') | ||||
| audio = embd_to_audio(embd, n_codes, n_embd) | ||||
| print('audio generated: %d samples' % len(audio)) | ||||
|  | ||||
| filename = "output.wav" | ||||
| sample_rate = 24000 # sampling rate | ||||
|  | ||||
| # zero out first 0.25 seconds | ||||
| audio[:24000 // 4] = 0.0 | ||||
|  | ||||
| save_wav(filename, audio, sample_rate) | ||||
| print('audio written to file "%s"' % filename) | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Daniel Bevenius
					Daniel Bevenius