forked from TheHolyOneZ/Text_to_speech
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathSpeech.py
285 lines (228 loc) · 9.87 KB
/
Speech.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
import customtkinter as ctk
import pyttsx3
import sounddevice as sd
import numpy as np
import wave
from tkinter import filedialog, simpledialog, messagebox
from pydub import AudioSegment
from pydub.playback import play
from pydub import effects
import os
import subprocess
import platform
import urllib.request
import zipfile
import shutil
# Initialize pyttsx3 for speech synthesis.
# A single module-level engine is created at import time and shared by every
# function in this file that lists voices, sets properties, or renders speech.
tts_engine = pyttsx3.init()
# Check whether a working FFmpeg executable is reachable through PATH
def check_ffmpeg():
    """Return True if ``ffmpeg -version`` can be launched and exits cleanly.

    Returns:
        bool: False when the executable cannot be started at all (missing,
        not executable, broken PATH entry) or when it exits with a non-zero
        status; True otherwise.
    """
    try:
        result = subprocess.run(
            ["ffmpeg", "-version"],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            check=False,
        )
    except OSError:
        # FileNotFoundError is the common case, but PermissionError and
        # friends equally mean "no usable ffmpeg here" and must not crash
        # the caller.
        return False
    return result.returncode == 0
def install_ffmpeg():
    """Download FFmpeg on Windows and add it to this process's PATH.

    On Windows the release-essentials archive is downloaded into the current
    working directory, unpacked into ``./ffmpeg``, and the folder containing
    ``ffmpeg.exe`` is appended to ``os.environ["PATH"]`` (this affects the
    running process only, not the system setting). On any other platform the
    user is told to install FFmpeg manually.

    All outcomes are reported through message boxes; nothing is returned.
    """
    if platform.system() != "Windows":
        messagebox.showinfo("FFmpeg Installation", "Please install FFmpeg manually. Visit https://ffmpeg.org/download.html")
        return

    # NOTE(review): the archive is fetched and later executed without any
    # checksum/signature verification - consider pinning a version and hash.
    url = "https://www.gyan.dev/ffmpeg/builds/ffmpeg-release-essentials.zip"
    zip_path = "ffmpeg.zip"
    extract_path = os.path.join(os.getcwd(), "ffmpeg")

    try:
        # Download the FFmpeg zip file, then extract it
        urllib.request.urlretrieve(url, zip_path)
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(extract_path)
    except (OSError, zipfile.BadZipFile) as exc:
        # urllib's URLError derives from OSError, so network failures, disk
        # errors and corrupt archives all end up here instead of crashing.
        messagebox.showerror("FFmpeg Installation Error", f"Could not download or extract FFmpeg: {exc}")
        return
    finally:
        # Remove the (possibly partial) zip file whether or not we succeeded
        if os.path.exists(zip_path):
            os.remove(zip_path)

    # Find the directory that holds ffmpeg.exe inside the extracted tree
    ffmpeg_bin_dir = next(
        (root for root, _dirs, files in os.walk(extract_path) if 'ffmpeg.exe' in files),
        None,
    )
    if ffmpeg_bin_dir is None:
        messagebox.showerror("FFmpeg Installation Error", "ffmpeg.exe not found in extracted files.")
        return

    # Add FFmpeg to the PATH environment variable (tolerating an unset PATH)
    current_path = os.environ.get("PATH", "")
    os.environ["PATH"] = current_path + os.pathsep + ffmpeg_bin_dir if current_path else ffmpeg_bin_dir
    messagebox.showinfo("FFmpeg Installation", "FFmpeg installed and added to PATH successfully.")
# Function to get available voices and map them to user-friendly names
def get_available_voices():
    """Map display labels to indices in the engine's voice list.

    Each voice is labelled from its first language tag when it has one,
    otherwise from its name. Labels containing "female" become 'Woman',
    labels containing "male" become 'Man', anything else is the capitalised
    label itself. When several voices produce the same label the last one
    wins.

    Returns:
        dict[str, int]: label -> index into ``tts_engine.getProperty('voices')``.
    """
    voices = tts_engine.getProperty('voices')
    available_voice_map = {}
    for i, voice in enumerate(voices):
        label = voice.languages[0] if voice.languages else voice.name
        if isinstance(label, bytes):
            # NOTE(review): some pyttsx3 drivers appear to report language
            # tags as bytes; decode so the substring tests below cannot raise.
            label = label.decode("utf-8", errors="ignore")
        label = label.lower()
        # "female" must be tested first: "male" is a substring of "female",
        # so the reverse order would label every female voice as 'Man'.
        if "female" in label:
            available_voice_map['Woman'] = i
        elif "male" in label:
            available_voice_map['Man'] = i
        else:
            available_voice_map[label.capitalize()] = i
    return available_voice_map
# Apply voice, speaking rate and volume to the shared pyttsx3 engine
def set_tts_properties(voice_type, rate, volume):
    """Configure the engine before speech is rendered.

    Args:
        voice_type: Label as produced by ``get_available_voices``; an unknown
            label selects the first voice in the engine's list.
        rate: Value stored in the engine's 'rate' property.
        volume: Value stored in the engine's 'volume' property.
    """
    # Unknown labels resolve to index 0, i.e. the first voice in the list.
    chosen_index = get_available_voices().get(voice_type, 0)
    engine_voices = tts_engine.getProperty('voices')
    tts_engine.setProperty('voice', engine_voices[chosen_index].id)

    for prop_name, prop_value in (('rate', rate), ('volume', volume)):
        tts_engine.setProperty(prop_name, prop_value)
# Custom echo effect using pydub
def apply_echo(sound, delay_ms=500):
    """Overlay a delayed copy of ``sound`` on top of itself.

    Args:
        sound: Segment to process (a pydub ``AudioSegment``).
        delay_ms: Offset of the echoed copy from the start, in milliseconds.

    Returns:
        The result of ``sound.overlay``; the base segment's length is kept,
        so the part of the echo that would run past the end is dropped.
    """
    # pydub's overlay() takes the offset as ``position``; it has no ``delay``
    # keyword, so the previous call raised TypeError as soon as echo was used.
    return sound.overlay(sound, position=delay_ms)
# Custom reverb-like effect built from two filtered copies of the sound
def apply_reverb(sound):
    """Blend a high-passed and a low-passed copy of ``sound``.

    The 500 Hz high-pass copy is the base; the 2000 Hz low-pass copy is
    overlaid on it with ``gain_during_overlay=-6``. No convolution is
    involved - this is a simple filter-and-mix approximation.
    """
    high_band = effects.high_pass_filter(sound, 500)
    low_band = effects.low_pass_filter(sound, 2000)
    return high_band.overlay(low_band, gain_during_overlay=-6)
# Change pitch using pydub
def change_pitch(sound, pitch_factor):
    """Shift pitch by reinterpreting the samples at a scaled frame rate.

    Args:
        sound: Segment to process (a pydub ``AudioSegment``).
        pitch_factor: Exponent applied to 2 to scale the frame rate, so 0
            leaves the rate untouched, 1 doubles it and -1 halves it.

    Returns:
        The respawned segment converted back to a 44100 Hz frame rate.
    """
    scale = 2.0 ** pitch_factor
    shifted_rate = int(sound.frame_rate * scale)
    # Same raw bytes, different declared rate: playback speed and pitch move together.
    respawned = sound._spawn(sound.raw_data, overrides={'frame_rate': shifted_rate})
    return respawned.set_frame_rate(44100)
import tempfile # for temporary file management
import os
from pydub import AudioSegment
# Set the path to the ffmpeg executable and ffprobe explicitly.
# NOTE(review): both values are placeholders ("your\...") and must be replaced
# with real locations before the pydub calls in this file can work; as written
# they point at files that presumably do not exist.
ffmpeg_path = (r"your\ffmpeg.exe")
ffprobe_path = (r"your\ffprobe.exe")
# Assign FFmpeg and FFprobe paths for pydub.
# These assignments run at import time and set class-level attributes on
# AudioSegment, so they apply to every AudioSegment used afterwards.
AudioSegment.converter = ffmpeg_path
AudioSegment.ffprobe = ffprobe_path
def _render_speech_from_ui():
    """Render the text box contents to audio using the current UI settings.

    Reads the text, voice, rate, volume, pitch and effect widgets, has
    pyttsx3 write the speech to a temporary WAV file, loads it with pydub,
    then applies the pitch shift and any selected effects.

    Returns:
        The processed pydub ``AudioSegment``, or ``None`` when the text box
        is empty (a warning dialog is shown in that case).
    """
    text = text_entry.get("1.0", "end-1c")
    if not text.strip():
        # Nothing to synthesize; rendering an empty utterance would only
        # yield an unusable WAV file.
        messagebox.showwarning("Text to Speech", "Please enter some text first.")
        return None

    # Slider value 1.0 is neutral. The offset is the exponent handed to
    # change_pitch, so the slider's 0.5..2.0 range maps to -0.5..+1.0.
    pitch = pitch_slider.get() - 1.0
    apply_reverb_effect = reverb_var.get()
    apply_echo_effect = echo_var.get()

    # Set the text-to-speech properties
    set_tts_properties(selected_voice.get(), get_rate_value(), volume_slider.get())

    # Reserve a temp file name and close the handle right away so pyttsx3
    # can reopen the path itself (an open handle blocks that on Windows).
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_wav_file:
        temp_wav_path = temp_wav_file.name
    try:
        tts_engine.save_to_file(text, temp_wav_path)
        tts_engine.runAndWait()
        # Load the audio file using pydub
        sound = AudioSegment.from_wav(temp_wav_path)
    finally:
        # Always clean up, even when synthesis or loading fails.
        try:
            os.remove(temp_wav_path)
        except OSError:
            pass  # best-effort cleanup; must not mask the real error

    # Apply pitch change, then effects if selected
    processed = change_pitch(sound, pitch)
    if apply_reverb_effect:
        processed = apply_reverb(processed)
    if apply_echo_effect:
        processed = apply_echo(processed)
    return processed


def speak_text_with_pitch():
    """Synthesize the entered text with the chosen settings and play it."""
    sound_with_pitch = _render_speech_from_ui()
    if sound_with_pitch is None:
        return
    # Play the sound with pitch modification and effects
    play(sound_with_pitch)


def save_text_with_pitch():
    """Synthesize the entered text with the chosen settings and save it as MP3."""
    sound_with_pitch = _render_speech_from_ui()
    if sound_with_pitch is None:
        return
    # Ask the user where to save the modified audio; cancelling saves nothing
    file_path = filedialog.asksaveasfilename(defaultextension=".mp3", filetypes=[("MP3 files", "*.mp3")])
    if file_path:
        sound_with_pitch.export(file_path, format="mp3")
# Record voice and save to WAV file
def record_voice():
    """Record from the default input device and save a 16-bit stereo WAV.

    Asks for a duration (1-30 seconds) and a destination path; cancelling
    either dialog aborts without recording. Progress is printed to stdout.
    """
    duration = simpledialog.askfloat("Input", "Enter recording duration in seconds:", minvalue=1.0, maxvalue=30.0)
    if not duration:
        return
    file_path = filedialog.asksaveasfilename(defaultextension=".wav", filetypes=[("WAV files", "*.wav")])
    if not file_path:
        return

    fs = 44100  # Sample rate
    channels = 2  # Used for both capture and the WAV header so they cannot drift apart
    sample_width = 2  # Sample width in bytes (16-bit PCM)

    print(f"Recording for {duration} seconds...")
    recording = sd.rec(int(duration * fs), samplerate=fs, channels=channels)
    sd.wait()  # Wait for recording to finish
    print("Recording finished.")

    # Clip to [-1, 1] before scaling: a sample outside that range would
    # otherwise overflow int16 and wrap around into a loud click.
    pcm = (np.clip(recording, -1.0, 1.0) * 32767).astype(np.int16)

    # Save recording to a .wav file
    with wave.open(file_path, 'wb') as f:
        f.setnchannels(channels)
        f.setsampwidth(sample_width)
        f.setframerate(fs)
        f.writeframes(pcm.tobytes())
    print(f"Saved recording to {file_path}")
# GUI setup
# Create the root customtkinter window; every widget created later in this
# file is attached to it.
app = ctk.CTk()
app.title("Text to Speech with Dark/Light Mode")
app.geometry("600x800")  # initial window size, "<width>x<height>"
# Dark/Light mode toggle button
def toggle_mode():
    """Flip the customtkinter appearance mode between "Dark" and "Light".

    Any mode other than "Dark" is switched to "Dark".
    """
    currently_dark = ctk.get_appearance_mode() == "Dark"
    ctk.set_appearance_mode("Light" if currently_dark else "Dark")
# Button that switches the appearance mode via toggle_mode
mode_button = ctk.CTkButton(app, text="Toggle Dark/Light Mode", command=toggle_mode)
mode_button.pack(pady=10)
# Text input field
# The speak/save handlers read its full contents with get("1.0", "end-1c").
text_entry = ctk.CTkTextbox(app, width=500, height=100)
text_entry.pack(pady=20)
# Voice selection dropdown with multiple languages
available_voice_map = get_available_voices()
voice_labels = list(available_voice_map.keys())
# Prefer "Man" as the initial choice, but only when the engine actually offers
# it; otherwise start on the first available label so the menu never shows a
# value that is not one of its own options.
if "Man" in available_voice_map or not voice_labels:
    default_voice = "Man"
else:
    default_voice = voice_labels[0]
selected_voice = ctk.StringVar(value=default_voice)
voice_menu = ctk.CTkOptionMenu(app, variable=selected_voice, values=voice_labels)
voice_menu.pack(pady=10)
# Volume and pitch sliders
# The volume value (0.0-1.0) is handed unchanged to the engine's 'volume'
# property by the speak/save handlers.
volume_slider = ctk.CTkSlider(app, from_=0.0, to=1.0, number_of_steps=10, width=300)
volume_slider.set(1.0) # Default volume
volume_slider.pack(pady=10)
# The handlers subtract 1.0 from the pitch value and use the result as the
# exponent of 2 when rescaling the audio frame rate.
# NOTE(review): 10 steps across 0.5-2.0 is a 0.15 increment, so 1.0 is not an
# exact stop - confirm how CTkSlider rounds the default set below.
pitch_slider = ctk.CTkSlider(app, from_=0.5, to=2.0, number_of_steps=10, width=300)
pitch_slider.set(1.0) # Default pitch (1.0 is neutral)
pitch_slider.pack(pady=10)
# Rate selection dropdown (speed control)
# The label chosen here is translated to a numeric rate by get_rate_value.
selected_rate = ctk.StringVar(value="Normal")
rate_menu = ctk.CTkOptionMenu(app, variable=selected_rate, values=["Slow", "Normal", "Fast"])
rate_menu.pack(pady=10)
# Reverb and Echo effect options
# BooleanVar() is created without an initial value here; the speak/save
# handlers read both flags with .get() to decide which effects to apply.
reverb_var = ctk.BooleanVar()
echo_var = ctk.BooleanVar()
reverb_checkbox = ctk.CTkCheckBox(app, text="Reverb", variable=reverb_var)
echo_checkbox = ctk.CTkCheckBox(app, text="Echo", variable=echo_var)
reverb_checkbox.pack(pady=10)
echo_checkbox.pack(pady=10)
# Translate the rate dropdown's label into a numeric speaking rate
def get_rate_value():
    """Return the rate for the selected label.

    "Slow" -> 100, "Normal" -> 150, "Fast" -> 200; any other value falls
    back to 150.
    """
    rate_by_label = {"Slow": 100, "Normal": 150, "Fast": 200}
    return rate_by_label.get(selected_rate.get(), 150)  # 150 is the default rate
# Speak and save buttons
# Each button invokes its handler directly as the click command.
speak_button = ctk.CTkButton(app, text="Speak Text", command=speak_text_with_pitch)
speak_button.pack(pady=10)
save_button = ctk.CTkButton(app, text="Save as MP3", command=save_text_with_pitch)
save_button.pack(pady=10)
# Record voice button
record_button = ctk.CTkButton(app, text="Record Voice", command=record_voice)
record_button.pack(pady=20)
# Run the app
# NOTE(review): the event loop starts at module level with no
# `if __name__ == "__main__":` guard, so importing this file launches the GUI.
app.mainloop()