diff --git a/README.md b/README.md
index 30ec312..3f6e634 100644
--- a/README.md
+++ b/README.md
@@ -37,9 +37,24 @@ cargo install --path .
 ```
 
 ### 3. Run Capski
+Basic transcription (auto-detects language):
 ```bash
 capski --input "example/input_audio.wav"
 ```
+You can also translate your non-English audio to English:
+```bash
+capski --input "japanese_audio.wav" --translate
+```
+You can also explicitly set the source language when translating to English:
+```bash
+capski --input "french_audio.wav" --language fr --translate
+```
+
+> [!NOTE]
+> Capski uses Whisper to transcribe audio.
+> If you want to translate non-English speech into English subtitles, use the `--translate` flag along with the `--language` option to specify the source language (e.g., `fr` for French, `es` for Spanish).
+>
+> 📌 Whisper only supports translation into English. Translating English into other languages is not supported.
 
 This runs the pipeline end-to-end:
 - extracts or processes audio,
diff --git a/example/output_translated_sub.mp4 b/example/output_translated_sub.mp4
new file mode 100644
index 0000000..6f0d1ed
Binary files /dev/null and b/example/output_translated_sub.mp4 differ
diff --git a/src/audio/mod.rs b/src/audio/mod.rs
index f1186cb..8055a5b 100644
--- a/src/audio/mod.rs
+++ b/src/audio/mod.rs
@@ -8,7 +8,12 @@ use crate::types::Segment;
 use anyhow::Result;
 
 pub trait Capski {
-    fn transcribe(model_path: &str, audio_path: &str) -> Result<Vec<Segment>>;
+    fn transcribe(
+        model_path: &str,
+        audio_path: &str,
+        translate: bool,
+        language: &Option<String>,
+    ) -> Result<Vec<Segment>>;
 }
 
 pub trait Extractor {
diff --git a/src/audio/whisper.rs b/src/audio/whisper.rs
index ad58aa0..7750229 100644
--- a/src/audio/whisper.rs
+++ b/src/audio/whisper.rs
@@ -7,11 +7,130 @@ use whisper_rs::{FullParams, SamplingStrategy, WhisperContext, WhisperContextParam
 
 pub struct WhisperCapski;
 
+const LANGUAGES: &[(&str, &str)] = &[
+    ("en", "english"),
+    ("zh", "chinese"),
+    ("de", "german"),
+    ("es", "spanish"),
+    ("ru", "russian"),
+    ("ko", "korean"),
+    ("fr", "french"),
+    ("ja", "japanese"),
+    ("pt", "portuguese"),
+    ("tr", "turkish"),
+    ("pl", "polish"),
+    ("ca", "catalan"),
+    ("nl", "dutch"),
+    ("ar", "arabic"),
+    ("sv", "swedish"),
+    ("it", "italian"),
+    ("id", "indonesian"),
+    ("hi", "hindi"),
+    ("fi", "finnish"),
+    ("vi", "vietnamese"),
+    ("he", "hebrew"),
+    ("uk", "ukrainian"),
+    ("el", "greek"),
+    ("ms", "malay"),
+    ("cs", "czech"),
+    ("ro", "romanian"),
+    ("da", "danish"),
+    ("hu", "hungarian"),
+    ("ta", "tamil"),
+    ("no", "norwegian"),
+    ("th", "thai"),
+    ("ur", "urdu"),
+    ("hr", "croatian"),
+    ("bg", "bulgarian"),
+    ("lt", "lithuanian"),
+    ("la", "latin"),
+    ("mi", "maori"),
+    ("ml", "malayalam"),
+    ("cy", "welsh"),
+    ("sk", "slovak"),
+    ("te", "telugu"),
+    ("fa", "persian"),
+    ("lv", "latvian"),
+    ("bn", "bengali"),
+    ("sr", "serbian"),
+    ("az", "azerbaijani"),
+    ("sl", "slovenian"),
+    ("kn", "kannada"),
+    ("et", "estonian"),
+    ("mk", "macedonian"),
+    ("br", "breton"),
+    ("eu", "basque"),
+    ("is", "icelandic"),
+    ("hy", "armenian"),
+    ("ne", "nepali"),
+    ("mn", "mongolian"),
+    ("bs", "bosnian"),
+    ("kk", "kazakh"),
+    ("sq", "albanian"),
+    ("sw", "swahili"),
+    ("gl", "galician"),
+    ("mr", "marathi"),
+    ("pa", "punjabi"),
+    ("si", "sinhala"),
+    ("km", "khmer"),
+    ("sn", "shona"),
+    ("yo", "yoruba"),
+    ("so", "somali"),
+    ("af", "afrikaans"),
+    ("oc", "occitan"),
+    ("ka", "georgian"),
+    ("be", "belarusian"),
+    ("tg", "tajik"),
+    ("sd", "sindhi"),
+    ("gu", "gujarati"),
+    ("am", "amharic"),
+    ("yi", "yiddish"),
+    ("lo", "lao"),
+    ("uz", "uzbek"),
+    ("fo", "faroese"),
("ht", "haitian creole"), + ("ps", "pashto"), + ("tk", "turkmen"), + ("nn", "nynorsk"), + ("mt", "maltese"), + ("sa", "sanskrit"), + ("lb", "luxembourgish"), + ("my", "myanmar"), + ("bo", "tibetan"), + ("tl", "tagalog"), + ("mg", "malagasy"), + ("as", "assamese"), + ("tt", "tatar"), + ("haw", "hawaiian"), + ("ln", "lingala"), + ("ha", "hausa"), + ("ba", "bashkir"), + ("jw", "javanese"), + ("su", "sundanese"), +]; + impl Capski for WhisperCapski { // Function to transcribe audio using the Whisper model - fn transcribe(model_path: &str, audio_path: &str) -> Result> { + fn transcribe( + model_path: &str, + audio_path: &str, + translate: bool, + language: &Option, + ) -> Result> { info!("Transcribing with Whisper..."); + let language_code = language + .as_ref() + .map(|s| s.to_lowercase()) + .as_ref() + .and_then(|lang| { + LANGUAGES + .iter() + .find(|&&(code, name)| code == lang || name == lang) + .map(|&(code, _)| code) + }) + .unwrap_or("auto"); + let reader = hound::WavReader::open(audio_path) .with_context(|| format!("failed to open audio file: {}", audio_path))?; @@ -24,8 +143,8 @@ impl Capski for WhisperCapski { // Set up parameters for the Whisper model let mut params = FullParams::new(SamplingStrategy::Greedy { best_of: 0 }); - params.set_translate(false); - params.set_language(Some("auto")); + params.set_translate(translate); + params.set_language(Some(language_code)); params.set_print_special(false); params.set_print_progress(false); params.set_print_realtime(false); diff --git a/src/cli.rs b/src/cli.rs index 899dfdc..1cf3a96 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -3,9 +3,9 @@ use clap::Parser; #[derive(Parser, Debug)] #[command( name = "Capski", - version = "0.1.0", + version = "0.2.0", author = "Chris Dedman", - about = "Create karaoke-style videos from audio or video", + about = "Create karaoke-style videos from audio or video.", disable_help_flag = false, disable_version_flag = false )] @@ -15,4 +15,18 @@ pub struct Opts { #[arg(short, long, default_value = "output.mp4")] pub output: String, + + #[arg( + long, + default_value_t = false, + help = "Translate from the source language to English." + )] + pub translate: bool, + + #[arg( + long, + default_value = "auto", + help = "Specify the source language ('fr', 'es', etc). Defaults to 'auto'." + )] + pub language: String, } diff --git a/src/main.rs b/src/main.rs index feca465..f6aec34 100644 --- a/src/main.rs +++ b/src/main.rs @@ -16,6 +16,8 @@ fn main() -> Result<()> { output: opts.output, model_path: "model/ggml-tiny.bin".to_string(), style, + translate: opts.translate, + language: Some(opts.language), }; app.run() diff --git a/src/pipeline.rs b/src/pipeline.rs index d12a2b9..3f12c9a 100644 --- a/src/pipeline.rs +++ b/src/pipeline.rs @@ -9,6 +9,8 @@ use std::path::Path; pub struct CapskiApp { pub input: String, pub output: String, + pub translate: bool, + pub language: Option, pub model_path: String, pub style: StyleConfig, } @@ -27,7 +29,12 @@ impl CapskiApp { let subtitle_path = build_dir.join(format!("{}.ass", base)); FfmpegExtractor::extract(&self.input, audio_path.to_str().unwrap())?; - let segments = WhisperCapski::transcribe(&self.model_path, audio_path.to_str().unwrap())?; + let segments = WhisperCapski::transcribe( + &self.model_path, + audio_path.to_str().unwrap(), + self.translate, + &self.language, + )?; SubtitleGenerator::generate(segments, subtitle_path.to_str().unwrap(), &self.style)?; SubtitleGenerator::burn(