@@ -71,8 +71,16 @@ struct audio_context {
7171 mparams.use_gpu = params.mmproj_use_gpu ;
7272 mparams.print_timings = true ;
7373 mparams.n_threads = params.cpuparams .n_threads ;
74- mparams.vocoder_path = params.vocoder .model .path .c_str ();
75- mparams.tokenizer_path = params.vocoder .speaker_file .c_str ();
74+ const bool has_vocoder = !params.vocoder .model .path .empty ();
75+ const bool has_detokenizer = !params.vocoder .speaker_file .empty ();
76+ const bool enable_audio_output = has_vocoder && has_detokenizer;
77+ if (enable_audio_output) {
78+ mparams.vocoder_path = params.vocoder .model .path .c_str ();
79+ mparams.tokenizer_path = params.vocoder .speaker_file .c_str ();
80+ } else if (has_vocoder || has_detokenizer) {
81+ LOG_WRN (" %s: audio output disabled: both -mv (vocoder) and --tts-speaker-file (audio detokenizer) are required\n " ,
82+ __func__);
83+ }
7684 mtmd_ctx_audio.reset (mtmd_init_from_file (clip_path, model, mparams));
7785 if (!mtmd_ctx_audio.get ()) {
7886 LOG_ERR (" Failed to load audio model from %s\n " , clip_path);
@@ -96,8 +104,23 @@ class Runner::RunnerImpl {
96104 const text_callback_t & text_callback,
97105 const audio_callback_t & audio_callback,
98106 const std::vector<mtmd_output_modality> & modalities) {
99- mtmd_set_output_modalities (ctx.mtmd_ctx_audio .get (), modalities.data (), modalities.size ());
100- mtmd_audio_output_start_new_turn (ctx.mtmd_ctx_audio .get ());
107+ const bool audio_output_supported = mtmd_support_audio_output (ctx.mtmd_ctx_audio .get ());
108+ if (audio_output_supported) {
109+ mtmd_set_output_modalities (ctx.mtmd_ctx_audio .get (), modalities.data (), modalities.size ());
110+ mtmd_audio_output_start_new_turn (ctx.mtmd_ctx_audio .get ());
111+ } else {
112+ bool requested_audio = false ;
113+ for (const auto modality : modalities) {
114+ if (modality == MTMD_OUTPUT_MODALITY_AUDIO) {
115+ requested_audio = true ;
116+ break ;
117+ }
118+ }
119+ if (requested_audio) {
120+ LOG_WRN (" %s: requested audio output, but vocoder/audio detokenizer are not available; falling back to text-only output\n " ,
121+ __func__);
122+ }
123+ }
101124
102125 std::vector<common_chat_msg> msgs;
103126 for (const auto & message : messages) {
@@ -155,7 +178,6 @@ class Runner::RunnerImpl {
155178 for (const auto & [p, desc] : {
156179 std::pair{ params.model .path , " -m" },
157180 std::pair{ params.mmproj .path , " --mmproj" },
158- std::pair{ params.vocoder .model .path , " -mv" },
159181 }) {
160182 if (p.empty ()) {
161183 LOG_ERR (" ERR: Missing %s argument\n " , desc);
@@ -202,7 +224,12 @@ class Runner::RunnerImpl {
202224 ctx.n_past = 0 ;
203225 }
204226
205- int get_output_sample_rate () const { return mtmd_audio_output_get_sample_rate (ctx.mtmd_ctx_audio .get ()); }
227+ int get_output_sample_rate () const {
228+ if (!mtmd_support_audio_output (ctx.mtmd_ctx_audio .get ())) {
229+ return 0 ;
230+ }
231+ return mtmd_audio_output_get_sample_rate (ctx.mtmd_ctx_audio .get ());
232+ }
206233
207234 private:
208235 audio_context ctx;
0 commit comments