Skip to content

Commit 39ff210

Browse files
committed
Make vocoder and audiotokenizer optional [no ci]
1 parent ac98513 commit 39ff210

File tree

2 files changed

+35
-8
lines changed

2 files changed

+35
-8
lines changed

tools/liquid-audio/runner.cpp

Lines changed: 33 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -71,8 +71,16 @@ struct audio_context {
7171
mparams.use_gpu = params.mmproj_use_gpu;
7272
mparams.print_timings = true;
7373
mparams.n_threads = params.cpuparams.n_threads;
74-
mparams.vocoder_path = params.vocoder.model.path.c_str();
75-
mparams.tokenizer_path = params.vocoder.speaker_file.c_str();
74+
const bool has_vocoder = !params.vocoder.model.path.empty();
75+
const bool has_detokenizer = !params.vocoder.speaker_file.empty();
76+
const bool enable_audio_output = has_vocoder && has_detokenizer;
77+
if (enable_audio_output) {
78+
mparams.vocoder_path = params.vocoder.model.path.c_str();
79+
mparams.tokenizer_path = params.vocoder.speaker_file.c_str();
80+
} else if (has_vocoder || has_detokenizer) {
81+
LOG_WRN("%s: audio output disabled: both -mv (vocoder) and --tts-speaker-file (audio detokenizer) are required\n",
82+
__func__);
83+
}
7684
mtmd_ctx_audio.reset(mtmd_init_from_file(clip_path, model, mparams));
7785
if (!mtmd_ctx_audio.get()) {
7886
LOG_ERR("Failed to load audio model from %s\n", clip_path);
@@ -96,8 +104,23 @@ class Runner::RunnerImpl {
96104
const text_callback_t & text_callback,
97105
const audio_callback_t & audio_callback,
98106
const std::vector<mtmd_output_modality> & modalities) {
99-
mtmd_set_output_modalities(ctx.mtmd_ctx_audio.get(), modalities.data(), modalities.size());
100-
mtmd_audio_output_start_new_turn(ctx.mtmd_ctx_audio.get());
107+
const bool audio_output_supported = mtmd_support_audio_output(ctx.mtmd_ctx_audio.get());
108+
if (audio_output_supported) {
109+
mtmd_set_output_modalities(ctx.mtmd_ctx_audio.get(), modalities.data(), modalities.size());
110+
mtmd_audio_output_start_new_turn(ctx.mtmd_ctx_audio.get());
111+
} else {
112+
bool requested_audio = false;
113+
for (const auto modality : modalities) {
114+
if (modality == MTMD_OUTPUT_MODALITY_AUDIO) {
115+
requested_audio = true;
116+
break;
117+
}
118+
}
119+
if (requested_audio) {
120+
LOG_WRN("%s: requested audio output, but vocoder/audio detokenizer are not available; falling back to text-only output\n",
121+
__func__);
122+
}
123+
}
101124

102125
std::vector<common_chat_msg> msgs;
103126
for (const auto & message : messages) {
@@ -155,7 +178,6 @@ class Runner::RunnerImpl {
155178
for (const auto & [p, desc] : {
156179
std::pair{ params.model.path, "-m" },
157180
std::pair{ params.mmproj.path, "--mmproj" },
158-
std::pair{ params.vocoder.model.path, "-mv" },
159181
}) {
160182
if (p.empty()) {
161183
LOG_ERR("ERR: Missing %s argument\n", desc);
@@ -202,7 +224,12 @@ class Runner::RunnerImpl {
202224
ctx.n_past = 0;
203225
}
204226

205-
int get_output_sample_rate() const { return mtmd_audio_output_get_sample_rate(ctx.mtmd_ctx_audio.get()); }
227+
int get_output_sample_rate() const {
228+
if (!mtmd_support_audio_output(ctx.mtmd_ctx_audio.get())) {
229+
return 0;
230+
}
231+
return mtmd_audio_output_get_sample_rate(ctx.mtmd_ctx_audio.get());
232+
}
206233

207234
private:
208235
audio_context ctx;

tools/liquid-audio/server.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -47,8 +47,8 @@ static void sigint_handler(int signo) {
4747

4848
static void show_additional_info(int /*argc*/, char ** argv) {
4949
LOG("CLI for LFM2.5-Audio-1.5B\n\n"
50-
"Usage: %s [options] -m <model.gguf> --mmproj <mmproj.gguf> -mv <vocoder.gguf> --tts-speaker-file "
51-
"<tokenizer.gguf>\n",
50+
"Usage: %s [options] -m <model.gguf> --mmproj <mmproj.gguf> "
51+
"[-mv <vocoder.gguf> --tts-speaker-file <tokenizer.gguf>]\n",
5252
argv[0]);
5353
}
5454

0 commit comments

Comments
 (0)