-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathparakeet_cli.cpp
More file actions
251 lines (222 loc) · 10.4 KB
/
parakeet_cli.cpp
File metadata and controls
251 lines (222 loc) · 10.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
// Copyright (C) 2025 Eddy SDK
// SPDX-License-Identifier: Apache-2.0
#include "eddy/backends/openvino_backend.hpp"
#include "eddy/core/app_dir.hpp"
#include "eddy/models/parakeet-v2/parakeet.hpp"
#include "eddy/models/parakeet-v2/parakeet_openvino.hpp"
#include "eddy/utils/ensure_models.hpp"
#include "eddy/utils/audio_utils.hpp"
#include <chrono>
#include <filesystem>
#include <fstream>
#include <sstream>
#include <iostream>
#include <iomanip>
#include <string>
#include <vector>
void print_usage(const char* program_name) {
std::cout << "Usage: " << program_name << " <audio.wav> [options]\n\n";
std::cout << "Options:\n";
std::cout << " --model <model> Model version (default: parakeet-v2)\n";
std::cout << " Options: parakeet-v2, parakeet-v3\n";
std::cout << " --device <device> OpenVINO device (default: CPU)\n";
std::cout << " Options: CPU, AUTO\n";
std::cout << " --help Show this help message\n\n";
std::cout << "Requirements:\n";
std::cout << " - Audio must be 16kHz mono or stereo WAV file\n";
std::cout << " - Models will be loaded from cache or models/parakeet/\n\n";
std::cout << "Example:\n";
std::cout << " " << program_name << " test.wav\n";
std::cout << " " << program_name << " test.wav --model parakeet-v3 --device CPU\n";
}
int main(int argc, char* argv[]) {
// Force unbuffered output
std::cout.setf(std::ios::unitbuf);
std::cerr.setf(std::ios::unitbuf);
// Parse command line arguments
if (argc < 2) {
print_usage(argv[0]);
return 1;
}
std::string audio_file;
std::string device = "CPU";
std::string model_name = "parakeet-v2";
for (int i = 1; i < argc; i++) {
std::string arg = argv[i];
if (arg == "--help" || arg == "-h") {
print_usage(argv[0]);
return 0;
} else if (arg == "--model") {
if (i + 1 >= argc) {
std::cerr << "Error: --model requires an argument\n";
return 1;
}
model_name = argv[++i];
if (model_name != "parakeet-v2" && model_name != "parakeet-v3") {
std::cerr << "Error: Invalid model. Use 'parakeet-v2' or 'parakeet-v3'\n";
return 1;
}
} else if (arg == "--device") {
if (i + 1 >= argc) {
std::cerr << "Error: --device requires an argument\n";
return 1;
}
device = argv[++i];
} else {
// Assume it's the audio file
audio_file = arg;
}
}
if (audio_file.empty()) {
std::cerr << "Error: No audio file specified\n\n";
print_usage(argv[0]);
return 1;
}
std::cout << "=== Parakeet TDT Transcription CLI (" << model_name << ") ===\n\n";
try {
// Load audio file
std::cout << "Loading audio: " << audio_file << " ... ";
std::cout.flush();
auto audio_samples = eddy::audio::read_wav(audio_file);
std::cout << "[OK]\n";
std::cout << " Samples: " << audio_samples.size() << "\n";
std::cout << " Duration: " << std::fixed << std::setprecision(2)
<< (audio_samples.size() / 16000.0) << " seconds\n\n";
// Create OpenVINO backend
std::cout << "Initializing OpenVINO backend (" << device << ") ... ";
std::cout.flush();
// Set compiled model cache to the per-model cache dir
auto compiled_cache_dir = eddy::get_model_dir(model_name).string();
eddy::OpenVINOOptions ov_opts;
ov_opts.device = device;
ov_opts.cache_dir = compiled_cache_dir;
auto backend = std::make_shared<eddy::OpenVINOBackend>(ov_opts);
std::cout << "[OK]\n";
// Determine model directory: ensure cache has required files (centralized helper)
auto cache_model_dir = eddy::get_model_assets_dir(model_name);
std::filesystem::path model_dir;
std::string fetch_err;
if (!eddy::parakeet::check_models_available(cache_model_dir, &fetch_err)) {
if (!fetch_err.empty()) std::cout << "[INFO] " << fetch_err << "\n";
}
// Prefer cache if encoder xml exists (minimum signal of a complete set)
auto exists_nonempty = [](const std::filesystem::path& p) -> bool {
std::error_code ec;
auto size = std::filesystem::file_size(p, ec);
return !ec && size > 0;
};
if (exists_nonempty(cache_model_dir / "parakeet_encoder.xml")) {
model_dir = cache_model_dir;
std::cout << "Using cached models at: " << cache_model_dir.string() << "\n\n";
} else {
// Fallback: legacy Windows path (%LOCALAPPDATA%\eddy\cache\models\<name>\files)
#if defined(_WIN32)
auto legacy_dir = eddy::get_app_data_dir() / "cache" / "models" / model_name / "files";
if (exists_nonempty(legacy_dir / "parakeet_encoder.xml")) {
model_dir = legacy_dir;
std::cout << "Using legacy cached models at: " << legacy_dir.string() << "\n\n";
} else
#endif
{
model_dir = "models/parakeet";
std::cout << "Using local models at: " << model_dir.string() << "\n";
std::cout << "Note: Copy models to " << cache_model_dir.string() << " for user cache access\n\n";
}
}
// Configure model paths
eddy::parakeet::ModelPaths paths{
.preprocessor = {.path = (model_dir / "parakeet_melspectogram.xml").string()},
.encoder = {.path = (model_dir / "parakeet_encoder.xml").string()},
.decoder = {.path = (model_dir / "parakeet_decoder.xml").string()},
.joint = {.path = (model_dir / "parakeet_joint.xml").string()},
.tokenizer_json = (model_dir / "parakeet_vocab.json").string()
};
// Configure runtime (v2 uses blank_token_id=1024, v3 uses blank_token_id=8192)
int blank_token_id = (model_name == "parakeet-v3") ? 8192 : 1024;
eddy::parakeet::RuntimeConfig cfg{
.device = device,
.blank_token_id = blank_token_id,
.duration_bins = {0, 1, 2, 3, 4}
};
// Load models
std::cout << "Loading Parakeet models ... ";
std::cout.flush();
auto model = eddy::parakeet::make_openvino_parakeet(backend, paths, cfg);
std::cout << "[OK]\n";
// Warmup
std::cout << "Warming up model ... ";
std::cout.flush();
auto parakeet_model = std::static_pointer_cast<eddy::parakeet::OpenVINOParakeet>(model);
parakeet_model->warmup();
std::cout << "[OK]\n\n";
// Prepare audio segment
eddy::parakeet::AudioSegment segment;
segment.sample_rate = 16000;
segment.pcm = audio_samples;
// Run inference
std::cout << std::string(70, '=') << "\n";
std::cout << "TRANSCRIBING...\n";
std::cout << std::string(70, '=') << "\n\n";
eddy::parakeet::SegmentOptions options;
auto start = std::chrono::high_resolution_clock::now();
auto result = model->infer(segment, options);
auto end = std::chrono::high_resolution_clock::now();
// Calculate metrics
auto duration_ms = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
float audio_duration = audio_samples.size() / 16000.0f;
float rtfx = (duration_ms > 0) ? audio_duration / (duration_ms / 1000.0f) : 0.0f;
// Display results
std::cout << "Result:\n";
std::cout << std::string(70, '-') << "\n";
std::cout << result.text << "\n";
std::cout << std::string(70, '-') << "\n\n";
std::cout << "Metrics:\n";
std::cout << " Tokens: " << result.token_ids.size() << "\n";
std::cout << " Confidence: " << std::fixed << std::setprecision(1)
<< (result.overall_confidence * 100.0f) << "%\n";
std::cout << " Processing time: " << duration_ms << " ms\n";
std::cout << " Audio duration: " << std::fixed << std::setprecision(2)
<< audio_duration << " s\n";
std::cout << " Real-time factor: " << std::fixed << std::setprecision(1)
<< rtfx << "x\n\n";
// Show token timings (first 10 tokens as sample)
if (!result.token_timings.empty()) {
std::cout << "Token Timings (first 10):\n";
const size_t num_to_show = std::min(size_t(10), result.token_timings.size());
for (size_t i = 0; i < num_to_show; ++i) {
const auto& timing = result.token_timings[i];
// Convert frame_index to seconds (frame * 0.08)
float time_seconds = timing.frame_index * 0.08f;
std::cout << " " << std::setw(3) << i+1 << ". "
<< "t=" << std::fixed << std::setprecision(2) << std::setw(5) << time_seconds << "s "
<< "conf=" << std::setprecision(1) << std::setw(4) << (timing.confidence * 100.0f) << "% "
<< "token_id=" << timing.token_id << "\n";
}
if (result.token_timings.size() > num_to_show) {
std::cout << " ... and " << (result.token_timings.size() - num_to_show) << " more tokens\n";
}
std::cout << "\n";
}
// Performance assessment
if (rtfx >= 10.0f) {
std::cout << "✅ Performance: Excellent (>" << std::setprecision(0) << rtfx << "x real-time)\n";
} else if (rtfx >= 1.0f) {
std::cout << "✅ Performance: Good (processing faster than real-time)\n";
} else {
std::cout << "⚠️ Performance: Below real-time (consider optimizations)\n";
}
std::cout << "\n" << std::string(70, '=') << "\n";
std::cout << "SUCCESS\n";
std::cout << std::string(70, '=') << "\n";
return 0;
} catch (const std::exception& e) {
std::cerr << "\n[ERROR] " << e.what() << "\n\n";
std::cerr << "Troubleshooting:\n";
std::cerr << " 1. Ensure audio file is 16kHz WAV format\n";
std::cerr << " 2. Check models are in: " << eddy::get_model_assets_dir(model_name).string() << "\n";
std::cerr << " or in: models/parakeet/\n";
std::cerr << " 3. Verify OpenVINO runtime is properly installed\n";
std::cerr << " 4. Try --device CPU if AUTO fails\n";
return 1;
}
}