Skip to content

Commit 3f608d2

Browse files
author
LittleMouse
committed
[update] add asr, kws model config
1 parent bea45ab commit 3f608d2

11 files changed

+281
-44
lines changed

projects/llm_framework/main_asr/mode_sense-voice-small-10s-ax650.json

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
11
{
22
"mode": "sense-voice-small-10s-ax650",
33
"type": "asr",
4-
"homepage": "https://huggingface.co/yunyu1258/qwen2.5-0.5b-ha",
4+
"homepage": "https://huggingface.co/M5Stack/SenseVoiceSmall-axmodel",
55
"compile_flage": "pulsar2 build --input model-10-seconds.onnx --config config_sensevoice_main_u16.json --output_dir sensevoice-axmodel --output_name model-10-seconds.axmodel --target_hardware AX650 --compiler.check 0",
66
"pulsar_version": "5.0-patch1-fd447d0d",
77
"capabilities": [
@@ -21,7 +21,7 @@
2121
"mode_param": {
2222
"model_config.sense_voice.model": "model.axmodel",
2323
"model_config.tokens": "tokens.txt",
24-
"silero_vad.model": "silero_vad.onnx",
24+
"silero_vad.model": "silero_vad.ort",
2525
"model_config.provider": "axera",
2626
"silence_timeout": 2000,
2727
"awake_delay": 50
Lines changed: 70 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,70 @@
1+
{
2+
"mode": "sherpa-onnx-streaming-zipformer-small-bilingual-zh-en-2023-02-16-ax650",
3+
"type": "asr",
4+
"homepage": "https://huggingface.co/csukuangfj/k2fsa-zipformer-bilingual-zh-en-t",
5+
"capabilities": [
6+
"Automatic_Speech_Recognition",
7+
"Chinese",
8+
"English"
9+
],
10+
"input_type": [
11+
"sys.pcm",
12+
"sys.cap.0_0"
13+
],
14+
"output_type": [
15+
"asr.utf-8",
16+
"asr.bool"
17+
],
18+
"mode_param": {
19+
"model_config.transducer.encoder": "encoder.axmodel",
20+
"model_config.transducer.decoder": "decoder.axmodel",
21+
"model_config.transducer.joiner": "joiner.axmodel",
22+
"model_config.tokens": "tokens.txt",
23+
"feat_config.feature_dim": 80,
24+
"feat_config.sampling_rate": 16000,
25+
"endpoint_config.rule1.min_trailing_silence": 2.4,
26+
"endpoint_config.rule2.min_trailing_silence": 1.2,
27+
"endpoint_config.rule3.min_utterance_length": 30,
28+
"enable_endpoint": true,
29+
"awake_delay": 50,
30+
"model_config.provider_config.provider": "axera",
31+
"model_config.zipformer_meta.encoder_dims": [
32+
256,
33+
256,
34+
256,
35+
256,
36+
256
37+
],
38+
"model_config.zipformer_meta.attention_dims": [
39+
192,
40+
192,
41+
192,
42+
192,
43+
192
44+
],
45+
"model_config.zipformer_meta.num_encoder_layers": [
46+
2,
47+
2,
48+
2,
49+
2,
50+
2
51+
],
52+
"model_config.zipformer_meta.cnn_module_kernels": [
53+
31,
54+
31,
55+
31,
56+
31,
57+
31
58+
],
59+
"model_config.zipformer_meta.left_context_len": [
60+
192,
61+
96,
62+
48,
63+
24,
64+
96
65+
],
66+
"model_config.zipformer_meta.T": 103,
67+
"model_config.zipformer_meta.decode_chunk_len": 96,
68+
"model_config.zipformer_meta.context_size": 2
69+
}
70+
}

projects/llm_framework/main_asr/mode_sherpa-onnx-zipformer-bilingual-zh-en-t.json renamed to projects/llm_framework/main_asr/mode_sherpa-onnx-streaming-zipformer-small-bilingual-zh-en-2023-02-16.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
{
2-
"mode": "sherpa-onnx-zipformer-bilingual-zh-en-t",
2+
"mode": "sherpa-onnx-streaming-zipformer-small-bilingual-zh-en-2023-02-16",
33
"type": "asr",
4-
"homepage": "",
4+
"homepage": "https://huggingface.co/csukuangfj/k2fsa-zipformer-bilingual-zh-en-t",
55
"capabilities": [
66
"Automatic_Speech_Recognition",
77
"Chinese",

projects/llm_framework/main_asr/src/main.cpp

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,7 @@ class llm_task {
7373
sherpa_onnx::OnlineRecognizerConfig onnx_online_config;
7474

7575
sherpa_onnx::VadModelConfig vad_config_;
76+
std::unique_ptr<sherpa_onnx::OfflineStream> offline_stream_;
7677
std::unique_ptr<sherpa_onnx::OfflineRecognizer> onnx_recognizer_;
7778
std::unique_ptr<sherpa_onnx::OnlineRecognizer> onnx_online_recognizer_;
7879
std::unique_ptr<sherpa_onnx::OnlineStream> online_stream;
@@ -549,16 +550,16 @@ class llm_task {
549550
vad_->AcceptWaveform(floatSamples.data(), floatSamples.size());
550551
while (!vad_->Empty()) {
551552
const auto &segment = vad_->Front();
552-
auto s = onnx_recognizer_->CreateStream();
553-
s->AcceptWaveform(onnx_asr_config_.feat_config.sampling_rate, segment.samples.data(),
554-
segment.samples.size());
555-
onnx_recognizer_->DecodeStream(s.get());
556-
const auto &result = s->GetResult();
553+
if (!offline_stream_) offline_stream_ = onnx_recognizer_->CreateStream();
554+
offline_stream_->AcceptWaveform(onnx_asr_config_.feat_config.sampling_rate, segment.samples.data(),
555+
segment.samples.size());
556+
onnx_recognizer_->DecodeStream(offline_stream_.get());
557+
const auto &result = offline_stream_->GetResult();
557558
if (!result.text.empty() && out_callback_) {
558-
SLOGI("onnx-asr result: %s", result.text.c_str());
559559
out_callback_(result.text, true);
560560
}
561561
vad_->Pop();
562+
offline_stream_.reset();
562563
}
563564

564565
{

projects/llm_framework/main_audio/SConstruct

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ REQUIREMENTS += ['ax_sys', 'ax_audio', 'ax_audio_3a', 'tinyalsa', 'fdk-aac', 'op
2929
if 'CONFIG_AX_620E_MSP_ENABLED' in os.environ:
3030
REQUIREMENTS += ['ax_interpreter', 'ax_fdk', 'ax_opus']
3131

32-
STATIC_FILES += [AFile('audio.json'), AFile('audio_kit.json')]
32+
STATIC_FILES += [AFile('audio.json'), AFile('audio_kit.json'), AFile('audio_pyramid.json')]
3333
STATIC_FILES += Glob('mode_*.json')
3434

3535
env['COMPONENTS'].append({'target':'llm_audio-1.7',
Lines changed: 115 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,115 @@
1+
{
2+
"mode": "None",
3+
"type": "audio",
4+
"capabilities": [
5+
"play",
6+
"cap"
7+
],
8+
"input_type": [
9+
"rpc.audio.wav.base64",
10+
"rpc.audio.pcm.base64"
11+
],
12+
"output_type": [
13+
"audio.pcm.stream"
14+
],
15+
"play_param": {
16+
"card": 0,
17+
"device": 0,
18+
"volume": 0.5,
19+
"channel": 2,
20+
"rate": 48000,
21+
"bit": 16,
22+
"stPoolConfig.MetaSize": 8192,
23+
"stPoolConfig.BlkSize": 32768,
24+
"stPoolConfig.BlkCnt": 37,
25+
"stPoolConfig.IsMergeMode": 0,
26+
"stPoolConfig.CacheMode": 0,
27+
"stPoolConfig.PartitionName": "anonymous",
28+
"stAttr.enBitwidth": 1,
29+
"stAttr.enSoundmode": 0,
30+
"stAttr.u32ChnCnt": 2,
31+
"stAttr.enLinkMode": 0,
32+
"stAttr.enSamplerate": 16000,
33+
"stAttr.U32Depth": 30,
34+
"stAttr.u32PeriodSize": 160,
35+
"stAttr.u32PeriodCount": 8,
36+
"stAttr.bInsertSilence": 0,
37+
"stVqeAttr.s32SampleRate": 16000,
38+
"stVqeAttr.u32FrameSamples": 160,
39+
"stVqeAttr.stNsCfg.bNsEnable": 0,
40+
"stVqeAttr.stNsCfg.enAggressivenessLevel": 2,
41+
"stVqeAttr.stAgcCfg.bAgcEnable": 0,
42+
"stVqeAttr.stAgcCfg.enAgcMode": 2,
43+
"stVqeAttr.stAgcCfg.s16TargetLevel": -3,
44+
"stVqeAttr.stAgcCfg.s16Gain": 9,
45+
"stHpfAttr.bEnable": 0,
46+
"stHpfAttr.s32GainDb": -3,
47+
"stHpfAttr.s32Samplerate": 16000,
48+
"stHpfAttr.s32Freq": 200,
49+
"stLpfAttr.bEnable": 0,
50+
"stLpfAttr.s32GainDb": 0,
51+
"stLpfAttr.s32Samplerate": 16000,
52+
"stLpfAttr.s32Freq": 3000,
53+
"stEqAttr.bEnable": 0,
54+
"stEqAttr.s32GainDb[0]": -10,
55+
"stEqAttr.s32GainDb[1]": -3,
56+
"stEqAttr.s32GainDb[2]": 3,
57+
"stEqAttr.s32GainDb[3]": 5,
58+
"stEqAttr.s32GainDb[4]": 10,
59+
"stEqAttr.s32Samplerate": 16000,
60+
"gResample": 0,
61+
"enInSampleRate": 16000,
62+
"gInstant": 0,
63+
"gInsertSilence": 0
64+
},
65+
"cap_param": {
66+
"sys_pcm_cap_channel": "ipc:///tmp/llm/pcm.cap.socket",
67+
"card": 0,
68+
"device": 1,
69+
"volume": 1.0,
70+
"channel": 4,
71+
"rate": 16000,
72+
"bit": 16,
73+
"stPoolConfig.MetaSize": 8192,
74+
"stPoolConfig.BlkSize": 7680,
75+
"stPoolConfig.BlkCnt": 33,
76+
"stPoolConfig.IsMergeMode": 0,
77+
"stPoolConfig.CacheMode": 0,
78+
"stPoolConfig.PartitionName": "anonymous",
79+
"aistAttr.enBitwidth": 1,
80+
"aistAttr.enLinkMode": 0,
81+
"aistAttr.enSamplerate": 16000,
82+
"aistAttr.enLayoutMode": 2,
83+
"aistAttr.U32Depth": 30,
84+
"aistAttr.u32PeriodSize": 160,
85+
"aistAttr.u32PeriodCount": 8,
86+
"aistAttr.u32ChnCnt": 2,
87+
"aistVqeAttr.s32SampleRate": 16000,
88+
"aistVqeAttr.u32FrameSamples": 160,
89+
"aistVqeAttr.stNsCfg.bNsEnable": 1,
90+
"aistVqeAttr.stNsCfg.enAggressivenessLevel": 2,
91+
"aistVqeAttr.stAgcCfg.bAgcEnable": 0,
92+
"aistVqeAttr.stAgcCfg.enAgcMode": 2,
93+
"aistVqeAttr.stAgcCfg.s16TargetLevel": -3,
94+
"aistVqeAttr.stAgcCfg.s16Gain": 9,
95+
"aistVqeAttr.stAecCfg.enAecMode": 2,
96+
"stHpfAttr.bEnable": 0,
97+
"stHpfAttr.s32GainDb": -3,
98+
"stHpfAttr.s32Samplerate": 16000,
99+
"stHpfAttr.s32Freq": 200,
100+
"stLpfAttr.bEnable": 0,
101+
"stLpfAttr.s32GainDb": 0,
102+
"stLpfAttr.s32Samplerate": 16000,
103+
"stLpfAttr.s32Freq": 3000,
104+
"stEqAttr.bEnable": 0,
105+
"stEqAttr.s32GainDb[0]": -10,
106+
"stEqAttr.s32GainDb[1]": -3,
107+
"stEqAttr.s32GainDb[2]": 3,
108+
"stEqAttr.s32GainDb[3]": 5,
109+
"stEqAttr.s32GainDb[4]": 10,
110+
"stEqAttr.s32Samplerate": 16000,
111+
"gResample": 0,
112+
"enOutSampleRate": 16000,
113+
"gDbDetection": 0
114+
}
115+
}

projects/llm_framework/main_audio/src/main.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -209,7 +209,7 @@ class llm_audio : public StackFlow {
209209
}
210210
#else
211211
std::list<std::string> config_file_paths =
212-
get_config_file_paths(base_model_path, base_model_config_path, "audio");
212+
get_config_file_paths(base_model_path, base_model_config_path, "audio_pyramid");
213213
#endif
214214
try {
215215
config_body = nlohmann::json::parse(data);
Lines changed: 41 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,41 @@
1+
{
2+
"mode": "kws-ax650",
3+
"type": "kws",
4+
"homepage": "",
5+
"capabilities": [
6+
"Keyword_spotting",
7+
"English"
8+
],
9+
"input_type": [
10+
"sys.pcm",
11+
"sys.cap.0_0"
12+
],
13+
"output_type": [
14+
"kws.bool"
15+
],
16+
"mode_param": {
17+
"model": "kws.axmodel",
18+
"model_type": "axera",
19+
"wake_wav_file": "/opt/m5stack/data/audio/wakeup_zh_cn.wav",
20+
"chunk_size": 32,
21+
"threshold": 0.9,
22+
"min_continuous_frames": 5,
23+
"REFRACTORY_TIME_MS": 2000,
24+
"RESAMPLE_RATE": 16000,
25+
"FEAT_DIM": 80,
26+
"frame_opts.samp_freq": 16000,
27+
"frame_opts.frame_length_ms": 25.0,
28+
"frame_opts.frame_shift_ms": 10.0,
29+
"frame_opts.snip_edges": false,
30+
"frame_opts.dither": 0.0,
31+
"frame_opts.preemph_coeff": 0.97,
32+
"frame_opts.remove_dc_offset": true,
33+
"frame_opts.window_type": "povey",
34+
"mel_opts.num_bins": 80,
35+
"mel_opts.low_freq": 20,
36+
"mel_opts.high_freq": 0,
37+
"energy_floor": 0.0,
38+
"use_energy": false,
39+
"raw_energy": true
40+
}
41+
}

projects/llm_framework/main_kws/mode_kws.json

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
],
1616
"mode_param": {
1717
"model": "kws.axmodel",
18-
"model_type": "onnx",
18+
"model_type": "axera",
1919
"wake_wav_file": "/opt/m5stack/data/audio/wakeup_zh_cn.wav",
2020
"chunk_size": 32,
2121
"threshold": 0.9,
@@ -37,6 +37,5 @@
3737
"energy_floor": 0.0,
3838
"use_energy": false,
3939
"raw_energy": true
40-
},
41-
"mode_param_bak": {}
40+
}
4241
}

0 commit comments

Comments
 (0)