[update] add asr, kws model config

LittleMouse · LittleMouse · commit 3f608d2d6089 · 2025-12-19T11:59:50.000+08:00
diff --git a/projects/llm_framework/main_asr/mode_sense-voice-small-10s-ax650.json b/projects/llm_framework/main_asr/mode_sense-voice-small-10s-ax650.json
@@ -1,7 +1,7 @@
 {
     "mode": "sense-voice-small-10s-ax650",
     "type": "asr",
-    "homepage": "https://huggingface.co/yunyu1258/qwen2.5-0.5b-ha",
+    "homepage": "https://huggingface.co/M5Stack/SenseVoiceSmall-axmodel",
     "compile_flage": "pulsar2 build --input model-10-seconds.onnx --config config_sensevoice_main_u16.json --output_dir sensevoice-axmodel --output_name model-10-seconds.axmodel --target_hardware AX650 --compiler.check 0",
     "pulsar_version": "5.0-patch1-fd447d0d",
     "capabilities": [
@@ -21,7 +21,7 @@
     "mode_param": {
         "model_config.sense_voice.model": "model.axmodel",
         "model_config.tokens": "tokens.txt",
-        "silero_vad.model": "silero_vad.onnx",
+        "silero_vad.model": "silero_vad.ort",
         "model_config.provider": "axera",
         "silence_timeout": 2000,
         "awake_delay": 50
diff --git a/projects/llm_framework/main_asr/mode_sherpa-onnx-streaming-zipformer-small-bilingual-zh-en-2023-02-16-ax650.json b/projects/llm_framework/main_asr/mode_sherpa-onnx-streaming-zipformer-small-bilingual-zh-en-2023-02-16-ax650.json
@@ -0,0 +1,70 @@
+{
+    "mode": "sherpa-onnx-streaming-zipformer-small-bilingual-zh-en-2023-02-16-ax650",
+    "type": "asr",
+    "homepage": "https://huggingface.co/csukuangfj/k2fsa-zipformer-bilingual-zh-en-t",
+    "capabilities": [
+        "Automatic_Speech_Recognition",
+        "Chinese",
+        "English"
+    ],
+    "input_type": [
+        "sys.pcm",
+        "sys.cap.0_0"
+    ],
+    "output_type": [
+        "asr.utf-8",
+        "asr.bool"
+    ],
+    "mode_param": {
+        "model_config.transducer.encoder": "encoder.axmodel",
+        "model_config.transducer.decoder": "decoder.axmodel",
+        "model_config.transducer.joiner": "joiner.axmodel",
+        "model_config.tokens": "tokens.txt",
+        "feat_config.feature_dim": 80,
+        "feat_config.sampling_rate": 16000,
+        "endpoint_config.rule1.min_trailing_silence": 2.4,
+        "endpoint_config.rule2.min_trailing_silence": 1.2,
+        "endpoint_config.rule3.min_utterance_length": 30,
+        "enable_endpoint": true,
+        "awake_delay": 50,
+        "model_config.provider_config.provider": "axera",
+        "model_config.zipformer_meta.encoder_dims": [
+            256,
+            256,
+            256,
+            256,
+            256
+        ],
+        "model_config.zipformer_meta.attention_dims": [
+            192,
+            192,
+            192,
+            192,
+            192
+        ],
+        "model_config.zipformer_meta.num_encoder_layers": [
+            2,
+            2,
+            2,
+            2,
+            2
+        ],
+        "model_config.zipformer_meta.cnn_module_kernels": [
+            31,
+            31,
+            31,
+            31,
+            31
+        ],
+        "model_config.zipformer_meta.left_context_len": [
+            192,
+            96,
+            48,
+            24,
+            96
+        ],
+        "model_config.zipformer_meta.T": 103,
+        "model_config.zipformer_meta.decode_chunk_len": 96,
+        "model_config.zipformer_meta.context_size": 2
+    }
+}
diff --git a/projects/llm_framework/main_asr/mode_sherpa-onnx-streaming-zipformer-small-bilingual-zh-en-2023-02-16.json b/projects/llm_framework/main_asr/mode_sherpa-onnx-streaming-zipformer-small-bilingual-zh-en-2023-02-16.json
@@ -1,7 +1,7 @@
 {
-    "mode": "sherpa-onnx-zipformer-bilingual-zh-en-t",
+    "mode": "sherpa-onnx-streaming-zipformer-small-bilingual-zh-en-2023-02-16",
     "type": "asr",
-    "homepage": "",
+    "homepage": "https://huggingface.co/csukuangfj/k2fsa-zipformer-bilingual-zh-en-t",
     "capabilities": [
         "Automatic_Speech_Recognition",
         "Chinese",
diff --git a/projects/llm_framework/main_asr/src/main.cpp b/projects/llm_framework/main_asr/src/main.cpp
@@ -73,6 +73,7 @@ class llm_task {
     sherpa_onnx::OnlineRecognizerConfig onnx_online_config;
 
     sherpa_onnx::VadModelConfig vad_config_;
+    std::unique_ptr<sherpa_onnx::OfflineStream> offline_stream_;
     std::unique_ptr<sherpa_onnx::OfflineRecognizer> onnx_recognizer_;
     std::unique_ptr<sherpa_onnx::OnlineRecognizer> onnx_online_recognizer_;
     std::unique_ptr<sherpa_onnx::OnlineStream> online_stream;
@@ -549,16 +550,16 @@ class llm_task {
         vad_->AcceptWaveform(floatSamples.data(), floatSamples.size());
         while (!vad_->Empty()) {
             const auto &segment = vad_->Front();
-            auto s              = onnx_recognizer_->CreateStream();
-            s->AcceptWaveform(onnx_asr_config_.feat_config.sampling_rate, segment.samples.data(),
-                              segment.samples.size());
-            onnx_recognizer_->DecodeStream(s.get());
-            const auto &result = s->GetResult();
+            if (!offline_stream_) offline_stream_ = onnx_recognizer_->CreateStream();
+            offline_stream_->AcceptWaveform(onnx_asr_config_.feat_config.sampling_rate, segment.samples.data(),
+                                            segment.samples.size());
+            onnx_recognizer_->DecodeStream(offline_stream_.get());
+            const auto &result = offline_stream_->GetResult();
             if (!result.text.empty() && out_callback_) {
-                SLOGI("onnx-asr result: %s", result.text.c_str());
                 out_callback_(result.text, true);
             }
             vad_->Pop();
+            offline_stream_.reset();
         }
 
         {
diff --git a/projects/llm_framework/main_audio/SConstruct b/projects/llm_framework/main_audio/SConstruct
@@ -29,7 +29,7 @@ REQUIREMENTS += ['ax_sys', 'ax_audio', 'ax_audio_3a', 'tinyalsa', 'fdk-aac', 'op
 if 'CONFIG_AX_620E_MSP_ENABLED' in os.environ:
     REQUIREMENTS += ['ax_interpreter', 'ax_fdk', 'ax_opus']
 
-STATIC_FILES += [AFile('audio.json'), AFile('audio_kit.json')]
+STATIC_FILES += [AFile('audio.json'), AFile('audio_kit.json'), AFile('audio_pyramid.json')]
 STATIC_FILES += Glob('mode_*.json')
 
 env['COMPONENTS'].append({'target':'llm_audio-1.7',
diff --git a/projects/llm_framework/main_audio/audio_pyramid.json b/projects/llm_framework/main_audio/audio_pyramid.json
@@ -0,0 +1,115 @@
+{
+    "mode": "None",
+    "type": "audio",
+    "capabilities": [
+        "play",
+        "cap"
+    ],
+    "input_type": [
+        "rpc.audio.wav.base64",
+        "rpc.audio.pcm.base64"
+    ],
+    "output_type": [
+        "audio.pcm.stream"
+    ],
+    "play_param": {
+        "card": 0,
+        "device": 0,
+        "volume": 0.5,
+        "channel": 2,
+        "rate": 48000,
+        "bit": 16,
+        "stPoolConfig.MetaSize": 8192,
+        "stPoolConfig.BlkSize": 32768,
+        "stPoolConfig.BlkCnt": 37,
+        "stPoolConfig.IsMergeMode": 0,
+        "stPoolConfig.CacheMode": 0,
+        "stPoolConfig.PartitionName": "anonymous",
+        "stAttr.enBitwidth": 1,
+        "stAttr.enSoundmode": 0,
+        "stAttr.u32ChnCnt": 2,
+        "stAttr.enLinkMode": 0,
+        "stAttr.enSamplerate": 16000,
+        "stAttr.U32Depth": 30,
+        "stAttr.u32PeriodSize": 160,
+        "stAttr.u32PeriodCount": 8,
+        "stAttr.bInsertSilence": 0,
+        "stVqeAttr.s32SampleRate": 16000,
+        "stVqeAttr.u32FrameSamples": 160,
+        "stVqeAttr.stNsCfg.bNsEnable": 0,
+        "stVqeAttr.stNsCfg.enAggressivenessLevel": 2,
+        "stVqeAttr.stAgcCfg.bAgcEnable": 0,
+        "stVqeAttr.stAgcCfg.enAgcMode": 2,
+        "stVqeAttr.stAgcCfg.s16TargetLevel": -3,
+        "stVqeAttr.stAgcCfg.s16Gain": 9,
+        "stHpfAttr.bEnable": 0,
+        "stHpfAttr.s32GainDb": -3,
+        "stHpfAttr.s32Samplerate": 16000,
+        "stHpfAttr.s32Freq": 200,
+        "stLpfAttr.bEnable": 0,
+        "stLpfAttr.s32GainDb": 0,
+        "stLpfAttr.s32Samplerate": 16000,
+        "stLpfAttr.s32Freq": 3000,
+        "stEqAttr.bEnable": 0,
+        "stEqAttr.s32GainDb[0]": -10,
+        "stEqAttr.s32GainDb[1]": -3,
+        "stEqAttr.s32GainDb[2]": 3,
+        "stEqAttr.s32GainDb[3]": 5,
+        "stEqAttr.s32GainDb[4]": 10,
+        "stEqAttr.s32Samplerate": 16000,
+        "gResample": 0,
+        "enInSampleRate": 16000,
+        "gInstant": 0,
+        "gInsertSilence": 0
+    },
+    "cap_param": {
+        "sys_pcm_cap_channel": "ipc:///tmp/llm/pcm.cap.socket",
+        "card": 0,
+        "device": 1,
+        "volume": 1.0,
+        "channel": 4,
+        "rate": 16000,
+        "bit": 16,
+        "stPoolConfig.MetaSize": 8192,
+        "stPoolConfig.BlkSize": 7680,
+        "stPoolConfig.BlkCnt": 33,
+        "stPoolConfig.IsMergeMode": 0,
+        "stPoolConfig.CacheMode": 0,
+        "stPoolConfig.PartitionName": "anonymous",
+        "aistAttr.enBitwidth": 1,
+        "aistAttr.enLinkMode": 0,
+        "aistAttr.enSamplerate": 16000,
+        "aistAttr.enLayoutMode": 2,
+        "aistAttr.U32Depth": 30,
+        "aistAttr.u32PeriodSize": 160,
+        "aistAttr.u32PeriodCount": 8,
+        "aistAttr.u32ChnCnt": 2,
+        "aistVqeAttr.s32SampleRate": 16000,
+        "aistVqeAttr.u32FrameSamples": 160,
+        "aistVqeAttr.stNsCfg.bNsEnable": 1,
+        "aistVqeAttr.stNsCfg.enAggressivenessLevel": 2,
+        "aistVqeAttr.stAgcCfg.bAgcEnable": 0,
+        "aistVqeAttr.stAgcCfg.enAgcMode": 2,
+        "aistVqeAttr.stAgcCfg.s16TargetLevel": -3,
+        "aistVqeAttr.stAgcCfg.s16Gain": 9,
+        "aistVqeAttr.stAecCfg.enAecMode": 2,
+        "stHpfAttr.bEnable": 0,
+        "stHpfAttr.s32GainDb": -3,
+        "stHpfAttr.s32Samplerate": 16000,
+        "stHpfAttr.s32Freq": 200,
+        "stLpfAttr.bEnable": 0,
+        "stLpfAttr.s32GainDb": 0,
+        "stLpfAttr.s32Samplerate": 16000,
+        "stLpfAttr.s32Freq": 3000,
+        "stEqAttr.bEnable": 0,
+        "stEqAttr.s32GainDb[0]": -10,
+        "stEqAttr.s32GainDb[1]": -3,
+        "stEqAttr.s32GainDb[2]": 3,
+        "stEqAttr.s32GainDb[3]": 5,
+        "stEqAttr.s32GainDb[4]": 10,
+        "stEqAttr.s32Samplerate": 16000,
+        "gResample": 0,
+        "enOutSampleRate": 16000,
+        "gDbDetection": 0
+    }
+}
diff --git a/projects/llm_framework/main_audio/src/main.cpp b/projects/llm_framework/main_audio/src/main.cpp
@@ -209,7 +209,7 @@ class llm_audio : public StackFlow {
         }
 #else
         std::list<std::string> config_file_paths =
-            get_config_file_paths(base_model_path, base_model_config_path, "audio");
+            get_config_file_paths(base_model_path, base_model_config_path, "audio_pyramid");
 #endif
         try {
             config_body = nlohmann::json::parse(data);
diff --git a/projects/llm_framework/main_kws/mode_kws-ax650.json b/projects/llm_framework/main_kws/mode_kws-ax650.json
@@ -0,0 +1,41 @@
+{
+    "mode": "kws-ax650",
+    "type": "kws",
+    "homepage": "",
+    "capabilities": [
+        "Keyword_spotting",
+        "English"
+    ],
+    "input_type": [
+        "sys.pcm",
+        "sys.cap.0_0"
+    ],
+    "output_type": [
+        "kws.bool"
+    ],
+    "mode_param": {
+        "model": "kws.axmodel",
+        "model_type": "axera",
+        "wake_wav_file": "/opt/m5stack/data/audio/wakeup_zh_cn.wav",
+        "chunk_size": 32,
+        "threshold": 0.9,
+        "min_continuous_frames": 5,
+        "REFRACTORY_TIME_MS": 2000,
+        "RESAMPLE_RATE": 16000,
+        "FEAT_DIM": 80,
+        "frame_opts.samp_freq": 16000,
+        "frame_opts.frame_length_ms": 25.0,
+        "frame_opts.frame_shift_ms": 10.0,
+        "frame_opts.snip_edges": false,
+        "frame_opts.dither": 0.0,
+        "frame_opts.preemph_coeff": 0.97,
+        "frame_opts.remove_dc_offset": true,
+        "frame_opts.window_type": "povey",
+        "mel_opts.num_bins": 80,
+        "mel_opts.low_freq": 20,
+        "mel_opts.high_freq": 0,
+        "energy_floor": 0.0,
+        "use_energy": false,
+        "raw_energy": true
+    }
+}
diff --git a/projects/llm_framework/main_kws/mode_kws.json b/projects/llm_framework/main_kws/mode_kws.json
@@ -15,7 +15,7 @@
     ],
     "mode_param": {
         "model": "kws.axmodel",
-        "model_type": "onnx",
+        "model_type": "axera",
         "wake_wav_file": "/opt/m5stack/data/audio/wakeup_zh_cn.wav",
         "chunk_size": 32,
         "threshold": 0.9,
@@ -37,6 +37,5 @@
         "energy_floor": 0.0,
         "use_energy": false,
         "raw_energy": true
-    },
-    "mode_param_bak": {}
+    }
 }
diff --git a/projects/llm_framework/main_kws/src/main.cpp b/projects/llm_framework/main_kws/src/main.cpp
diff --git a/projects/llm_framework/tools/llm_pack.py b/projects/llm_framework/tools/llm_pack.py

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -209,7 +209,7 @@ class llm_audio : public StackFlow {` |
| `209` | `209` | `}` |
| `210` | `210` | `#else` |
| `211` | `211` | `std::list<std::string> config_file_paths =` |
| `212` |  | `- get_config_file_paths(base_model_path, base_model_config_path, "audio");` |
|  | `212` | `+ get_config_file_paths(base_model_path, base_model_config_path, "audio_pyramid");` |
| `213` | `213` | `#endif` |
| `214` | `214` | `try {` |
| `215` | `215` | `config_body = nlohmann::json::parse(data);` |