Add wake word to xmini-c3 (#730)

* esp-hi: MCP protocol is not ready yet
* Add wake word to xmini-c3
2025-05-31 22:21:03 +08:00 · 2025-05-31 22:21:03 +08:00 · ae57131c15
commit ae57131c15
parent 6cb025859f
27 changed files with 399 additions and 155 deletions
--- a/.gitignore
+++ b/.gitignore
@ -10,5 +10,6 @@ dependencies.lock
 .env
 releases/
 main/assets/lang_config.h
+main/mmap_generate_emoji.h
 .DS_Store
 .cache
--- a/main/CMakeLists.txt
+++ b/main/CMakeLists.txt
@ -194,13 +194,14 @@ list(APPEND SOURCES ${BOARD_SOURCES})
 if(CONFIG_USE_AUDIO_PROCESSOR)
    list(APPEND SOURCES "audio_processing/afe_audio_processor.cc")
 else()
-    list(APPEND SOURCES "audio_processing/dummy_audio_processor.cc")
+    list(APPEND SOURCES "audio_processing/no_audio_processor.cc")
 endif()
-if(CONFIG_USE_WAKE_WORD_DETECT)
-    list(APPEND SOURCES "audio_processing/wake_word_detect.cc")
-endif()
-if(CONFIG_USE_WAKE_WORD_DETECT_NO_AFE)
-    list(APPEND SOURCES "audio_processing/wake_word_no_afe.cc")
+if(CONFIG_USE_AFE_WAKE_WORD)
+    list(APPEND SOURCES "audio_processing/afe_wake_word.cc")
+elseif(CONFIG_USE_ESP_WAKE_WORD)
+    list(APPEND SOURCES "audio_processing/esp_wake_word.cc")
+else()
+    list(APPEND SOURCES "audio_processing/no_wake_word.cc")
 endif()

 # 根据Kconfig选择语言目录
--- a/main/Kconfig.projbuild
+++ b/main/Kconfig.projbuild
@ -30,152 +30,226 @@ choice BOARD_TYPE
        Board type. 开发板类型
    config BOARD_TYPE_BREAD_COMPACT_WIFI
        bool "面包板新版接线（WiFi）"
+        depends on IDF_TARGET_ESP32S3
    config BOARD_TYPE_BREAD_COMPACT_WIFI_LCD
        bool "面包板新版接线（WiFi）+ LCD"
+        depends on IDF_TARGET_ESP32S3
    config BOARD_TYPE_BREAD_COMPACT_ML307
        bool "面包板新版接线（ML307 AT）"
+        depends on IDF_TARGET_ESP32S3
    config BOARD_TYPE_BREAD_COMPACT_ESP32
        bool "面包板（WiFi） ESP32 DevKit"
+        depends on IDF_TARGET_ESP32
    config BOARD_TYPE_BREAD_COMPACT_ESP32_LCD
        bool "面包板（WiFi+ LCD） ESP32 DevKit"
+        depends on IDF_TARGET_ESP32
    config BOARD_TYPE_XMINI_C3
        bool "虾哥 Mini C3"
+        depends on IDF_TARGET_ESP32C3
    config BOARD_TYPE_ESP32S3_KORVO2_V3
        bool "ESP32S3_KORVO2_V3开发板"
+        depends on IDF_TARGET_ESP32S3
    config BOARD_TYPE_ESP_SPARKBOT
        bool "ESP-SparkBot开发板"
+        depends on IDF_TARGET_ESP32S3
    config BOARD_TYPE_ESP_SPOT_S3
        bool "ESP-Spot-S3"
+        depends on IDF_TARGET_ESP32S3
    config BOARD_TYPE_ESP_HI
        bool "ESP-HI"
+        depends on IDF_TARGET_ESP32C3
    config BOARD_TYPE_ESP_BOX_3
        bool "ESP BOX 3"
+        depends on IDF_TARGET_ESP32S3
    config BOARD_TYPE_ESP_BOX
        bool "ESP BOX"
+        depends on IDF_TARGET_ESP32S3
    config BOARD_TYPE_ESP_BOX_LITE
        bool "ESP BOX Lite"
+        depends on IDF_TARGET_ESP32S3
    config BOARD_TYPE_KEVIN_BOX_1
        bool "Kevin Box 1"
+        depends on IDF_TARGET_ESP32S3
    config BOARD_TYPE_KEVIN_BOX_2
        bool "Kevin Box 2"
+        depends on IDF_TARGET_ESP32S3
    config BOARD_TYPE_KEVIN_C3
        bool "Kevin C3"
+        depends on IDF_TARGET_ESP32C3
    config BOARD_TYPE_KEVIN_SP_V3_DEV
        bool "Kevin SP V3开发板"
+        depends on IDF_TARGET_ESP32S3
    config BOARD_TYPE_KEVIN_SP_V4_DEV
        bool "Kevin SP V4开发板"
+        depends on IDF_TARGET_ESP32S3
    config BOARD_TYPE_ESP32_CGC
        bool "ESP32 CGC"
+        depends on IDF_TARGET_ESP32
    config BOARD_TYPE_KEVIN_YUYING_313LCD
        bool "鱼鹰科技3.13LCD开发板"
+        depends on IDF_TARGET_ESP32S3
    config BOARD_TYPE_LICHUANG_DEV
        bool "立创·实战派ESP32-S3开发板"
+        depends on IDF_TARGET_ESP32S3
    config BOARD_TYPE_LICHUANG_C3_DEV
        bool "立创·实战派ESP32-C3开发板"
+        depends on IDF_TARGET_ESP32C3
    config BOARD_TYPE_DF_K10
        bool "DFRobot 行空板 k10"
+        depends on IDF_TARGET_ESP32S3
    config BOARD_TYPE_DF_S3_AI_CAM
        bool "DFRobot ESP32-S3 AI智能摄像头模块"
+        depends on IDF_TARGET_ESP32S3
    config BOARD_TYPE_MAGICLICK_2P4
        bool "神奇按钮 Magiclick_2.4"
+        depends on IDF_TARGET_ESP32S3
    config BOARD_TYPE_MAGICLICK_2P5
        bool "神奇按钮 Magiclick_2.5"
+        depends on IDF_TARGET_ESP32S3
    config BOARD_TYPE_MAGICLICK_C3
        bool "神奇按钮 Magiclick_C3"
+        depends on IDF_TARGET_ESP32C3
    config BOARD_TYPE_MAGICLICK_C3_V2
        bool "神奇按钮 Magiclick_C3_v2"
+        depends on IDF_TARGET_ESP32C3
    config BOARD_TYPE_M5STACK_CORE_S3
        bool "M5Stack CoreS3"
+        depends on IDF_TARGET_ESP32S3
    config BOARD_TYPE_M5STACK_CORE_TAB5
        bool "M5Stack Tab5"
+        depends on IDF_TARGET_ESP32P4
    config BOARD_TYPE_ATOMS3_ECHO_BASE
        bool "AtomS3 + Echo Base"
+        depends on IDF_TARGET_ESP32S3
    config BOARD_TYPE_ATOMS3R_ECHO_BASE
        bool "AtomS3R + Echo Base"
+        depends on IDF_TARGET_ESP32S3
    config BOARD_TYPE_ATOMS3R_CAM_M12_ECHO_BASE
        bool "AtomS3R CAM/M12 + Echo Base"
+        depends on IDF_TARGET_ESP32S3
    config BOARD_TYPE_ATOMMATRIX_ECHO_BASE
        bool "AtomMatrix + Echo Base"
+        depends on IDF_TARGET_ESP32S3
    config BOARD_TYPE_ESP32S3_Touch_AMOLED_1_8
        bool "Waveshare ESP32-S3-Touch-AMOLED-1.8"
+        depends on IDF_TARGET_ESP32S3
    config BOARD_TYPE_ESP32S3_Touch_AMOLED_1_75
        bool "Waveshare ESP32-S3-Touch-AMOLED-1.75"
+        depends on IDF_TARGET_ESP32S3
    config BOARD_TYPE_ESP32S3_Touch_LCD_1_85C
        bool "Waveshare ESP32-S3-Touch-LCD-1.85C"
+        depends on IDF_TARGET_ESP32S3
    config BOARD_TYPE_ESP32S3_Touch_LCD_1_85
        bool "Waveshare ESP32-S3-Touch-LCD-1.85"
+        depends on IDF_TARGET_ESP32S3
    config BOARD_TYPE_ESP32S3_Touch_LCD_1_46
        bool "Waveshare ESP32-S3-Touch-LCD-1.46"
+        depends on IDF_TARGET_ESP32S3
    config BOARD_TYPE_ESP32S3_Touch_LCD_3_5
        bool "Waveshare ESP32-S3-Touch-LCD-3.5"
+        depends on IDF_TARGET_ESP32S3
    config BOARD_TYPE_ESP32P4_NANO
        bool "Waveshare ESP32-P4-NANO"
+        depends on IDF_TARGET_ESP32P4
    config BOARD_TYPE_ESP32P4_WIFI6_Touch_LCD_4B
        bool "Waveshare ESP32-P4-WIFI6-Touch-LCD-4B"
+        depends on IDF_TARGET_ESP32P4
    config BOARD_TYPE_ESP32P4_WIFI6_Touch_LCD_XC
        bool "Waveshare ESP32-P4-WIFI6-Touch-LCD-3.4C or ESP32-P4-WIFI6-Touch-LCD-4C"
+        depends on IDF_TARGET_ESP32P4
    config BOARD_TYPE_TUDOUZI
        bool "土豆子"
+        depends on IDF_TARGET_ESP32S3
    config BOARD_TYPE_LILYGO_T_CIRCLE_S3
        bool "LILYGO T-Circle-S3"
+        depends on IDF_TARGET_ESP32S3
    config BOARD_TYPE_LILYGO_T_CAMERAPLUS_S3_V1_0_V1_1
        bool "LILYGO T-CameraPlus-S3_V1_0_V1_1"
+        depends on IDF_TARGET_ESP32S3
    config BOARD_TYPE_LILYGO_T_CAMERAPLUS_S3_V1_2
        bool "LILYGO T-CameraPlus-S3_V1_2"
+        depends on IDF_TARGET_ESP32S3
    config BOARD_TYPE_LILYGO_T_DISPLAY_S3_PRO_MVSRLORA
        bool "LILYGO T-Display-S3-Pro-MVSRLora"
+        depends on IDF_TARGET_ESP32S3
    config BOARD_TYPE_LILYGO_T_DISPLAY_S3_PRO_MVSRLORA_NO_BATTERY
        bool "LILYGO T-Display-S3-Pro-MVSRLora_No_Battery"
+        depends on IDF_TARGET_ESP32S3
    config BOARD_TYPE_MOVECALL_MOJI_ESP32S3
        bool "Movecall Moji 小智AI衍生版"
+        depends on IDF_TARGET_ESP32S3
    config BOARD_TYPE_MOVECALL_CUICAN_ESP32S3
        bool "Movecall CuiCan 璀璨·AI吊坠"
+        depends on IDF_TARGET_ESP32S3
    config BOARD_TYPE_ATK_DNESP32S3
        bool "正点原子DNESP32S3开发板"
+        depends on IDF_TARGET_ESP32S3
    config BOARD_TYPE_ATK_DNESP32S3_BOX
        bool "正点原子DNESP32S3-BOX"
+        depends on IDF_TARGET_ESP32S3
    config BOARD_TYPE_ATK_DNESP32S3_BOX0
        bool "正点原子DNESP32S3-BOX0"
+        depends on IDF_TARGET_ESP32S3
    config BOARD_TYPE_ATK_DNESP32S3M_WIFI
        bool "正点原子DNESP32S3M-WIFI"
+        depends on IDF_TARGET_ESP32S3
    config BOARD_TYPE_ATK_DNESP32S3M_4G
        bool "正点原子DNESP32S3M-4G"
+        depends on IDF_TARGET_ESP32S3
    config BOARD_TYPE_DU_CHATX
        bool "嘟嘟开发板CHATX(wifi)"
+        depends on IDF_TARGET_ESP32S3
    config BOARD_TYPE_ESP32S3_Taiji_Pi
        bool "太极小派esp32s3"
+        depends on IDF_TARGET_ESP32S3
    config BOARD_TYPE_XINGZHI_Cube_0_85TFT_WIFI
        bool "无名科技星智0.85(WIFI)"
+        depends on IDF_TARGET_ESP32S3
    config BOARD_TYPE_XINGZHI_Cube_0_85TFT_ML307
        bool "无名科技星智0.85(ML307)"
+        depends on IDF_TARGET_ESP32S3
    config BOARD_TYPE_XINGZHI_Cube_0_96OLED_WIFI
        bool "无名科技星智0.96(WIFI)"
+        depends on IDF_TARGET_ESP32S3
    config BOARD_TYPE_XINGZHI_Cube_0_96OLED_ML307
        bool "无名科技星智0.96(ML307)"
+        depends on IDF_TARGET_ESP32S3
    config BOARD_TYPE_XINGZHI_Cube_1_54TFT_WIFI
        bool "无名科技星智1.54(WIFI)"
+        depends on IDF_TARGET_ESP32S3
    config BOARD_TYPE_XINGZHI_Cube_1_54TFT_ML307
        bool "无名科技星智1.54(ML307)"
+        depends on IDF_TARGET_ESP32S3
    config BOARD_TYPE_SENSECAP_WATCHER
        bool "SenseCAP Watcher"
+        depends on IDF_TARGET_ESP32S3
    config BOARD_TYPE_DOIT_S3_AIBOX
        bool "四博智联AI陪伴盒子"
+        depends on IDF_TARGET_ESP32S3
    config BOARD_TYPE_MIXGO_NOVA
        bool "元控·青春"
+        depends on IDF_TARGET_ESP32S3
    config BOARD_TYPE_GENJUTECH_S3_1_54TFT
        bool "亘具科技1.54(s3)"
+        depends on IDF_TARGET_ESP32S3
    config BOARD_TYPE_ESP_S3_LCD_EV_Board
        bool "乐鑫ESP S3 LCD EV Board开发板"
+        depends on IDF_TARGET_ESP32S3
    config BOARD_TYPE_ZHENGCHEN_1_54TFT_WIFI
        bool "征辰科技1.54(WIFI)"
+        depends on IDF_TARGET_ESP32S3
    config BOARD_TYPE_ZHENGCHEN_1_54TFT_ML307
        bool "征辰科技1.54(ML307)"
+        depends on IDF_TARGET_ESP32S3
    config BOARD_TYPE_MINSI_K08_DUAL
        bool "敏思科技K08(DUAL)"
+        depends on IDF_TARGET_ESP32S3
    config BOARD_TYPE_ESP32_S3_1_54_MUMA
        bool "Spotpear ESP32-S3-1.54-MUMA"
+        depends on IDF_TARGET_ESP32S3
    config BOARD_TYPE_ESP32_S3_1_28_BOX
        bool "Spotpear ESP32-S3-1.28-BOX"
+        depends on IDF_TARGET_ESP32S3
 endchoice

 choice ESP_S3_LCD_EV_Board_Version_TYPE
@ -270,24 +344,26 @@ config USE_WECHAT_MESSAGE_STYLE
    help
        使用微信聊天界面风格

-config USE_WAKE_WORD_DETECT_NO_AFE
+config USE_ESP_WAKE_WORD
    bool "Enable Wake Word Detection (without AFE)"
    default y
    depends on IDF_TARGET_ESP32C3 || IDF_TARGET_ESP32C5
-
-config USE_WAKE_WORD_DETECT
-    bool "Enable Wake Word Detection"
-    default y
-    depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4 && SPIRAM
    help
-        需要 ESP32 S3 与 AFE 支持
+        支持 ESP32 C3 与 ESP32 C5
+
+config USE_AFE_WAKE_WORD
+    bool "Enable Wake Word Detection (AFE)"
+    default n
+    depends on (IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4) && SPIRAM
+    help
+        需要 ESP32 S3 与 PSRAM 支持

 config USE_AUDIO_PROCESSOR
    bool "Enable Audio Noise Reduction"
    default y
-    depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4 && SPIRAM
+    depends on (IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4) && SPIRAM
    help
-        需要 ESP32 S3 与 AFE 支持
+        需要 ESP32 S3 与 PSRAM 支持

 config USE_DEVICE_AEC
    bool "Enable Device-Side AEC"
@ -297,7 +373,7 @@ config USE_DEVICE_AEC
        因为性能不够，不建议和微信聊天界面风格同时开启

 config USE_SERVER_AEC
-    bool "Enable Server-Side AEC"
+    bool "Enable Server-Side AEC (Unstable)"
    default n
    depends on USE_AUDIO_PROCESSOR
    help
--- a/main/application.cc
+++ b/main/application.cc
@ -14,7 +14,15 @@
 #if CONFIG_USE_AUDIO_PROCESSOR
 #include "afe_audio_processor.h"
 #else
-#include "dummy_audio_processor.h"
+#include "no_audio_processor.h"
+#endif
+
+#if CONFIG_USE_AFE_WAKE_WORD
+#include "afe_wake_word.h"
+#elif CONFIG_USE_ESP_WAKE_WORD
+#include "esp_wake_word.h"
+#else
+#include "no_wake_word.h"
 #endif

 #include <cstring>
@ -55,7 +63,15 @@ Application::Application() {
 #if CONFIG_USE_AUDIO_PROCESSOR
    audio_processor_ = std::make_unique<AfeAudioProcessor>();
 #else
-    audio_processor_ = std::make_unique<DummyAudioProcessor>();
+    audio_processor_ = std::make_unique<NoAudioProcessor>();
+#endif
+
+#if CONFIG_USE_AFE_WAKE_WORD
+    wake_word_ = std::make_unique<AfeWakeWord>();
+#elif CONFIG_USE_ESP_WAKE_WORD
+    wake_word_ = std::make_unique<EspWakeWord>();
+#else
+    wake_word_ = std::make_unique<NoWakeWord>();
 #endif

    esp_timer_create_args_t clock_timer_args = {
@ -129,9 +145,7 @@ void Application::CheckNewVersion() {

            auto& board = Board::GetInstance();
            board.SetPowerSaveMode(false);
-#if CONFIG_USE_WAKE_WORD_DETECT || CONFIG_USE_WAKE_WORD_DETECT_NO_AFE
-            wake_word_detect_.StopDetection();
-#endif
+            wake_word_->StopDetection();
            // 预先关闭音频输出，避免升级过程有音频操作
            auto codec = board.GetAudioCodec();
            codec->EnableInput(false);
@ -256,8 +270,6 @@ void Application::PlaySound(const std::string_view& sound) {
    }
    background_task_->WaitForCompletion();

-    // The assets are encoded at 16000Hz, 60ms frame duration
-    SetDecodeSampleRate(16000, 60);
    const char* data = sound.data();
    size_t size = sound.size();
    for (const char* p = data; p < data + size; ) {
@ -266,6 +278,8 @@ void Application::PlaySound(const std::string_view& sound) {

        auto payload_size = ntohs(p3->payload_size);
        AudioStreamPacket packet;
+        packet.sample_rate = 16000;
+        packet.frame_duration = 60;
        packet.payload.resize(payload_size);
        memcpy(packet.payload.data(), p3->payload, payload_size);
        p += payload_size;
@ -432,7 +446,7 @@ void Application::Start() {
    });
    protocol_->OnIncomingAudio([this](AudioStreamPacket&& packet) {
        std::lock_guard<std::mutex> lock(mutex_);
-        if (audio_decode_queue_.size() < MAX_AUDIO_PACKETS_IN_QUEUE) {
+        if (device_state_ == kDeviceStateSpeaking && audio_decode_queue_.size() < MAX_AUDIO_PACKETS_IN_QUEUE) {
            audio_decode_queue_.emplace_back(std::move(packet));
        }
    });
@ -442,7 +456,6 @@ void Application::Start() {
            ESP_LOGW(TAG, "Server sample rate %d does not match device output sample rate %d, resampling may cause distortion",
                protocol_->server_sample_rate(), codec->output_sample_rate());
        }
-        SetDecodeSampleRate(protocol_->server_sample_rate(), protocol_->server_frame_duration());

 #if CONFIG_IOT_PROTOCOL_XIAOZHI
        auto& thing_manager = iot::ThingManager::GetInstance();
@ -600,28 +613,40 @@ void Application::Start() {
        }
    });

-#if CONFIG_USE_WAKE_WORD_DETECT || CONFIG_USE_WAKE_WORD_DETECT_NO_AFE
-    wake_word_detect_.Initialize(codec);
-#ifdef CONFIG_USE_WAKE_WORD_DETECT
-    wake_word_detect_.OnWakeWordDetected([this](const std::string& wake_word) {
+    wake_word_->Initialize(codec);
+    wake_word_->OnWakeWordDetected([this](const std::string& wake_word) {
        Schedule([this, &wake_word]() {
-            if (device_state_ == kDeviceStateIdle) {
-                SetDeviceState(kDeviceStateConnecting);
-                wake_word_detect_.EncodeWakeWordData();
+            if (!protocol_) {
+                return;
+            }

-                if (!protocol_ || !protocol_->OpenAudioChannel()) {
-                    wake_word_detect_.StartDetection();
-                    return;
+            if (device_state_ == kDeviceStateIdle) {
+                wake_word_->EncodeWakeWordData();
+
+                if (!protocol_->IsAudioChannelOpened()) {
+                    SetDeviceState(kDeviceStateConnecting);
+                    if (!protocol_->OpenAudioChannel()) {
+                        wake_word_->StartDetection();
+                        return;
+                    }
                }
-                
+
+                ESP_LOGI(TAG, "Wake word detected: %s", wake_word.c_str());
+#if CONFIG_USE_AFE_WAKE_WORD
                AudioStreamPacket packet;
                // Encode and send the wake word data to the server
-                while (wake_word_detect_.GetWakeWordOpus(packet.payload)) {
+                while (wake_word_->GetWakeWordOpus(packet.payload)) {
                    protocol_->SendAudio(packet);
                }
                // Set the chat state to wake word detected
                protocol_->SendWakeWordDetected(wake_word);
-                ESP_LOGI(TAG, "Wake word detected: %s", wake_word.c_str());
+#else
+                // Play the pop up sound to indicate the wake word is detected
+                // And wait 60ms to make sure the queue has been processed by audio task
+                ResetDecoder();
+                PlaySound(Lang::Sounds::P3_POPUP);
+                vTaskDelay(pdMS_TO_TICKS(60));
+#endif
                SetListeningMode(aec_mode_ == kAecOff ? kListeningModeAutoStop : kListeningModeRealtime);
            } else if (device_state_ == kDeviceStateSpeaking) {
                AbortSpeaking(kAbortReasonWakeWordDetected);
@ -630,9 +655,7 @@ void Application::Start() {
            }
        });
    });
-#endif
-    wake_word_detect_.StartDetection();
-#endif
+    wake_word_->StartDetection();

    // Wait for the new version check to finish
    xEventGroupWaitBits(event_group_, CHECK_NEW_VERSION_DONE_EVENT, pdTRUE, pdFALSE, portMAX_DELAY);
@ -751,17 +774,14 @@ void Application::OnAudioOutput() {
        return;
    }

-    if (device_state_ == kDeviceStateListening) {
-        audio_decode_queue_.clear();
-        audio_decode_cv_.notify_all();
-        return;
-    }
-
    auto packet = std::move(audio_decode_queue_.front());
    audio_decode_queue_.pop_front();
    lock.unlock();
    audio_decode_cv_.notify_all();

+    // Synchronize the sample rate and frame duration
+    SetDecodeSampleRate(packet.sample_rate, packet.frame_duration);
+
    busy_decoding_audio_ = true;
    background_task_->Schedule([this, codec, packet = std::move(packet)]() mutable {
        busy_decoding_audio_ = false;
@ -782,45 +802,48 @@ void Application::OnAudioOutput() {
        }
        codec->OutputData(pcm);
 #ifdef CONFIG_USE_SERVER_AEC
-            std::lock_guard<std::mutex> lock(timestamp_mutex_);
-            timestamp_queue_.push_back(packet.timestamp);
-            last_output_timestamp_ = packet.timestamp;
+        std::lock_guard<std::mutex> lock(timestamp_mutex_);
+        timestamp_queue_.push_back(packet.timestamp);
 #endif
        last_output_time_ = std::chrono::steady_clock::now();
    });
 }

 void Application::OnAudioInput() {
-#if CONFIG_USE_WAKE_WORD_DETECT || CONFIG_USE_WAKE_WORD_DETECT_NO_AFE
-    if (wake_word_detect_.IsDetectionRunning()) {
+    if (wake_word_->IsDetectionRunning()) {
        std::vector<int16_t> data;
-        int samples = wake_word_detect_.GetFeedSize();
+        int samples = wake_word_->GetFeedSize();
        if (samples > 0) {
-            ReadAudio(data, 16000, samples);
-            wake_word_detect_.Feed(data);
-            return;
+            if (ReadAudio(data, 16000, samples)) {
+                wake_word_->Feed(data);
+                return;
+            }
        }
    }
-#endif
    if (audio_processor_->IsRunning()) {
        std::vector<int16_t> data;
        int samples = audio_processor_->GetFeedSize();
        if (samples > 0) {
-            ReadAudio(data, 16000, samples);
-            audio_processor_->Feed(data);
-            return;
+            if (ReadAudio(data, 16000, samples)) {
+                audio_processor_->Feed(data);
+                return;
+            }
        }
    }

    vTaskDelay(pdMS_TO_TICKS(OPUS_FRAME_DURATION_MS / 2));
 }

-void Application::ReadAudio(std::vector<int16_t>& data, int sample_rate, int samples) {
+bool Application::ReadAudio(std::vector<int16_t>& data, int sample_rate, int samples) {
    auto codec = Board::GetInstance().GetAudioCodec();
+    if (!codec->input_enabled()) {
+        return false;
+    }
+
    if (codec->input_sample_rate() != sample_rate) {
        data.resize(samples * codec->input_sample_rate() / sample_rate);
        if (!codec->InputData(data)) {
-            return;
+            return false;
        }
        if (codec->input_channels() == 2) {
            auto mic_channel = std::vector<int16_t>(data.size() / 2);
@ -846,9 +869,10 @@ void Application::ReadAudio(std::vector<int16_t>& data, int sample_rate, int sam
    } else {
        data.resize(samples);
        if (!codec->InputData(data)) {
-            return;
+            return false;
        }
    }
+    return true;
 }

 void Application::AbortSpeaking(AbortReason reason) {
@ -884,17 +908,13 @@ void Application::SetDeviceState(DeviceState state) {
            display->SetStatus(Lang::Strings::STANDBY);
            display->SetEmotion("neutral");
            audio_processor_->Stop();
-            
-#if CONFIG_USE_WAKE_WORD_DETECT || CONFIG_USE_WAKE_WORD_DETECT_NO_AFE
-            wake_word_detect_.StartDetection();
-#endif
+            wake_word_->StartDetection();
            break;
        case kDeviceStateConnecting:
            display->SetStatus(Lang::Strings::CONNECTING);
            display->SetEmotion("neutral");
            display->SetChatMessage("system", "");
            timestamp_queue_.clear();
-            last_output_timestamp_ = 0;
            break;
        case kDeviceStateListening:
            display->SetStatus(Lang::Strings::LISTENING);
@ -909,14 +929,14 @@ void Application::SetDeviceState(DeviceState state) {
                // Send the start listening command
                protocol_->SendStartListening(listening_mode_);
                if (previous_state == kDeviceStateSpeaking) {
+                    audio_decode_queue_.clear();
+                    audio_decode_cv_.notify_all();
                    // FIXME: Wait for the speaker to empty the buffer
                    vTaskDelay(pdMS_TO_TICKS(120));
                }
                opus_encoder_->ResetState();
-#if CONFIG_USE_WAKE_WORD_DETECT || CONFIG_USE_WAKE_WORD_DETECT_NO_AFE
-                wake_word_detect_.StopDetection();
-#endif
                audio_processor_->Start();
+                wake_word_->StopDetection();
            }
            break;
        case kDeviceStateSpeaking:
@ -924,8 +944,11 @@ void Application::SetDeviceState(DeviceState state) {

            if (listening_mode_ != kListeningModeRealtime) {
                audio_processor_->Stop();
-#if CONFIG_USE_WAKE_WORD_DETECT || CONFIG_USE_WAKE_WORD_DETECT_NO_AFE
-                wake_word_detect_.StartDetection();
+                // Only AFE wake word can be detected in speaking mode
+#if CONFIG_USE_AFE_WAKE_WORD
+                wake_word_->StartDetection();
+#else
+                wake_word_->StopDetection();
 #endif
            }
            ResetDecoder();
--- a/main/application.h
+++ b/main/application.h
@ -21,12 +21,7 @@
 #include "ota.h"
 #include "background_task.h"
 #include "audio_processor.h"
-
-#if CONFIG_USE_WAKE_WORD_DETECT
-#include "wake_word_detect.h"
-#elif CONFIG_USE_WAKE_WORD_DETECT_NO_AFE
-#include "wake_word_no_afe.h"
-#endif
+#include "wake_word.h"

 #define SCHEDULE_EVENT (1 << 0)
 #define SEND_AUDIO_EVENT (1 << 1)
@ -83,14 +78,13 @@ public:
    void SendMcpMessage(const std::string& payload);
    void SetAecMode(AecMode mode);
    AecMode GetAecMode() const { return aec_mode_; }
+    BackgroundTask* GetBackgroundTask() const { return background_task_; }

 private:
    Application();
    ~Application();

-#if CONFIG_USE_WAKE_WORD_DETECT || CONFIG_USE_WAKE_WORD_DETECT_NO_AFE
-    WakeWordDetect wake_word_detect_;
-#endif
+    std::unique_ptr<WakeWord> wake_word_;
    std::unique_ptr<AudioProcessor> audio_processor_;
    Ota ota_;
    std::mutex mutex_;
@ -119,7 +113,6 @@ private:
    // 新增：用于维护音频包的timestamp队列
    std::list<uint32_t> timestamp_queue_;
    std::mutex timestamp_mutex_;
-    std::atomic<uint32_t> last_output_timestamp_ = 0;

    std::unique_ptr<OpusEncoderWrapper> opus_encoder_;
    std::unique_ptr<OpusDecoderWrapper> opus_decoder_;
@ -131,7 +124,7 @@ private:
    void MainEventLoop();
    void OnAudioInput();
    void OnAudioOutput();
-    void ReadAudio(std::vector<int16_t>& data, int sample_rate, int samples);
+    bool ReadAudio(std::vector<int16_t>& data, int sample_rate, int samples);
    void ResetDecoder();
    void SetDecodeSampleRate(int sample_rate, int frame_duration);
    void CheckNewVersion();
--- a/main/assets/common/popup.p3
+++ b/main/assets/common/popup.p3
--- a/main/audio_processing/afe_audio_processor.cc
+++ b/main/audio_processing/afe_audio_processor.cc
@ -3,7 +3,7 @@

 #define PROCESSOR_RUNNING 0x01

-static const char* TAG = "AfeAudioProcessor";
+#define TAG "AfeAudioProcessor"

 AfeAudioProcessor::AfeAudioProcessor()
    : afe_data_(nullptr) {
--- a/main/audio_processing/wake_word_detect.cc
+++ b/main/audio_processing/wake_word_detect.cc
@ -1,4 +1,4 @@
-#include "wake_word_detect.h"
+#include "afe_wake_word.h"
 #include "application.h"

 #include <esp_log.h>
@ -8,9 +8,9 @@

 #define DETECTION_RUNNING_EVENT 1

-static const char* TAG = "WakeWordDetect";
+#define TAG "AfeWakeWord"

-WakeWordDetect::WakeWordDetect()
+AfeWakeWord::AfeWakeWord()
    : afe_data_(nullptr),
      wake_word_pcm_(),
      wake_word_opus_() {
@ -18,7 +18,7 @@ WakeWordDetect::WakeWordDetect()
    event_group_ = xEventGroupCreate();
 }

-WakeWordDetect::~WakeWordDetect() {
+AfeWakeWord::~AfeWakeWord() {
    if (afe_data_ != nullptr) {
        afe_iface_->destroy(afe_data_);
    }
@ -30,7 +30,7 @@ WakeWordDetect::~WakeWordDetect() {
    vEventGroupDelete(event_group_);
 }

-void WakeWordDetect::Initialize(AudioCodec* codec) {
+void AfeWakeWord::Initialize(AudioCodec* codec) {
    codec_ = codec;
    int ref_num = codec_->input_reference() ? 1 : 0;

@ -67,46 +67,46 @@ void WakeWordDetect::Initialize(AudioCodec* codec) {
    afe_data_ = afe_iface_->create_from_config(afe_config);

    xTaskCreate([](void* arg) {
-        auto this_ = (WakeWordDetect*)arg;
+        auto this_ = (AfeWakeWord*)arg;
        this_->AudioDetectionTask();
        vTaskDelete(NULL);
    }, "audio_detection", 4096, this, 3, nullptr);
 }

-void WakeWordDetect::OnWakeWordDetected(std::function<void(const std::string& wake_word)> callback) {
+void AfeWakeWord::OnWakeWordDetected(std::function<void(const std::string& wake_word)> callback) {
    wake_word_detected_callback_ = callback;
 }

-void WakeWordDetect::StartDetection() {
+void AfeWakeWord::StartDetection() {
    xEventGroupSetBits(event_group_, DETECTION_RUNNING_EVENT);
 }

-void WakeWordDetect::StopDetection() {
+void AfeWakeWord::StopDetection() {
    xEventGroupClearBits(event_group_, DETECTION_RUNNING_EVENT);
    if (afe_data_ != nullptr) {
        afe_iface_->reset_buffer(afe_data_);
    }
 }

-bool WakeWordDetect::IsDetectionRunning() {
+bool AfeWakeWord::IsDetectionRunning() {
    return xEventGroupGetBits(event_group_) & DETECTION_RUNNING_EVENT;
 }

-void WakeWordDetect::Feed(const std::vector<int16_t>& data) {
+void AfeWakeWord::Feed(const std::vector<int16_t>& data) {
    if (afe_data_ == nullptr) {
        return;
    }
    afe_iface_->feed(afe_data_, data.data());
 }

-size_t WakeWordDetect::GetFeedSize() {
+size_t AfeWakeWord::GetFeedSize() {
    if (afe_data_ == nullptr) {
        return 0;
    }
    return afe_iface_->get_feed_chunksize(afe_data_) * codec_->input_channels();
 }

-void WakeWordDetect::AudioDetectionTask() {
+void AfeWakeWord::AudioDetectionTask() {
    auto fetch_size = afe_iface_->get_fetch_chunksize(afe_data_);
    auto feed_size = afe_iface_->get_feed_chunksize(afe_data_);
    ESP_LOGI(TAG, "Audio detection task started, feed size: %d fetch size: %d",
@ -121,7 +121,7 @@ void WakeWordDetect::AudioDetectionTask() {
        }

        // Store the wake word data for voice recognition, like who is speaking
-        StoreWakeWordData((uint16_t*)res->data, res->data_size / sizeof(uint16_t));
+        StoreWakeWordData(res->data, res->data_size / sizeof(int16_t));

        if (res->wakeup_state == WAKENET_DETECTED) {
            StopDetection();
@ -134,7 +134,7 @@ void WakeWordDetect::AudioDetectionTask() {
    }
 }

-void WakeWordDetect::StoreWakeWordData(uint16_t* data, size_t samples) {
+void AfeWakeWord::StoreWakeWordData(const int16_t* data, size_t samples) {
    // store audio data to wake_word_pcm_
    wake_word_pcm_.emplace_back(std::vector<int16_t>(data, data + samples));
    // keep about 2 seconds of data, detect duration is 30ms (sample_rate == 16000, chunksize == 512)
@ -143,13 +143,13 @@ void WakeWordDetect::StoreWakeWordData(uint16_t* data, size_t samples) {
    }
 }

-void WakeWordDetect::EncodeWakeWordData() {
+void AfeWakeWord::EncodeWakeWordData() {
    wake_word_opus_.clear();
    if (wake_word_encode_task_stack_ == nullptr) {
        wake_word_encode_task_stack_ = (StackType_t*)heap_caps_malloc(4096 * 8, MALLOC_CAP_SPIRAM);
    }
    wake_word_encode_task_ = xTaskCreateStatic([](void* arg) {
-        auto this_ = (WakeWordDetect*)arg;
+        auto this_ = (AfeWakeWord*)arg;
        {
            auto start_time = esp_timer_get_time();
            auto encoder = std::make_unique<OpusEncoderWrapper>(16000, 1, OPUS_FRAME_DURATION_MS);
@ -176,7 +176,7 @@ void WakeWordDetect::EncodeWakeWordData() {
    }, "encode_detect_packets", 4096 * 8, this, 2, wake_word_encode_task_stack_, &wake_word_encode_task_buffer_);
 }

-bool WakeWordDetect::GetWakeWordOpus(std::vector<uint8_t>& opus) {
+bool AfeWakeWord::GetWakeWordOpus(std::vector<uint8_t>& opus) {
    std::unique_lock<std::mutex> lock(wake_word_mutex_);
    wake_word_cv_.wait(lock, [this]() {
        return !wake_word_opus_.empty();
--- a/main/audio_processing/wake_word_detect.h
+++ b/main/audio_processing/wake_word_detect.h
@ -1,5 +1,5 @@
-#ifndef WAKE_WORD_DETECT_H
-#define WAKE_WORD_DETECT_H
+#ifndef AFE_WAKE_WORD_H
+#define AFE_WAKE_WORD_H

 #include <freertos/FreeRTOS.h>
 #include <freertos/task.h>
@ -16,11 +16,12 @@
 #include <condition_variable>

 #include "audio_codec.h"
+#include "wake_word.h"

-class WakeWordDetect {
+class AfeWakeWord : public WakeWord {
 public:
-    WakeWordDetect();
-    ~WakeWordDetect();
+    AfeWakeWord();
+    ~AfeWakeWord();

    void Initialize(AudioCodec* codec);
    void Feed(const std::vector<int16_t>& data);
@ -51,7 +52,7 @@ private:
    std::mutex wake_word_mutex_;
    std::condition_variable wake_word_cv_;

-    void StoreWakeWordData(uint16_t* data, size_t size);
+    void StoreWakeWordData(const int16_t* data, size_t size);
    void AudioDetectionTask();
 };

--- a/main/audio_processing/wake_word_no_afe.cc
+++ b/main/audio_processing/wake_word_no_afe.cc
@ -1,4 +1,4 @@
-#include "wake_word_no_afe.h"
+#include "esp_wake_word.h"
 #include "application.h"

 #include <esp_log.h>
@ -8,13 +8,13 @@

 #define DETECTION_RUNNING_EVENT 1

-static const char* TAG = "WakeWordDetect";
+#define TAG "EspWakeWord"

-WakeWordDetect::WakeWordDetect() {
+EspWakeWord::EspWakeWord() {
    event_group_ = xEventGroupCreate();
 }

-WakeWordDetect::~WakeWordDetect() {
+EspWakeWord::~EspWakeWord() {
    if (wakenet_data_ != nullptr) {
        wakenet_iface_->destroy(wakenet_data_);
        esp_srmodel_deinit(wakenet_model_);
@ -23,13 +23,16 @@ WakeWordDetect::~WakeWordDetect() {
    vEventGroupDelete(event_group_);
 }

-void WakeWordDetect::Initialize(AudioCodec* codec) {
+void EspWakeWord::Initialize(AudioCodec* codec) {
    codec_ = codec;

    wakenet_model_ = esp_srmodel_init("model");

    if(wakenet_model_->num > 1) {
        ESP_LOGW(TAG, "More than one model found, using the first one");
+    } else if (wakenet_model_->num == 0) {
+        ESP_LOGE(TAG, "No model found");
+        return;
    }
    char *model_name = wakenet_model_->model_name[0];
    wakenet_iface_ = (esp_wn_iface_t*)esp_wn_handle_from_name(model_name);
@ -40,28 +43,46 @@ void WakeWordDetect::Initialize(AudioCodec* codec) {
    ESP_LOGI(TAG, "Wake word(%s),freq: %d, chunksize: %d", model_name, frequency, audio_chunksize);
 }

-void WakeWordDetect::StartDetection() {
+void EspWakeWord::OnWakeWordDetected(std::function<void(const std::string& wake_word)> callback) {  // Registers the detection handler.
+    wake_word_detected_callback_ = callback;  // Overwrites any earlier handler; Feed() calls it with the detected word's name.
+}
+
+void EspWakeWord::StartDetection() {  // Marks detection as running by setting DETECTION_RUNNING_EVENT.
+    ESP_LOGI(TAG, "Start wake word detection");
    xEventGroupSetBits(event_group_, DETECTION_RUNNING_EVENT);
 }

-void WakeWordDetect::StopDetection() {
+void EspWakeWord::StopDetection() {  // Marks detection as stopped by clearing DETECTION_RUNNING_EVENT.
+    ESP_LOGI(TAG, "Stop wake word detection");
    xEventGroupClearBits(event_group_, DETECTION_RUNNING_EVENT);
 }

-bool WakeWordDetect::IsDetectionRunning() {
+bool EspWakeWord::IsDetectionRunning() {  // True while the DETECTION_RUNNING_EVENT bit is set in event_group_.
    return xEventGroupGetBits(event_group_) & DETECTION_RUNNING_EVENT;
 }

-void WakeWordDetect::Feed(const std::vector<int16_t>& data) {
+void EspWakeWord::Feed(const std::vector<int16_t>& data) {  // Runs the WakeNet detector on `data`; on a hit, stops detection and notifies the callback.
+    if (wakenet_data_ == nullptr) { return; }  // Initialize() can return early ("No model found"); there is no detector to run then.
    int res = wakenet_iface_->detect(wakenet_data_, (int16_t *)data.data());
    if (res > 0) {
-        ESP_LOGI(TAG, "Wake word detected");
-        auto& app = Application::GetInstance();
-        app.ToggleChatState();
+        StopDetection();  // Clear the running flag before the listener is notified.
+        last_detected_wake_word_ = wakenet_iface_->get_word_name(wakenet_data_, res);
+        if (wake_word_detected_callback_) {
+            wake_word_detected_callback_(last_detected_wake_word_);
+        }
    }
 }

-size_t WakeWordDetect::GetFeedSize() {
-
+size_t EspWakeWord::GetFeedSize() {  // WakeNet chunk size times codec input channels; presumably the sample count per Feed() call - TODO confirm.
+    if (wakenet_data_ == nullptr) {  // No WakeNet instance exists, e.g. Initialize() returned early.
+        return 0;
+    }
    return wakenet_iface_->get_samp_chunksize(wakenet_data_) * codec_->input_channels();
 }
+
+void EspWakeWord::EncodeWakeWordData() {  // Intentionally a no-op in this implementation.
+}
+
+bool EspWakeWord::GetWakeWordOpus(std::vector<uint8_t>& opus) {  // Never produces data; `opus` is left untouched.
+    return false;
+}
--- a/main/audio_processing/wake_word_no_afe.h
+++ b/main/audio_processing/wake_word_no_afe.h
@ -1,13 +1,13 @@
-#ifndef WAKE_WORD_DETECT_H
-#define WAKE_WORD_DETECT_H
+#ifndef ESP_WAKE_WORD_H
+#define ESP_WAKE_WORD_H

 #include <freertos/FreeRTOS.h>
 #include <freertos/task.h>
 #include <freertos/event_groups.h>

-#include "model_path.h"
-#include "esp_wn_iface.h"
-#include "esp_wn_models.h"
+#include <esp_wn_iface.h>
+#include <esp_wn_models.h>
+#include <model_path.h>

 #include <list>
 #include <string>
@ -17,19 +17,23 @@
 #include <condition_variable>

 #include "audio_codec.h"
-#include <model_path.h>
+#include "wake_word.h"

-class WakeWordDetect {
+class EspWakeWord : public WakeWord {
 public:
-    WakeWordDetect();
-    ~WakeWordDetect();
+    EspWakeWord();
+    ~EspWakeWord();

    void Initialize(AudioCodec* codec);
    void Feed(const std::vector<int16_t>& data);
+    void OnWakeWordDetected(std::function<void(const std::string& wake_word)> callback);
    void StartDetection();
    void StopDetection();
    bool IsDetectionRunning();
    size_t GetFeedSize();
+    void EncodeWakeWordData();
+    bool GetWakeWordOpus(std::vector<uint8_t>& opus);
+    const std::string& GetLastDetectedWakeWord() const { return last_detected_wake_word_; }

 private:
    esp_wn_iface_t *wakenet_iface_ = nullptr;
@ -37,6 +41,9 @@ private:
    srmodel_list_t *wakenet_model_ = nullptr;
    EventGroupHandle_t event_group_;
    AudioCodec* codec_ = nullptr;
+
+    std::function<void(const std::string& wake_word)> wake_word_detected_callback_;
+    std::string last_detected_wake_word_;
 };

 #endif
--- a/main/audio_processing/dummy_audio_processor.cc
+++ b/main/audio_processing/dummy_audio_processor.cc
@ -1,13 +1,13 @@
-#include "dummy_audio_processor.h"
+#include "no_audio_processor.h"
 #include <esp_log.h>

-#define TAG "DummyAudioProcessor"
+#define TAG "NoAudioProcessor"

-void DummyAudioProcessor::Initialize(AudioCodec* codec) {
+void NoAudioProcessor::Initialize(AudioCodec* codec) {  // Stores the codec pointer; GetFeedSize() later reads its input sample rate.
    codec_ = codec;
 }

-void DummyAudioProcessor::Feed(const std::vector<int16_t>& data) {
+void NoAudioProcessor::Feed(const std::vector<int16_t>& data) {
    if (!is_running_ || !output_callback_) {
        return;
    }
@ -15,27 +15,27 @@ void DummyAudioProcessor::Feed(const std::vector<int16_t>& data) {
    output_callback_(std::vector<int16_t>(data));
 }

-void DummyAudioProcessor::Start() {
+void NoAudioProcessor::Start() {  // Sets is_running_ so that Feed() forwards data to the output callback.
    is_running_ = true;
 }

-void DummyAudioProcessor::Stop() {
+void NoAudioProcessor::Stop() {  // Clears is_running_; Feed() returns early while it is false.
    is_running_ = false;
 }

-bool DummyAudioProcessor::IsRunning() {
+bool NoAudioProcessor::IsRunning() {  // Reports the flag toggled by Start() and Stop().
    return is_running_;
 }

-void DummyAudioProcessor::OnOutput(std::function<void(std::vector<int16_t>&& data)> callback) {
+void NoAudioProcessor::OnOutput(std::function<void(std::vector<int16_t>&& data)> callback) {  // Registers the callback that Feed() hands a copy of each input buffer to.
    output_callback_ = callback;
 }

-void DummyAudioProcessor::OnVadStateChange(std::function<void(bool speaking)> callback) {
+void NoAudioProcessor::OnVadStateChange(std::function<void(bool speaking)> callback) {  // Stores the callback; NOTE(review): no call site is visible in this diff - confirm whether it is ever invoked.
    vad_state_change_callback_ = callback;
 }

-size_t DummyAudioProcessor::GetFeedSize() {
+size_t NoAudioProcessor::GetFeedSize() {
    if (!codec_) {
        return 0;
    }
@ -43,7 +43,7 @@ size_t DummyAudioProcessor::GetFeedSize() {
    return 30 * codec_->input_sample_rate() / 1000;
 }

-void DummyAudioProcessor::EnableDeviceAec(bool enable) {
+void NoAudioProcessor::EnableDeviceAec(bool enable) {
    if (enable) {
        ESP_LOGE(TAG, "Device AEC is not supported");
    }
--- a/main/audio_processing/dummy_audio_processor.h
+++ b/main/audio_processing/dummy_audio_processor.h
@ -7,10 +7,10 @@
 #include "audio_processor.h"
 #include "audio_codec.h"

-class DummyAudioProcessor : public AudioProcessor {
+class NoAudioProcessor : public AudioProcessor {
 public:
-    DummyAudioProcessor() = default;
-    ~DummyAudioProcessor() = default;
+    NoAudioProcessor() = default;
+    ~NoAudioProcessor() = default;

    void Initialize(AudioCodec* codec) override;
    void Feed(const std::vector<int16_t>& data) override;
--- a/main/audio_processing/no_wake_word.cc
+++ b/main/audio_processing/no_wake_word.cc
@ -0,0 +1,51 @
+#include "no_wake_word.h"
+#include <esp_log.h>
+
+#define TAG "NoWakeWord"
+
+// Stand-in WakeWord implementation: every operation below is a no-op, so
+// detection never runs and no wake word is ever reported.
+
+void NoWakeWord::Initialize(AudioCodec* codec) {
+    codec_ = codec;  // Stored only; nothing in this file reads it back
+}
+
+void NoWakeWord::Feed(const std::vector<int16_t>& data) {
+    // Do nothing - no wake word processing
+}
+
+void NoWakeWord::OnWakeWordDetected(std::function<void(const std::string& wake_word)> callback) {
+    // Do nothing - the callback is dropped because nothing is ever detected
+}
+
+void NoWakeWord::StartDetection() {
+    // Do nothing - no wake word processing
+}
+
+void NoWakeWord::StopDetection() {
+    // Do nothing - no wake word processing
+}
+
+bool NoWakeWord::IsDetectionRunning() {
+    return false;  // No wake word processing
+}
+
+size_t NoWakeWord::GetFeedSize() {
+    return 0;  // No specific feed size requirement
+}
+
+void NoWakeWord::EncodeWakeWordData() {
+    // Do nothing - no encoding needed
+}
+
+bool NoWakeWord::GetWakeWordOpus(std::vector<uint8_t>& opus) {
+    opus.clear();  // Leave the caller's buffer empty
+    return false;  // No opus data available
+}
+
+const std::string& NoWakeWord::GetLastDetectedWakeWord() const {
+    // The returned reference must outlive this call. `return "";` would bind it
+    // to a temporary std::string that is destroyed on return (dangling reference).
+    static const std::string kNoWakeWord;
+    return kNoWakeWord;  // Always empty - no wake word detected
+}
--- a/main/audio_processing/no_wake_word.h
+++ b/main/audio_processing/no_wake_word.h
@ -0,0 +1,31 @@
+#ifndef NO_WAKE_WORD_H
+#define NO_WAKE_WORD_H
+
+#include <vector>
+#include <functional>
+#include <string>
+
+#include "wake_word.h"
+#include "audio_codec.h"
+
+class NoWakeWord : public WakeWord {  // WakeWord implementation that performs no wake word detection.
+public:
+    NoWakeWord() = default;
+    ~NoWakeWord() = default;
+
+    void Initialize(AudioCodec* codec) override;  // Stores the codec pointer in codec_.
+    void Feed(const std::vector<int16_t>& data) override;  // Ignores the data.
+    void OnWakeWordDetected(std::function<void(const std::string& wake_word)> callback) override;  // Ignores the callback.
+    void StartDetection() override;  // No-op.
+    void StopDetection() override;  // No-op.
+    bool IsDetectionRunning() override;  // Always false.
+    size_t GetFeedSize() override;  // Always 0.
+    void EncodeWakeWordData() override;  // No-op.
+    bool GetWakeWordOpus(std::vector<uint8_t>& opus) override;  // Clears `opus` and returns false.
+    const std::string& GetLastDetectedWakeWord() const override;  // No wake word is ever detected by this class.
+
+private:
+    AudioCodec* codec_ = nullptr;  // Set by Initialize().
+};
+
+#endif 
--- a/main/audio_processing/wake_word.h
+++ b/main/audio_processing/wake_word.h
@ -0,0 +1,26 @@
+#ifndef WAKE_WORD_H
+#define WAKE_WORD_H
+
+#include <string>
+#include <vector>
+#include <functional>
+
+#include "audio_codec.h"
+
+class WakeWord {  // Abstract interface implemented by the wake word back ends (e.g. EspWakeWord, NoWakeWord).
+public:
+    virtual ~WakeWord() = default;
+    
+    virtual void Initialize(AudioCodec* codec) = 0;  // Hands the implementation the audio codec to work with.
+    virtual void Feed(const std::vector<int16_t>& data) = 0;  // Supplies 16-bit samples for detection.
+    virtual void OnWakeWordDetected(std::function<void(const std::string& wake_word)> callback) = 0;  // Registers the detection callback.
+    virtual void StartDetection() = 0;
+    virtual void StopDetection() = 0;
+    virtual bool IsDetectionRunning() = 0;
+    virtual size_t GetFeedSize() = 0;  // Presumably the sample count expected per Feed() call - TODO confirm.
+    virtual void EncodeWakeWordData() = 0;  // NOTE(review): semantics not visible here; EspWakeWord and NoWakeWord leave it empty.
+    virtual bool GetWakeWordOpus(std::vector<uint8_t>& opus) = 0;  // EspWakeWord and NoWakeWord always return false.
+    virtual const std::string& GetLastDetectedWakeWord() const = 0;  // Returns a reference, so implementations must return a string that outlives the call.
+};
+
+#endif
--- a/main/boards/esp-hi/config.json
+++ b/main/boards/esp-hi/config.json
@ -30,7 +30,8 @@
                "CONFIG_MBEDTLS_DYNAMIC_FREE_CONFIG_DATA=y",
                "CONFIG_NEWLIB_NANO_FORMAT=y",
                "CONFIG_MMAP_FILE_NAME_LENGTH=25",
-                "CONFIG_ESP_CONSOLE_NONE=y"
+                "CONFIG_ESP_CONSOLE_NONE=y",
+                "CONFIG_IOT_PROTOCOL_XIAOZHI=y"
            ]
        }
    ]
--- a/main/boards/genjutech-s3-1.54tft/genjutech-s3-1.54tft.cc
+++ b/main/boards/genjutech-s3-1.54tft/genjutech-s3-1.54tft.cc
@ -70,7 +70,7 @@ private:
    }

    void InitializePowerSaveTimer() {
-        power_save_timer_ = new PowerSaveTimer(160, 60);
+        power_save_timer_ = new PowerSaveTimer(240, 60);
        power_save_timer_->OnEnterSleepMode([this]() {
            ESP_LOGI(TAG, "Enabling sleep mode");
            auto display = GetDisplay();
--- a/main/boards/lichuang-c3-dev/config.json
+++ b/main/boards/lichuang-c3-dev/config.json
@ -5,7 +5,9 @@
            "name": "lichuang-c3-dev",
            "sdkconfig_append": [
                "CONFIG_ESPTOOLPY_FLASHSIZE_8MB=y",
-                "CONFIG_PARTITION_TABLE_CUSTOM_FILENAME=\"partitions_8M.csv\""
+                "CONFIG_PARTITION_TABLE_CUSTOM_FILENAME=\"partitions_8M.csv\"",
+                "CONFIG_ESP_WIFI_ENTERPRISE_SUPPORT=n",
+                "CONFIG_LWIP_IPV6=n"
            ]
        }
    ]
--- a/main/boards/magiclick-c3-v2/config.json
+++ b/main/boards/magiclick-c3-v2/config.json
@ -5,7 +5,8 @@
            "name": "magiclick-c3-v2",
            "sdkconfig_append": [
                "CONFIG_PM_ENABLE=y",
-                "CONFIG_FREERTOS_USE_TICKLESS_IDLE=y"
+                "CONFIG_FREERTOS_USE_TICKLESS_IDLE=y",
+                "CONFIG_USE_ESP_WAKE_WORD=n"
            ]
        }
    ]
--- a/main/boards/magiclick-c3/config.json
+++ b/main/boards/magiclick-c3/config.json
@ -5,7 +5,8 @@
            "name": "magiclick-c3",
            "sdkconfig_append": [
                "CONFIG_PM_ENABLE=y",
-                "CONFIG_FREERTOS_USE_TICKLESS_IDLE=y"
+                "CONFIG_FREERTOS_USE_TICKLESS_IDLE=y",
+                "CONFIG_USE_ESP_WAKE_WORD=n"
            ]
        }
    ]
--- a/main/boards/xmini-c3/config.json
+++ b/main/boards/xmini-c3/config.json
@ -5,7 +5,8 @@
            "name": "xmini-c3",
            "sdkconfig_append": [
                "CONFIG_PM_ENABLE=y",
-                "CONFIG_FREERTOS_USE_TICKLESS_IDLE=y"
+                "CONFIG_FREERTOS_USE_TICKLESS_IDLE=y",
+                "CONFIG_USE_ESP_WAKE_WORD=y"
            ]
        }
    ]
--- a/main/boards/xmini-c3/xmini_c3_board.cc
+++ b/main/boards/xmini-c3/xmini_c3_board.cc
@ -30,10 +30,10 @@ private:
    Display* display_ = nullptr;
    Button boot_button_;
    bool press_to_talk_enabled_ = false;
-    PowerSaveTimer* power_save_timer_;
+    PowerSaveTimer* power_save_timer_ = nullptr;

    void InitializePowerSaveTimer() {
-        power_save_timer_ = new PowerSaveTimer(160, 60);
+        power_save_timer_ = new PowerSaveTimer(160, 600);
        power_save_timer_->OnEnterSleepMode([this]() {
            ESP_LOGI(TAG, "Enabling sleep mode");
            auto display = GetDisplay();
@ -130,7 +130,9 @@ private:
            }
        });
        boot_button_.OnPressDown([this]() {
-            power_save_timer_->WakeUp();
+            if (power_save_timer_) {
+                power_save_timer_->WakeUp();
+            }
            if (press_to_talk_enabled_) {
                Application::GetInstance().StartListening();
            }
--- a/main/protocols/mqtt_protocol.cc
+++ b/main/protocols/mqtt_protocol.cc
@ -227,6 +227,8 @@ bool MqttProtocol::OpenAudioChannel() {
        auto nonce = (uint8_t*)data.data();
        auto encrypted = (uint8_t*)data.data() + aes_nonce_.size();
        AudioStreamPacket packet;
+        packet.sample_rate = server_sample_rate_;
+        packet.frame_duration = server_frame_duration_;
        packet.timestamp = timestamp;
        packet.payload.resize(decrypted_size);
        int ret = mbedtls_aes_crypt_ctr(&aes_ctx_, decrypted_size, &nc_off, nonce, stream_block, encrypted, (uint8_t*)packet.payload.data());
--- a/main/protocols/protocol.h
+++ b/main/protocols/protocol.h
@ -8,6 +8,8 @@
 #include <vector>

 struct AudioStreamPacket {
+    int sample_rate = 0;  // Set by the MQTT/WebSocket protocols from server_sample_rate_; presumably Hz - TODO confirm.
+    int frame_duration = 0;  // Set by the MQTT/WebSocket protocols from server_frame_duration_; presumably ms - TODO confirm.
    uint32_t timestamp = 0;
    std::vector<uint8_t> payload;
 };
--- a/main/protocols/websocket_protocol.cc
+++ b/main/protocols/websocket_protocol.cc
@ -124,6 +124,8 @@ bool WebsocketProtocol::OpenAudioChannel() {
                    bp2->payload_size = ntohl(bp2->payload_size);
                    auto payload = (uint8_t*)bp2->payload;
                    on_incoming_audio_(AudioStreamPacket{
+                        .sample_rate = server_sample_rate_,
+                        .frame_duration = server_frame_duration_,
                        .timestamp = bp2->timestamp,
                        .payload = std::vector<uint8_t>(payload, payload + bp2->payload_size)
                    });
@ -133,11 +135,15 @@ bool WebsocketProtocol::OpenAudioChannel() {
                    bp3->payload_size = ntohs(bp3->payload_size);
                    auto payload = (uint8_t*)bp3->payload;
                    on_incoming_audio_(AudioStreamPacket{
+                        .sample_rate = server_sample_rate_,
+                        .frame_duration = server_frame_duration_,
                        .timestamp = 0,
                        .payload = std::vector<uint8_t>(payload, payload + bp3->payload_size)
                    });
                } else {
                    on_incoming_audio_(AudioStreamPacket{
+                        .sample_rate = server_sample_rate_,
+                        .frame_duration = server_frame_duration_,
                        .timestamp = 0,
                        .payload = std::vector<uint8_t>((uint8_t*)data, (uint8_t*)data + len)
                    });
--- a/sdkconfig.defaults.esp32c3
+++ b/sdkconfig.defaults.esp32c3
@ -1,2 +1,3 @@

 CONFIG_ESPTOOLPY_FLASHSIZE_16MB=y
+CONFIG_SR_WN_WN9S_NIHAOXIAOZHI=y