ES8311 + ES7210

2024-10-24 09:53:08 +08:00 · 2024-10-24 09:53:08 +08:00 · a2487f46c8
commit a2487f46c8
parent 4c6da771ec
17 changed files with 3306 additions and 165 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -4,7 +4,7 @@
 # CMakeLists in this exact order for cmake to work correctly
 cmake_minimum_required(VERSION 3.16)
-set(PROJECT_VER "0.3.3")
+set(PROJECT_VER "0.4.0")
 include($ENV{IDF_PATH}/tools/cmake/project.cmake)
 project(xiaozhi)
--- a/main/Application.cc
+++ b/main/Application.cc
@ -18,25 +18,28 @@
 Application::Application()
    : boot_button_((gpio_num_t)CONFIG_BOOT_BUTTON_GPIO),
-      volume_up_button_((gpio_num_t)CONFIG_VOLUME_UP_BUTTON_GPIO)
+      volume_up_button_((gpio_num_t)CONFIG_VOLUME_UP_BUTTON_GPIO),
-#ifdef CONFIG_USE_ML307
+      volume_down_button_((gpio_num_t)CONFIG_VOLUME_DOWN_BUTTON_GPIO),
    , ml307_at_modem_(CONFIG_ML307_TX_PIN, CONFIG_ML307_RX_PIN, 4096),
      http_(ml307_at_modem_),
      firmware_upgrade_(http_)
 #else
    , http_(),
    firmware_upgrade_(http_)
 #endif
 #ifdef CONFIG_USE_DISPLAY
-    , display_(CONFIG_DISPLAY_SDA_PIN, CONFIG_DISPLAY_SCL_PIN)
+      display_(CONFIG_DISPLAY_SDA_PIN, CONFIG_DISPLAY_SCL_PIN),
 #endif
 #ifdef CONFIG_USE_ML307
      ml307_at_modem_(CONFIG_ML307_TX_PIN, CONFIG_ML307_RX_PIN, 4096),
      http_(ml307_at_modem_),
 #else
      http_(),
 #endif
      firmware_upgrade_(http_)
 {
    event_group_ = xEventGroupCreate();
-    opus_encoder_.Configure(CONFIG_AUDIO_INPUT_SAMPLE_RATE, 1);
+    opus_encoder_.Configure(16000, 1);
    opus_decoder_ = opus_decoder_create(opus_decode_sample_rate_, 1, NULL);
    if (opus_decode_sample_rate_ != CONFIG_AUDIO_OUTPUT_SAMPLE_RATE) {
-        opus_resampler_.Configure(opus_decode_sample_rate_, CONFIG_AUDIO_OUTPUT_SAMPLE_RATE);
+        output_resampler_.Configure(CONFIG_AUDIO_OUTPUT_SAMPLE_RATE, opus_decode_sample_rate_);
    }
    if (16000 != CONFIG_AUDIO_INPUT_SAMPLE_RATE) {
        input_resampler_.Configure(CONFIG_AUDIO_INPUT_SAMPLE_RATE, 16000);
    }
    firmware_upgrade_.SetCheckVersionUrl(CONFIG_OTA_VERSION_URL);
@ -185,29 +188,49 @@ void Application::Start() {
    }
 #endif
-    audio_device_.OnInputData([this](const int16_t* data, int size) {
+    audio_device_.Initialize();
    audio_device_.OnInputData([this](std::vector<int16_t>&& data) {
        if (16000 != CONFIG_AUDIO_INPUT_SAMPLE_RATE) {
            if (audio_device_.input_channels() == 2) {
                auto left_channel = std::vector<int16_t>(data.size() / 2);
                auto right_channel = std::vector<int16_t>(data.size() / 2);
                for (size_t i = 0, j = 0; i < left_channel.size(); ++i, j += 2) {
                    left_channel[i] = data[j];
                    right_channel[i] = data[j + 1];
                }
                auto resampled_left = std::vector<int16_t>(input_resampler_.GetOutputSamples(left_channel.size()));
                auto resampled_right = std::vector<int16_t>(input_resampler_.GetOutputSamples(right_channel.size()));
                input_resampler_.Process(left_channel.data(), left_channel.size(), resampled_left.data());
                input_resampler_.Process(right_channel.data(), right_channel.size(), resampled_right.data());
                data.resize(resampled_left.size() + resampled_right.size());
                for (size_t i = 0, j = 0; i < resampled_left.size(); ++i, j += 2) {
                    data[j] = resampled_left[i];
                    data[j + 1] = resampled_right[i];
                }
            } else {
                auto resampled = std::vector<int16_t>(input_resampler_.GetOutputSamples(data.size()));
                input_resampler_.Process(data.data(), data.size(), resampled.data());
                data = std::move(resampled);
            }
        }
 #ifdef CONFIG_USE_AFE_SR
        if (audio_processor_.IsRunning()) {
-            audio_processor_.Input(data, size);
+            audio_processor_.Input(data);
        }
        if (wake_word_detect_.IsDetectionRunning()) {
-            wake_word_detect_.Feed(data, size);
+            wake_word_detect_.Feed(data);
        }
 #else
-        std::vector<int16_t> pcm(data, data + size);
+        Schedule([this, data = std::move(data)]() {
        Schedule([this, pcm = std::move(pcm)]() {
            if (chat_state_ == kChatStateListening) {
                std::lock_guard<std::mutex> lock(mutex_);
-                audio_encode_queue_.emplace_back(std::move(pcm));
+                audio_encode_queue_.emplace_back(std::move(data));
                cv_.notify_all();
            }
        });
 #endif
    });
    // Initialize the audio device
    audio_device_.Start(CONFIG_AUDIO_INPUT_SAMPLE_RATE, CONFIG_AUDIO_OUTPUT_SAMPLE_RATE);
    // OPUS encoder / decoder use a lot of stack memory
    const size_t opus_stack_size = 4096 * 8;
    audio_encode_task_stack_ = (StackType_t*)malloc(opus_stack_size);
@ -221,9 +244,10 @@ void Application::Start() {
        Application* app = (Application*)arg;
        app->AudioPlayTask();
        vTaskDelete(NULL);
-    }, "play_audio", 4096 * 4, this, 5, NULL);
+    }, "play_audio", 4096 * 4, this, 4, NULL);
 #ifdef CONFIG_USE_AFE_SR
    wake_word_detect_.Initialize(audio_device_.input_channels(), audio_device_.input_reference());
    wake_word_detect_.OnVadStateChange([this](bool speaking) {
        Schedule([this, speaking]() {
            auto& builtin_led = BuiltinLed::GetInstance();
@ -272,6 +296,7 @@ void Application::Start() {
    });
    wake_word_detect_.StartDetection();
    audio_processor_.Initialize(audio_device_.input_channels(), audio_device_.input_reference());
    audio_processor_.OnOutput([this](std::vector<int16_t>&& data) {
        Schedule([this, data = std::move(data)]() {
            if (chat_state_ == kChatStateListening) {
@ -317,7 +342,7 @@ void Application::Start() {
        Schedule([this]() {
            auto volume = audio_device_.output_volume() + 10;
            if (volume > 100) {
-                volume = 0;
+                volume = 100;
            }
            audio_device_.SetOutputVolume(volume);
 #ifdef CONFIG_USE_DISPLAY
@ -327,6 +352,28 @@ void Application::Start() {
    });
    volume_up_button_.OnLongPress([this]() {
        Schedule([this]() {
            audio_device_.SetOutputVolume(100);
 #ifdef CONFIG_USE_DISPLAY
            display_.ShowNotification("Volume\n100");
 #endif
        });
    });
    volume_down_button_.OnClick([this]() {
        Schedule([this]() {
            auto volume = audio_device_.output_volume() - 10;
            if (volume < 0) {
                volume = 0;
            }
            audio_device_.SetOutputVolume(volume);
 #ifdef CONFIG_USE_DISPLAY
            display_.ShowNotification("Volume\n" + std::to_string(volume));
 #endif
        });
    });
    volume_down_button_.OnLongPress([this]() {
        Schedule([this]() {
            audio_device_.SetOutputVolume(0);
 #ifdef CONFIG_USE_DISPLAY
@ -449,10 +496,12 @@ BinaryProtocol* Application::AllocateBinaryProtocol(const uint8_t* payload, size
 void Application::AudioEncodeTask() {
    ESP_LOGI(TAG, "Audio encode task started");
    const int max_audio_play_queue_size_ = 2;
    while (true) {
        std::unique_lock<std::mutex> lock(mutex_);
        cv_.wait(lock, [this]() {
-            return !audio_encode_queue_.empty() || !audio_decode_queue_.empty();
+            return !audio_encode_queue_.empty() || (!audio_decode_queue_.empty() && audio_play_queue_.size() < max_audio_play_queue_size_);
        });
        if (!audio_encode_queue_.empty()) {
@ -488,9 +537,9 @@ void Application::AudioEncodeTask() {
            }
            if (opus_decode_sample_rate_ != CONFIG_AUDIO_OUTPUT_SAMPLE_RATE) {
-                int target_size = opus_resampler_.GetOutputSamples(frame_size);
+                int target_size = output_resampler_.GetOutputSamples(frame_size);
                std::vector<int16_t> resampled(target_size);
-                opus_resampler_.Process(packet->pcm.data(), frame_size, resampled.data());
+                output_resampler_.Process(packet->pcm.data(), frame_size, resampled.data());
                packet->pcm = std::move(resampled);
            }
@ -513,7 +562,6 @@ void Application::HandleAudioPacket(AudioPacket* packet) {
        audio_device_.OutputData(packet->pcm);
        if (break_speaking_) {
            break_speaking_ = false;
            skip_to_end_ = true;
            // Play a silence and skip to the end
@ -525,12 +573,13 @@ void Application::HandleAudioPacket(AudioPacket* packet) {
        break;
    }
    case kAudioPacketTypeStart:
        break_speaking_ = false;
        skip_to_end_ = false;
        Schedule([this]() {
            SetChatState(kChatStateSpeaking);
        });
        break;
    case kAudioPacketTypeStop:
        skip_to_end_ = false;
        Schedule([this]() {
            SetChatState(kChatStateListening);
        });
@ -558,6 +607,7 @@ void Application::AudioPlayTask() {
        });
        auto packet = std::move(audio_play_queue_.front());
        audio_play_queue_.pop_front();
        cv_.notify_all();
        lock.unlock();
        HandleAudioPacket(packet);
@ -574,7 +624,7 @@ void Application::SetDecodeSampleRate(int sample_rate) {
    opus_decoder_ = opus_decoder_create(opus_decode_sample_rate_, 1, NULL);
    if (opus_decode_sample_rate_ != CONFIG_AUDIO_OUTPUT_SAMPLE_RATE) {
        ESP_LOGI(TAG, "Resampling audio from %d to %d", opus_decode_sample_rate_, CONFIG_AUDIO_OUTPUT_SAMPLE_RATE);
-        opus_resampler_.Configure(opus_decode_sample_rate_, CONFIG_AUDIO_OUTPUT_SAMPLE_RATE);
+        output_resampler_.Configure(opus_decode_sample_rate_, CONFIG_AUDIO_OUTPUT_SAMPLE_RATE);
    }
 }
@ -607,7 +657,7 @@ void Application::StartWebSocketClient() {
        std::string message = "{";
        message += "\"type\":\"hello\",";
        message += "\"audio_params\":{";
-        message += "\"format\":\"opus\", \"sample_rate\":" + std::to_string(CONFIG_AUDIO_INPUT_SAMPLE_RATE) + ", \"channels\":1";
+        message += "\"format\":\"opus\", \"sample_rate\":16000, \"channels\":1";
        message += "}}";
        ws_client_->Send(message);
    });
@ -640,6 +690,10 @@ void Application::StartWebSocketClient() {
                        if (sample_rate != NULL) {
                            SetDecodeSampleRate(sample_rate->valueint);
                        }
                        // If the device is speaking, we need to break the speaking
                        break_speaking_ = true;
                        skip_to_end_ = true;
                    } else if (strcmp(state->valuestring, "stop") == 0) {
                        packet->type = kAudioPacketTypeStop;
                    } else if (strcmp(state->valuestring, "sentence_end") == 0) {
--- a/main/Application.h
+++ b/main/Application.h
@ -1,7 +1,6 @@
 #ifndef _APPLICATION_H_
 #define _APPLICATION_H_
 #include "AudioDevice.h"
 #include <OpusEncoder.h>
 #include <OpusResampler.h>
 #include <WebSocket.h>
@ -17,6 +16,7 @@
 #include <list>
 #include <condition_variable>
 #include "BoxAudioDevice.h"
 #include "Display.h"
 #include "FirmwareUpgrade.h"
@ -86,7 +86,15 @@ private:
    Button boot_button_;
    Button volume_up_button_;
    Button volume_down_button_;
 #ifdef CONFIG_AUDIO_CODEC_ES8311_ES7210
    BoxAudioDevice audio_device_;
 #else
    AudioDevice audio_device_;
 #endif
 #ifdef CONFIG_USE_DISPLAY
    Display display_;
 #endif
 #ifdef CONFIG_USE_AFE_SR
    WakeWordDetect wake_word_detect_;
    AudioProcessor audio_processor_;
@ -98,9 +106,6 @@ private:
    EspHttp http_;
 #endif
    FirmwareUpgrade firmware_upgrade_;
 #ifdef CONFIG_USE_DISPLAY
    Display display_;
 #endif
    std::mutex mutex_;
    std::condition_variable_any cv_;
    std::list<std::function<void()>> main_tasks_;
@ -123,7 +128,8 @@ private:
    int opus_duration_ms_ = 60;
    int opus_decode_sample_rate_ = CONFIG_AUDIO_OUTPUT_SAMPLE_RATE;
-    OpusResampler opus_resampler_;
+    OpusResampler input_resampler_;
    OpusResampler output_resampler_;
    TaskHandle_t check_new_version_task_ = nullptr;
    StaticTask_t check_new_version_task_buffer_;
--- a/main/AudioDevice.cc
+++ b/main/AudioDevice.cc
@ -4,7 +4,9 @@
 #include <cmath>
 #define TAG "AudioDevice"
-AudioDevice::AudioDevice() {
+AudioDevice::AudioDevice()
    : input_sample_rate_(CONFIG_AUDIO_INPUT_SAMPLE_RATE),
      output_sample_rate_(CONFIG_AUDIO_OUTPUT_SAMPLE_RATE) {
 }
 AudioDevice::~AudioDevice() {
@ -19,26 +21,16 @@ AudioDevice::~AudioDevice() {
    }
 }
-void AudioDevice::Start(int input_sample_rate, int output_sample_rate) {
+void AudioDevice::Initialize() {
-    input_sample_rate_ = input_sample_rate;
+#ifdef CONFIG_AUDIO_I2S_METHOD_SIMPLEX
    output_sample_rate_ = output_sample_rate;
 #ifdef CONFIG_AUDIO_DEVICE_I2S_SIMPLEX
    CreateSimplexChannels();
 #else
    CreateDuplexChannels();
 #endif
    ESP_ERROR_CHECK(i2s_channel_enable(tx_handle_));
    ESP_ERROR_CHECK(i2s_channel_enable(rx_handle_));
    xTaskCreate([](void* arg) {
        auto audio_device = (AudioDevice*)arg;
        audio_device->InputTask();
    }, "audio_input", 4096 * 2, this, 5, &audio_input_task_);
 }
 void AudioDevice::CreateDuplexChannels() {
 #ifdef CONFIG_AUDIO_I2S_METHOD_DUPLEX
    duplex_ = true;
    i2s_chan_config_t chan_cfg = {
@ -73,10 +65,10 @@ void AudioDevice::CreateDuplexChannels() {
        },
        .gpio_cfg = {
            .mclk = I2S_GPIO_UNUSED,
-            .bclk = (gpio_num_t)CONFIG_AUDIO_DEVICE_I2S_MIC_GPIO_BCLK,
+            .bclk = (gpio_num_t)CONFIG_AUDIO_DEVICE_I2S_GPIO_BCLK,
-            .ws = (gpio_num_t)CONFIG_AUDIO_DEVICE_I2S_MIC_GPIO_WS,
+            .ws = (gpio_num_t)CONFIG_AUDIO_DEVICE_I2S_GPIO_LRCK,
-            .dout = (gpio_num_t)CONFIG_AUDIO_DEVICE_I2S_SPK_GPIO_DOUT,
+            .dout = (gpio_num_t)CONFIG_AUDIO_DEVICE_I2S_GPIO_DOUT,
-            .din = (gpio_num_t)CONFIG_AUDIO_DEVICE_I2S_MIC_GPIO_DIN,
+            .din = (gpio_num_t)CONFIG_AUDIO_DEVICE_I2S_GPIO_DIN,
            .invert_flags = {
                .mclk_inv = false,
                .bclk_inv = false,
@ -86,11 +78,14 @@ void AudioDevice::CreateDuplexChannels() {
    };
    ESP_ERROR_CHECK(i2s_channel_init_std_mode(tx_handle_, &std_cfg));
    ESP_ERROR_CHECK(i2s_channel_init_std_mode(rx_handle_, &std_cfg));
    ESP_ERROR_CHECK(i2s_channel_enable(tx_handle_));
    ESP_ERROR_CHECK(i2s_channel_enable(rx_handle_));
    ESP_LOGI(TAG, "Duplex channels created");
 #endif
 }
 #ifdef CONFIG_AUDIO_DEVICE_I2S_SIMPLEX
 void AudioDevice::CreateSimplexChannels() {
 #ifdef CONFIG_AUDIO_I2S_METHOD_SIMPLEX
    // Create a new channel for speaker
    i2s_chan_config_t chan_cfg = {
        .id = I2S_NUM_0,
@ -125,7 +120,7 @@ void AudioDevice::CreateSimplexChannels() {
        .gpio_cfg = {
            .mclk = I2S_GPIO_UNUSED,
            .bclk = (gpio_num_t)CONFIG_AUDIO_DEVICE_I2S_SPK_GPIO_BCLK,
-            .ws = (gpio_num_t)CONFIG_AUDIO_DEVICE_I2S_SPK_GPIO_WS,
+            .ws = (gpio_num_t)CONFIG_AUDIO_DEVICE_I2S_SPK_GPIO_LRCK,
            .dout = (gpio_num_t)CONFIG_AUDIO_DEVICE_I2S_SPK_GPIO_DOUT,
            .din = I2S_GPIO_UNUSED,
            .invert_flags = {
@ -141,16 +136,19 @@ void AudioDevice::CreateSimplexChannels() {
    chan_cfg.id = I2S_NUM_1;
    ESP_ERROR_CHECK(i2s_new_channel(&chan_cfg, nullptr, &rx_handle_));
    std_cfg.clk_cfg.sample_rate_hz = (uint32_t)input_sample_rate_;
-    std_cfg.gpio_cfg.bclk = (gpio_num_t)CONFIG_AUDIO_DEVICE_I2S_MIC_GPIO_BCLK;
+    std_cfg.gpio_cfg.bclk = (gpio_num_t)CONFIG_AUDIO_DEVICE_I2S_MIC_GPIO_SCK;
    std_cfg.gpio_cfg.ws = (gpio_num_t)CONFIG_AUDIO_DEVICE_I2S_MIC_GPIO_WS;
    std_cfg.gpio_cfg.dout = I2S_GPIO_UNUSED;
    std_cfg.gpio_cfg.din = (gpio_num_t)CONFIG_AUDIO_DEVICE_I2S_MIC_GPIO_DIN;
    ESP_ERROR_CHECK(i2s_channel_init_std_mode(rx_handle_, &std_cfg));
    ESP_LOGI(TAG, "Simplex channels created");
 }
 #endif
-void AudioDevice::Write(const int16_t* data, int samples) {
+    ESP_ERROR_CHECK(i2s_channel_enable(tx_handle_));
    ESP_ERROR_CHECK(i2s_channel_enable(rx_handle_));
    ESP_LOGI(TAG, "Simplex channels created");
 #endif
 }
 int AudioDevice::Write(const int16_t* data, int samples) {
    int32_t buffer[samples];
    // output_volume_: 0-100
@ -162,6 +160,7 @@ void AudioDevice::Write(const int16_t* data, int samples) {
    size_t bytes_written;
    ESP_ERROR_CHECK(i2s_channel_write(tx_handle_, buffer, samples * sizeof(int32_t), &bytes_written, portMAX_DELAY));
    return bytes_written / sizeof(int32_t);
 }
 int AudioDevice::Read(int16_t* dest, int samples) {
@ -181,8 +180,16 @@ int AudioDevice::Read(int16_t* dest, int samples) {
    return samples;
 }
-void AudioDevice::OnInputData(std::function<void(const int16_t*, int)> callback) {
+void AudioDevice::OnInputData(std::function<void(std::vector<int16_t>&& data)> callback) {
    on_input_data_ = callback;
    // 创建音频输入任务
    if (audio_input_task_ == nullptr) {
        xTaskCreate([](void* arg) {
            auto audio_device = (AudioDevice*)arg;
            audio_device->InputTask();
        }, "audio_input", 4096 * 2, this, 3, &audio_input_task_);
    }
 }
 void AudioDevice::OutputData(std::vector<int16_t>& data) {
@ -191,12 +198,14 @@ void AudioDevice::OutputData(std::vector<int16_t>& data) {
 void AudioDevice::InputTask() {
    int duration = 30;
-    int input_frame_size = input_sample_rate_ / 1000 * duration;
+    int input_frame_size = input_sample_rate_ / 1000 * duration * input_channels_;
    int16_t input_buffer[input_frame_size];
    while (true) {
-        int samples = Read(input_buffer, input_frame_size);
+        std::vector<int16_t> input_data(input_frame_size);
        int samples = Read(input_data.data(), input_data.size());
        if (samples > 0) {
-            on_input_data_(input_buffer, samples);
+            if (on_input_data_) {
                on_input_data_(std::move(input_data));
            }
        }
    }
 }
--- a/main/AudioDevice.h
+++ b/main/AudioDevice.h
@ -2,7 +2,6 @@
 #define _AUDIO_DEVICE_H
 #include <freertos/FreeRTOS.h>
 #include <freertos/event_groups.h>
 #include <driver/i2s_std.h>
 #include <vector>
@ -12,35 +11,42 @@
 class AudioDevice {
 public:
    AudioDevice();
-    ~AudioDevice();
+    virtual ~AudioDevice();
    virtual void Initialize();
-    void Start(int input_sample_rate, int output_sample_rate);
+    void OnInputData(std::function<void(std::vector<int16_t>&& data)> callback);
    void OnInputData(std::function<void(const int16_t*, int)> callback);
    void OutputData(std::vector<int16_t>& data);
-    void SetOutputVolume(int volume);
+    virtual void SetOutputVolume(int volume);
    inline bool duplex() const { return duplex_; }
    inline bool input_reference() const { return input_reference_; }
    inline int input_sample_rate() const { return input_sample_rate_; }
    inline int output_sample_rate() const { return output_sample_rate_; }
    inline int input_channels() const { return input_channels_; }
    inline int output_channels() const { return output_channels_; }
    inline int output_volume() const { return output_volume_; }
    int input_sample_rate() const { return input_sample_rate_; }
    int output_sample_rate() const { return output_sample_rate_; }
    bool duplex() const { return duplex_; }
    int output_volume() const { return output_volume_; }
 private:
    TaskHandle_t audio_input_task_ = nullptr;
    std::function<void(std::vector<int16_t>&& data)> on_input_data_; 
    void InputTask();
    void CreateSimplexChannels();
 protected:
    bool duplex_ = false;
    bool input_reference_ = false;
    int input_sample_rate_ = 0;
    int output_sample_rate_ = 0;
-    int output_volume_ = 80;
+    int input_channels_ = 1;
    int output_channels_ = 1;
    int output_volume_ = 70;
    i2s_chan_handle_t tx_handle_ = nullptr;
    i2s_chan_handle_t rx_handle_ = nullptr;
-    TaskHandle_t audio_input_task_ = nullptr;
+    virtual void CreateDuplexChannels();
-    
+    virtual int Read(int16_t* dest, int samples);
-    EventGroupHandle_t event_group_;
+    virtual int Write(const int16_t* data, int samples);
    std::function<void(const int16_t*, int)> on_input_data_;
    void CreateDuplexChannels();
    void CreateSimplexChannels();
    void InputTask();
    int Read(int16_t* dest, int samples);
    void Write(const int16_t* data, int samples);
 };
 #endif // _AUDIO_DEVICE_H
--- a/main/AudioProcessor.cc
+++ b/main/AudioProcessor.cc
@ -8,6 +8,12 @@ static const char* TAG = "AudioProcessor";
 AudioProcessor::AudioProcessor()
    : afe_communication_data_(nullptr) {
    event_group_ = xEventGroupCreate();
 }
 void AudioProcessor::Initialize(int channels, bool reference) {
    channels_ = channels;
    reference_ = reference;
    int ref_num = reference_ ? 1 : 0;
    afe_config_t afe_config = {
        .aec_init = false,
@ -22,17 +28,17 @@ AudioProcessor::AudioProcessor()
        .wakenet_model_name_2 = NULL,
        .wakenet_mode = DET_MODE_90,
        .afe_mode = SR_MODE_HIGH_PERF,
-        .afe_perferred_core = 0,
+        .afe_perferred_core = 1,
-        .afe_perferred_priority = 5,
+        .afe_perferred_priority = 1,
        .afe_ringbuf_size = 50,
        .memory_alloc_mode = AFE_MEMORY_ALLOC_MORE_PSRAM,
        .afe_linear_gain = 1.0,
        .agc_mode = AFE_MN_PEAK_AGC_MODE_2,
        .pcm_config = {
-            .total_ch_num = 1,
+            .total_ch_num = channels_,
-            .mic_num = 1,
+            .mic_num = channels_ - ref_num,
-            .ref_num = 0,
+            .ref_num = ref_num,
-            .sample_rate = CONFIG_AUDIO_INPUT_SAMPLE_RATE,
+            .sample_rate = 16000,
        },
        .debug_init = false,
        .debug_hook = {{ AFE_DEBUG_HOOK_MASE_TASK_IN, NULL }, { AFE_DEBUG_HOOK_FETCH_TASK_IN, NULL }},
@ -47,7 +53,7 @@ AudioProcessor::AudioProcessor()
        auto this_ = (AudioProcessor*)arg;
        this_->AudioProcessorTask();
        vTaskDelete(NULL);
-    }, "audio_communication", 4096 * 2, this, 5, NULL);
+    }, "audio_communication", 4096 * 2, this, 1, NULL);
 }
 AudioProcessor::~AudioProcessor() {
@ -57,10 +63,10 @@ AudioProcessor::~AudioProcessor() {
    vEventGroupDelete(event_group_);
 }
-void AudioProcessor::Input(const int16_t* data, int size) {
+void AudioProcessor::Input(std::vector<int16_t>& data) {
-    input_buffer_.insert(input_buffer_.end(), data, data + size);
+    input_buffer_.insert(input_buffer_.end(), data.begin(), data.end());
-    auto chunk_size = esp_afe_vc_v1.get_feed_chunksize(afe_communication_data_);
+    auto chunk_size = esp_afe_vc_v1.get_feed_chunksize(afe_communication_data_) * channels_;
    while (input_buffer_.size() >= chunk_size) {
        auto chunk = input_buffer_.data();
        esp_afe_vc_v1.feed(afe_communication_data_, chunk);
@ -92,6 +98,9 @@ void AudioProcessor::AudioProcessorTask() {
        xEventGroupWaitBits(event_group_, PROCESSOR_RUNNING, pdFALSE, pdTRUE, portMAX_DELAY);
        auto res = esp_afe_vc_v1.fetch(afe_communication_data_);
        if ((xEventGroupGetBits(event_group_) & PROCESSOR_RUNNING) == 0) {
            continue;
        }
        if (res == nullptr || res->ret_value == ESP_FAIL) {
            if (res != nullptr) {
                ESP_LOGI(TAG, "Error code: %d", res->ret_value);
--- a/main/AudioProcessor.h
+++ b/main/AudioProcessor.h
@ -15,7 +15,8 @@ public:
    AudioProcessor();
    ~AudioProcessor();
-    void Input(const int16_t* data, int size);
+    void Initialize(int channels, bool reference);
    void Input(std::vector<int16_t>& data);
    void Start();
    void Stop();
    bool IsRunning();
@ -26,6 +27,8 @@ private:
    esp_afe_sr_data_t* afe_communication_data_ = nullptr;
    std::vector<int16_t> input_buffer_;
    std::function<void(std::vector<int16_t>&& data)> output_callback_;
    int channels_;
    bool reference_;
    void AudioProcessorTask();
 };
--- a/main/BoxAudioDevice.cc
+++ b/main/BoxAudioDevice.cc
@ -0,0 +1,232 @@
 #include "BoxAudioDevice.h"
 #include <esp_log.h>
 #include <cassert>
 static const char* TAG = "BoxAudioDevice";
 // Construction does no work of its own: the members rely on their in-class
 // initialisers and the hardware is only touched later, in Initialize().
 BoxAudioDevice::BoxAudioDevice() = default;
 // Releases what Initialize() created, in the same order as before: the two
 // codec devices, then the input and output codec/control interfaces, the GPIO
 // and data interfaces, and finally the I2C master bus.
 //
 // Every handle is a pointer that starts out as nullptr and is only assigned in
 // Initialize(). Each release is therefore guarded so that destroying an object
 // whose Initialize() never ran does not hand a null handle to a call wrapped
 // in ESP_ERROR_CHECK.
 BoxAudioDevice::~BoxAudioDevice() {
    if (output_dev_ != nullptr) {
        ESP_ERROR_CHECK(esp_codec_dev_close(output_dev_));
        esp_codec_dev_delete(output_dev_);
    }
    if (input_dev_ != nullptr) {
        ESP_ERROR_CHECK(esp_codec_dev_close(input_dev_));
        esp_codec_dev_delete(input_dev_);
    }
    if (in_codec_if_ != nullptr) {
        audio_codec_delete_codec_if(in_codec_if_);
    }
    if (in_ctrl_if_ != nullptr) {
        audio_codec_delete_ctrl_if(in_ctrl_if_);
    }
    if (out_codec_if_ != nullptr) {
        audio_codec_delete_codec_if(out_codec_if_);
    }
    if (out_ctrl_if_ != nullptr) {
        audio_codec_delete_ctrl_if(out_ctrl_if_);
    }
    if (gpio_if_ != nullptr) {
        audio_codec_delete_gpio_if(gpio_if_);
    }
    if (data_if_ != nullptr) {
        audio_codec_delete_data_if(data_if_);
    }
    if (i2c_master_handle_ != nullptr) {
        ESP_ERROR_CHECK(i2c_del_master_bus(i2c_master_handle_));
    }
 }
 // Brings up the ES8311 output codec and the ES7210 input codec.
 //
 // Steps, in the order the code performs them:
 //   1. Set the duplex / reference / input-channel fields.
 //   2. Create the I2C master bus (I2C_NUM_0) used to control both codecs.
 //   3. Create the I2S TX/RX channels via CreateDuplexChannels().
 //   4. Wrap both I2S handles in one codec data interface shared by the
 //      output and input devices.
 //   5. Create the ES8311 output device, apply output_volume_ and open it.
 //   6. Create the ES7210 input device, open it and set the channel-0 gain.
 //
 // Errors are not returned to the caller: ESP-IDF calls go through
 // ESP_ERROR_CHECK and each created interface/device is checked with assert.
 void BoxAudioDevice::Initialize() {
    duplex_ = true; // Duplex mode flag
    input_reference_ = CONFIG_AUDIO_CODEC_INPUT_REFERENCE; // Whether a reference input is used (for echo cancellation)
    input_channels_ = input_reference_ ? 2 : 1; // Number of input channels
    // Initialize I2C peripheral
    i2c_master_bus_config_t i2c_bus_cfg = {
        .i2c_port = I2C_NUM_0,
        .sda_io_num = (gpio_num_t)CONFIG_AUDIO_CODEC_I2C_SDA_PIN,
        .scl_io_num = (gpio_num_t)CONFIG_AUDIO_CODEC_I2C_SCL_PIN,
        .clk_source = I2C_CLK_SRC_DEFAULT,
        .glitch_ignore_cnt = 7,
        .intr_priority = 0,
        .trans_queue_depth = 0,
        .flags = {
            .enable_internal_pullup = 1,
        },
    };
    ESP_ERROR_CHECK(i2c_new_master_bus(&i2c_bus_cfg, &i2c_master_handle_));
    // Must run before the data interface below, which reads rx_handle_/tx_handle_.
    CreateDuplexChannels();
    // Do initialize of related interface: data_if, ctrl_if and gpio_if
    audio_codec_i2s_cfg_t i2s_cfg = {
        .port = I2S_NUM_0,
        .rx_handle = rx_handle_,
        .tx_handle = tx_handle_,
    };
    data_if_ = audio_codec_new_i2s_data(&i2s_cfg);
    assert(data_if_ != NULL);
    // Output
    // i2c_cfg is reused further down for the input codec with a different address.
    audio_codec_i2c_cfg_t i2c_cfg = {
        .port = I2C_NUM_0,
        .addr = ES8311_CODEC_DEFAULT_ADDR,
        .bus_handle = i2c_master_handle_,
    };
    out_ctrl_if_ = audio_codec_new_i2c_ctrl(&i2c_cfg);
    assert(out_ctrl_if_ != NULL);
    gpio_if_ = audio_codec_new_gpio();
    assert(gpio_if_ != NULL);
    // ES8311 is configured as DAC only; the amplifier enable pin comes from Kconfig.
    es8311_codec_cfg_t es8311_cfg = {};
    es8311_cfg.ctrl_if = out_ctrl_if_;
    es8311_cfg.gpio_if = gpio_if_;
    es8311_cfg.codec_mode = ESP_CODEC_DEV_WORK_MODE_DAC;
    es8311_cfg.pa_pin = CONFIG_AUDIO_CODEC_PA_PIN;
    es8311_cfg.use_mclk = true;
    es8311_cfg.hw_gain.pa_voltage = 5.0;
    es8311_cfg.hw_gain.codec_dac_voltage = 3.3;
    out_codec_if_ = es8311_codec_new(&es8311_cfg);
    assert(out_codec_if_ != NULL);
    // dev_cfg is reused further down for the input device (type and codec_if change).
    esp_codec_dev_cfg_t dev_cfg = {
        .dev_type = ESP_CODEC_DEV_TYPE_OUT,
        .codec_if = out_codec_if_,
        .data_if = data_if_,
    };
    output_dev_ = esp_codec_dev_new(&dev_cfg);
    assert(output_dev_ != NULL);
    ESP_ERROR_CHECK(esp_codec_dev_set_out_vol(output_dev_, output_volume_));
    // Play 16bit 1 channel
    esp_codec_dev_sample_info_t fs = {
        .bits_per_sample = 16,
        .channel = 1,
        .channel_mask = 0,
        .sample_rate = (uint32_t)output_sample_rate_,
        .mclk_multiple = 0,
    };
    ESP_ERROR_CHECK(esp_codec_dev_open(output_dev_, &fs));
    // Input
    i2c_cfg.addr = ES7210_CODEC_DEFAULT_ADDR;
    in_ctrl_if_ = audio_codec_new_i2c_ctrl(&i2c_cfg);
    assert(in_ctrl_if_ != NULL);
    es7210_codec_cfg_t es7210_cfg = {};
    es7210_cfg.ctrl_if = in_ctrl_if_;
    // All four microphone inputs are selected.
    // NOTE(review): the "ES7120" spelling is kept as written; presumably these are
    // the macro names exported by the ES7210 driver header — confirm.
    es7210_cfg.mic_selected = ES7120_SEL_MIC1 | ES7120_SEL_MIC2 | ES7120_SEL_MIC3 | ES7120_SEL_MIC4;
    in_codec_if_ = es7210_codec_new(&es7210_cfg);
    assert(in_codec_if_ != NULL);
    dev_cfg.dev_type = ESP_CODEC_DEV_TYPE_IN;
    dev_cfg.codec_if = in_codec_if_;
    input_dev_ = esp_codec_dev_new(&dev_cfg);
    assert(input_dev_ != NULL);
    // fs is reused from the output device: bit width and sample rate stay as set
    // above, only the channel count and the channel mask are changed.
    fs.channel = 4;
    if (input_channels_ == 1) {
        fs.channel_mask = ESP_CODEC_DEV_MAKE_CHANNEL_MASK(0);
    } else {
        fs.channel_mask = ESP_CODEC_DEV_MAKE_CHANNEL_MASK(0) | ESP_CODEC_DEV_MAKE_CHANNEL_MASK(1);
    }
    ESP_ERROR_CHECK(esp_codec_dev_open(input_dev_, &fs));
    // Only channel 0 gets an explicit gain, even when channel 1 is also enabled.
    // NOTE(review): 30.0 is presumably in dB — confirm against the esp_codec_dev API.
    ESP_ERROR_CHECK(esp_codec_dev_set_in_channel_gain(input_dev_, ESP_CODEC_DEV_MAKE_CHANNEL_MASK(0), 30.0));
    ESP_LOGI(TAG, "BoxAudioDevice initialized");
 }
 // Creates the TX and RX channels on I2S_NUM_0 (master role), initialises
 // them, and enables both.
 //
 // TX is initialised in standard mode and RX in TDM mode with slots 0-3
 // enabled. Both configurations name the same MCLK/BCLK/LRCK pins; the TX
 // configuration only sets DOUT and the RX configuration only sets DIN.
 // The assert states the precondition that input and output sample rates are
 // equal. All driver calls go through ESP_ERROR_CHECK.
 void BoxAudioDevice::CreateDuplexChannels() {
    assert(input_sample_rate_ == output_sample_rate_);
    i2s_chan_config_t chan_cfg = {
        .id = I2S_NUM_0,
        .role = I2S_ROLE_MASTER,
        .dma_desc_num = 6,
        .dma_frame_num = 240,
        .auto_clear_after_cb = true,
        .auto_clear_before_cb = false,
        .intr_priority = 0,
    };
    // One call yields both handles, so TX and RX belong to the same I2S port.
    ESP_ERROR_CHECK(i2s_new_channel(&chan_cfg, &tx_handle_, &rx_handle_));
    // TX: standard mode, 16-bit data, stereo slot mode.
    i2s_std_config_t std_cfg = {
        .clk_cfg = {
            .sample_rate_hz = (uint32_t)output_sample_rate_,
            .clk_src = I2S_CLK_SRC_DEFAULT,
            .ext_clk_freq_hz = 0,
            .mclk_multiple = I2S_MCLK_MULTIPLE_256
        },
        .slot_cfg = {
            .data_bit_width = I2S_DATA_BIT_WIDTH_16BIT,
            .slot_bit_width = I2S_SLOT_BIT_WIDTH_AUTO,
            .slot_mode = I2S_SLOT_MODE_STEREO,
            .slot_mask = I2S_STD_SLOT_BOTH,
            .ws_width = I2S_DATA_BIT_WIDTH_16BIT,
            .ws_pol = false,
            .bit_shift = true,
            .left_align = true,
            .big_endian = false,
            .bit_order_lsb = false
        },
        .gpio_cfg = {
            .mclk = (gpio_num_t)CONFIG_AUDIO_DEVICE_I2S_GPIO_MCLK,
            .bclk = (gpio_num_t)CONFIG_AUDIO_DEVICE_I2S_GPIO_BCLK,
            .ws = (gpio_num_t)CONFIG_AUDIO_DEVICE_I2S_GPIO_LRCK,
            .dout = (gpio_num_t)CONFIG_AUDIO_DEVICE_I2S_GPIO_DOUT,
            .din = I2S_GPIO_UNUSED,
            .invert_flags = {
                .mclk_inv = false,
                .bclk_inv = false,
                .ws_inv = false
            }
        }
    };
    // RX: TDM mode, 16-bit data, slots 0-3 enabled.
    i2s_tdm_config_t tdm_cfg = {
        .clk_cfg = {
            .sample_rate_hz = (uint32_t)input_sample_rate_,
            .clk_src = I2S_CLK_SRC_DEFAULT,
            .ext_clk_freq_hz = 0,
            .mclk_multiple = I2S_MCLK_MULTIPLE_256,
            .bclk_div = 8,
        },
        .slot_cfg = {
            .data_bit_width = I2S_DATA_BIT_WIDTH_16BIT,
            .slot_bit_width = I2S_SLOT_BIT_WIDTH_AUTO,
            .slot_mode = I2S_SLOT_MODE_STEREO,
            .slot_mask = i2s_tdm_slot_mask_t(I2S_TDM_SLOT0 | I2S_TDM_SLOT1 | I2S_TDM_SLOT2 | I2S_TDM_SLOT3),
            .ws_width = I2S_TDM_AUTO_WS_WIDTH,
            .ws_pol = false,
            .bit_shift = true,
            .left_align = false,
            .big_endian = false,
            .bit_order_lsb = false,
            .skip_mask = false,
            .total_slot = I2S_TDM_AUTO_SLOT_NUM
        },
        .gpio_cfg = {
            .mclk = (gpio_num_t)CONFIG_AUDIO_DEVICE_I2S_GPIO_MCLK,
            .bclk = (gpio_num_t)CONFIG_AUDIO_DEVICE_I2S_GPIO_BCLK,
            .ws = (gpio_num_t)CONFIG_AUDIO_DEVICE_I2S_GPIO_LRCK,
            .dout = I2S_GPIO_UNUSED,
            .din = (gpio_num_t)CONFIG_AUDIO_DEVICE_I2S_GPIO_DIN,
            .invert_flags = {
                .mclk_inv = false,
                .bclk_inv = false,
                .ws_inv = false
            }
        }
    };
    ESP_ERROR_CHECK(i2s_channel_init_std_mode(tx_handle_, &std_cfg));
    ESP_ERROR_CHECK(i2s_channel_init_tdm_mode(rx_handle_, &tdm_cfg));
    ESP_ERROR_CHECK(i2s_channel_enable(tx_handle_));
    ESP_ERROR_CHECK(i2s_channel_enable(rx_handle_));
    ESP_LOGI(TAG, "Duplex channels created");
 }
 // Reads `samples` 16-bit samples from the input codec device into `buffer`.
 //
 // The length handed to the driver is in bytes (samples * sizeof(int16_t)).
 // A driver error is handled by ESP_ERROR_CHECK; otherwise the function
 // returns `samples` unchanged rather than a count reported by the driver.
 int BoxAudioDevice::Read(int16_t *buffer, int samples) {
    ESP_ERROR_CHECK(esp_codec_dev_read(input_dev_, (void*)buffer, samples * sizeof(int16_t)));
    return samples;
 }
 // Writes `samples` 16-bit samples from `buffer` to the output codec device.
 //
 // The length handed to the driver is in bytes (samples * sizeof(int16_t)).
 // A driver error is handled by ESP_ERROR_CHECK; otherwise the function
 // returns `samples` unchanged rather than a count reported by the driver.
 int BoxAudioDevice::Write(const int16_t *buffer, int samples) {
    // The C-style cast drops the const qualifier because the driver call takes void*.
    ESP_ERROR_CHECK(esp_codec_dev_write(output_dev_, (void*)buffer, samples * sizeof(int16_t)));
    return samples;
 }
 // Applies `volume` to the output codec device, then forwards the same value
 // to the base-class implementation.
 //
 // The codec call comes first and is wrapped in ESP_ERROR_CHECK, so the base
 // class is only updated after the driver accepted the value.
 void BoxAudioDevice::SetOutputVolume(int volume) {
    ESP_ERROR_CHECK(esp_codec_dev_set_out_vol(output_dev_, volume));
    AudioDevice::SetOutputVolume(volume);
 }
--- a/main/BoxAudioDevice.h
+++ b/main/BoxAudioDevice.h
@ -0,0 +1,36 @@
 #ifndef _BOX_AUDIO_DEVICE_H
 #define _BOX_AUDIO_DEVICE_H
 #include "AudioDevice.h"
 #include <driver/i2c_master.h>
 #include <driver/i2s_tdm.h>
 #include <esp_codec_dev.h>
 #include <esp_codec_dev_defaults.h>
 // AudioDevice subclass that performs audio I/O through esp_codec_dev handles.
 // All handles and interface pointers below start out as nullptr.
 class BoxAudioDevice : public AudioDevice {
 public:
    BoxAudioDevice();
    virtual ~BoxAudioDevice();
    void Initialize() override;
    // Sets the volume on the output codec device, then forwards to AudioDevice::SetOutputVolume.
    void SetOutputVolume(int volume) override;
 private:
    // I2C master bus handle.
    // NOTE(review): presumably the control bus shared by both codec chips — confirm in Initialize().
    i2c_master_bus_handle_t i2c_master_handle_ = nullptr;
    // esp_codec_dev interface objects: one data interface, separate control/codec
    // interfaces for the output (out_*) and input (in_*) paths, and a GPIO interface.
    const audio_codec_data_if_t* data_if_ = nullptr;
    const audio_codec_ctrl_if_t* out_ctrl_if_ = nullptr;
    const audio_codec_if_t* out_codec_if_ = nullptr;
    const audio_codec_ctrl_if_t* in_ctrl_if_ = nullptr;
    const audio_codec_if_t* in_codec_if_ = nullptr;
    const audio_codec_gpio_if_t* gpio_if_ = nullptr;
    // Device handles used by Write()/SetOutputVolume() (output) and Read() (input).
    esp_codec_dev_handle_t output_dev_ = nullptr;
    esp_codec_dev_handle_t input_dev_ = nullptr;
    void CreateDuplexChannels() override;
    // Read/Write move `samples` 16-bit samples and return `samples`.
    int Read(int16_t* dest, int samples) override;
    int Write(const int16_t* data, int samples) override;
 };
 #endif // _BOX_AUDIO_DEVICE_H
--- a/main/CMakeLists.txt
+++ b/main/CMakeLists.txt
@ -11,6 +11,9 @@ set(SOURCES "AudioDevice.cc"
 # AudioProcessor.cc and WakeWordDetect.cc are built only when CONFIG_USE_AFE_SR is enabled.
 if(CONFIG_USE_AFE_SR)
    list(APPEND SOURCES "AudioProcessor.cc" "WakeWordDetect.cc")
 endif()
 # BoxAudioDevice.cc is built only when the "Box: ES8311 + ES7210" audio codec option is selected.
 if(CONFIG_AUDIO_CODEC_ES8311_ES7210)
    list(APPEND SOURCES "BoxAudioDevice.cc")
 endif()
 idf_component_register(SRCS ${SOURCES}
                    INCLUDE_DIRS "."
--- a/main/Display.cc
+++ b/main/Display.cc
@ -17,7 +17,7 @@ Display::Display(int sda_pin, int scl_pin) : sda_pin_(sda_pin), scl_pin_(scl_pin
    ESP_LOGI(TAG, "Display Pins: %d, %d", sda_pin_, scl_pin_);
    i2c_master_bus_config_t bus_config = {
-        .i2c_port = I2C_NUM_0,
+        .i2c_port = I2C_NUM_1,
        .sda_io_num = (gpio_num_t)sda_pin_,
        .scl_io_num = (gpio_num_t)scl_pin_,
        .clk_source = I2C_CLK_SRC_DEFAULT,
--- a/main/Kconfig.projbuild
+++ b/main/Kconfig.projbuild
@ -30,49 +30,136 @@ config AUDIO_OUTPUT_SAMPLE_RATE
    help
        Audio output sample rate.
 choice AUDIO_CODEC
    prompt "Audio Codec"
    default AUDIO_CODEC_NONE
    help
        Audio codec.
    config AUDIO_CODEC_ES8311_ES7210
        bool "Box: ES8311 + ES7210"
    config AUDIO_CODEC_NONE
        bool "None"
 endchoice
 menu "Box Audio Codec I2C and PA Control"
    depends on AUDIO_CODEC_ES8311_ES7210
    config AUDIO_CODEC_I2C_SDA_PIN
        int "Audio Codec I2C SDA Pin"
        default 39
        help
            Audio codec I2C SDA pin.
    config AUDIO_CODEC_I2C_SCL_PIN
        int "Audio Codec I2C SCL Pin"
        default 38
        help
            Audio codec I2C SCL pin.
    config AUDIO_CODEC_PA_PIN
        int "Audio Codec PA Pin"
        default 40
        help
            Audio codec PA pin.
    config AUDIO_CODEC_INPUT_REFERENCE
        bool "Audio Codec Input Reference"
        default y
        help
            Audio codec input reference.
 endmenu
 choice AUDIO_I2S_METHOD
    prompt "Audio I2S Method"
    default AUDIO_I2S_METHOD_SIMPLEX if AUDIO_CODEC_NONE
    default AUDIO_I2S_METHOD_DUPLEX if AUDIO_CODEC_ES8311_ES7210
    help
        Audio I2S method.
    config AUDIO_I2S_METHOD_SIMPLEX
        bool "Simplex"
        help
            Use I2S 0 as the audio input and I2S 1 as the audio output.
    config AUDIO_I2S_METHOD_DUPLEX
        bool "Duplex"
        help
            Use I2S 0 as the audio input and audio output.
 endchoice
 menu "Audio I2S Simplex"
    depends on AUDIO_I2S_METHOD_SIMPLEX
    config AUDIO_DEVICE_I2S_MIC_GPIO_WS
-    int "I2S GPIO WS"
+        int "I2S MIC GPIO WS"
        default 4
        help
            GPIO number of the I2S MIC WS.
    config AUDIO_DEVICE_I2S_MIC_GPIO_SCK
        int "I2S MIC GPIO BCLK"
        default 5
        help
            GPIO number of the I2S MIC SCK.
    config AUDIO_DEVICE_I2S_MIC_GPIO_DIN
        int "I2S MIC GPIO DIN"
        default 6
        help
            GPIO number of the I2S MIC DIN.
    config AUDIO_DEVICE_I2S_SPK_GPIO_DOUT
        int "I2S SPK GPIO DOUT"
        default 7
        help
            GPIO number of the I2S SPK DOUT.
    config AUDIO_DEVICE_I2S_SPK_GPIO_BCLK
        int "I2S SPK GPIO BCLK"
        default 15
        help
            GPIO number of the I2S SPK BCLK.
    config AUDIO_DEVICE_I2S_SPK_GPIO_LRCK
        int "I2S SPK GPIO WS"
        default 16
        help
            GPIO number of the I2S SPK LRCK.
 endmenu
 menu "Audio I2S Duplex"
    depends on AUDIO_I2S_METHOD_DUPLEX
    config AUDIO_DEVICE_I2S_GPIO_MCLK
        int "I2S GPIO MCLK"
        default -1
        help
            GPIO number of the I2S MCLK.
-config AUDIO_DEVICE_I2S_MIC_GPIO_BCLK
+    config AUDIO_DEVICE_I2S_GPIO_LRCK
-    int "I2S GPIO BCLK"
+        int "I2S GPIO LRCK"
        default 4
        help
            GPIO number of the I2S LRCK.
    config AUDIO_DEVICE_I2S_GPIO_BCLK
        int "I2S GPIO BCLK / SCLK"
        default 5
        help
            GPIO number of the I2S BCLK.
-config AUDIO_DEVICE_I2S_MIC_GPIO_DIN
+    config AUDIO_DEVICE_I2S_GPIO_DIN
        int "I2S GPIO DIN"
        default 6
        help
            GPIO number of the I2S DIN.
-config AUDIO_DEVICE_I2S_SPK_GPIO_DOUT
+    config AUDIO_DEVICE_I2S_GPIO_DOUT
        int "I2S GPIO DOUT"
        default 7
        help
            GPIO number of the I2S DOUT.
-config AUDIO_DEVICE_I2S_SIMPLEX
+endmenu
    bool "I2S Simplex"
    default y
    help
        Enable I2S Simplex mode.
 config AUDIO_DEVICE_I2S_SPK_GPIO_BCLK
    int "I2S SPK GPIO BCLK"
    default 15
    depends on AUDIO_DEVICE_I2S_SIMPLEX
    help
        GPIO number of the I2S MIC BCLK.
 config AUDIO_DEVICE_I2S_SPK_GPIO_WS
    int "I2S SPK GPIO WS"
    default 16
    depends on AUDIO_DEVICE_I2S_SIMPLEX
    help
        GPIO number of the I2S MIC WS.
 config BOOT_BUTTON_GPIO
    int "Boot Button GPIO"
@ -86,6 +173,12 @@ config VOLUME_UP_BUTTON_GPIO
    help
        GPIO number of the volume up button.
 config VOLUME_DOWN_BUTTON_GPIO
    int "Volume Down Button GPIO"
    default 39
    help
        GPIO number of the volume down button.
 config USE_AFE_SR
    bool "Use Espressif AFE SR"
    default y
--- a/main/WakeWordDetect.cc
+++ b/main/WakeWordDetect.cc
@ -15,6 +15,24 @@ WakeWordDetect::WakeWordDetect()
      wake_word_opus_() {
    event_group_ = xEventGroupCreate();
 }
 // Releases what the detector holds, in a fixed order: the AFE instance (if one exists),
 // the wake_word_encode_task_stack_ buffer (if one exists), and finally the event group.
 WakeWordDetect::~WakeWordDetect() {
    // Destroy the AFE instance only when a handle is present.
    if (afe_detection_data_) {
        esp_afe_sr_v1.destroy(afe_detection_data_);
    }
    // Free the encode-task stack buffer only when it is set.
    if (wake_word_encode_task_stack_) {
        free(wake_word_encode_task_stack_);
    }
    // The event group is deleted unconditionally.
    vEventGroupDelete(event_group_);
 }
 void WakeWordDetect::Initialize(int channels, bool reference) {
    channels_ = channels;
    reference_ = reference;
    int ref_num = reference_ ? 1 : 0;
    srmodel_list_t *models = esp_srmodel_init("model");
    for (int i = 0; i < models->num; i++) {
@ -25,7 +43,7 @@ WakeWordDetect::WakeWordDetect()
    }
    afe_config_t afe_config = {
-        .aec_init = false,
+        .aec_init = reference_,
        .se_init = true,
        .vad_init = true,
        .wakenet_init = true,
@ -37,17 +55,17 @@ WakeWordDetect::WakeWordDetect()
        .wakenet_model_name_2 = NULL,
        .wakenet_mode = DET_MODE_90,
        .afe_mode = SR_MODE_HIGH_PERF,
-        .afe_perferred_core = 0,
+        .afe_perferred_core = 1,
-        .afe_perferred_priority = 5,
+        .afe_perferred_priority = 1,
        .afe_ringbuf_size = 50,
        .memory_alloc_mode = AFE_MEMORY_ALLOC_MORE_PSRAM,
        .afe_linear_gain = 1.0,
        .agc_mode = AFE_MN_PEAK_AGC_MODE_2,
        .pcm_config = {
-            .total_ch_num = 1,
+            .total_ch_num = channels_,
-            .mic_num = 1,
+            .mic_num = channels_ - ref_num,
-            .ref_num = 0,
+            .ref_num = ref_num,
-            .sample_rate = CONFIG_AUDIO_INPUT_SAMPLE_RATE
+            .sample_rate = 16000
        },
        .debug_init = false,
        .debug_hook = {{ AFE_DEBUG_HOOK_MASE_TASK_IN, NULL }, { AFE_DEBUG_HOOK_FETCH_TASK_IN, NULL }},
@ -62,19 +80,7 @@ WakeWordDetect::WakeWordDetect()
        auto this_ = (WakeWordDetect*)arg;
        this_->AudioDetectionTask();
        vTaskDelete(NULL);
-    }, "audio_detection", 4096 * 2, this, 5, NULL);
+    }, "audio_detection", 4096 * 2, this, 1, NULL);
 }
 // Releases what the detector holds, in a fixed order: the AFE instance (if one exists),
 // the wake_word_encode_task_stack_ buffer (if one exists), and finally the event group.
 WakeWordDetect::~WakeWordDetect() {
    // Destroy the AFE instance only when a handle is present.
    if (afe_detection_data_) {
        esp_afe_sr_v1.destroy(afe_detection_data_);
    }
    // Free the encode-task stack buffer only when it is set.
    if (wake_word_encode_task_stack_) {
        free(wake_word_encode_task_stack_);
    }
    // The event group is deleted unconditionally.
    vEventGroupDelete(event_group_);
 }
 void WakeWordDetect::OnWakeWordDetected(std::function<void()> callback) {
@ -97,10 +103,10 @@ bool WakeWordDetect::IsDetectionRunning() {
    return xEventGroupGetBits(event_group_) & DETECTION_RUNNING_EVENT;
 }
-void WakeWordDetect::Feed(const int16_t* data, int size) {
+void WakeWordDetect::Feed(std::vector<int16_t>& data) {
-    input_buffer_.insert(input_buffer_.end(), data, data + size);
+    input_buffer_.insert(input_buffer_.end(), data.begin(), data.end());
-    auto chunk_size = esp_afe_sr_v1.get_feed_chunksize(afe_detection_data_);
+    auto chunk_size = esp_afe_sr_v1.get_feed_chunksize(afe_detection_data_) * channels_;
    while (input_buffer_.size() >= chunk_size) {
        esp_afe_sr_v1.feed(afe_detection_data_, input_buffer_.data());
        input_buffer_.erase(input_buffer_.begin(), input_buffer_.begin() + chunk_size);
@ -166,7 +172,7 @@ void WakeWordDetect::EncodeWakeWordData() {
        auto start_time = esp_timer_get_time();
        // encode detect packets
        OpusEncoder* encoder = new OpusEncoder();
-        encoder->Configure(CONFIG_AUDIO_INPUT_SAMPLE_RATE, 1, 60);
+        encoder->Configure(16000, 1, 60);
        encoder->SetComplexity(0);
        this_->wake_word_opus_.resize(4096 * 4);
        size_t offset = 0;
--- a/main/WakeWordDetect.h
+++ b/main/WakeWordDetect.h
@ -19,7 +19,8 @@ public:
    WakeWordDetect();
    ~WakeWordDetect();
-    void Feed(const int16_t* data, int size);
+    void Initialize(int channels, bool reference);
    void Feed(std::vector<int16_t>& data);
    void OnWakeWordDetected(std::function<void()> callback);
    void OnVadStateChange(std::function<void(bool speaking)> callback);
    void StartDetection();
@ -36,6 +37,8 @@ private:
    std::function<void()> wake_word_detected_callback_;
    std::function<void(bool speaking)> vad_state_change_callback_;
    bool is_speaking_ = false;
    int channels_;
    bool reference_;
    TaskHandle_t wake_word_encode_task_ = nullptr;
    StaticTask_t wake_word_encode_task_buffer_;
--- a/main/idf_component.yml
+++ b/main/idf_component.yml
@ -1,9 +1,10 @@
 ## IDF Component Manager Manifest File
 dependencies:
  78/esp-builtin-led: "^1.0.2"
-  78/esp-wifi-connect: "^1.1.0"
+  78/esp-wifi-connect: "^1.2.0"
  78/esp-opus-encoder: "^1.0.2"
  78/esp-ml307: "^1.2.1"
  espressif/esp_codec_dev: "^1.3.1"
  espressif/esp-sr: "^1.9.0"
  espressif/button: "^3.3.1"
  lvgl/lvgl: "^8.4.0"
--- a/main/main.cc
+++ b/main/main.cc
@ -13,6 +13,19 @@
 extern "C" void app_main(void)
 {
 #ifdef CONFIG_AUDIO_CODEC_ES8311_ES7210
    // Make GPIO15 HIGH to enable the 4G module
    gpio_config_t ml307_enable_config = {
        .pin_bit_mask = (1ULL << 15),
        .mode = GPIO_MODE_OUTPUT,
        .pull_up_en = GPIO_PULLUP_DISABLE,
        .pull_down_en = GPIO_PULLDOWN_DISABLE,
        .intr_type = GPIO_INTR_DISABLE,
    };
    gpio_config(&ml307_enable_config);
    gpio_set_level(GPIO_NUM_15, 1);
 #endif
    // Check if the reset button is pressed
    SystemReset system_reset;
    system_reset.CheckButtons();
--- a/sdkconfig.box
+++ b/sdkconfig.box