Add wake word to xmini-c3 (#730)

* esp-hi: MCP protocol is not ready yet

* Add wake word to xmini-c3
This commit is contained in:
Xiaoxia 2025-05-31 22:21:03 +08:00 committed by GitHub
parent 6cb025859f
commit ae57131c15
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
27 changed files with 399 additions and 155 deletions

1
.gitignore vendored
View File

@ -10,5 +10,6 @@ dependencies.lock
.env
releases/
main/assets/lang_config.h
main/mmap_generate_emoji.h
.DS_Store
.cache

View File

@ -194,13 +194,14 @@ list(APPEND SOURCES ${BOARD_SOURCES})
if(CONFIG_USE_AUDIO_PROCESSOR)
list(APPEND SOURCES "audio_processing/afe_audio_processor.cc")
else()
list(APPEND SOURCES "audio_processing/dummy_audio_processor.cc")
list(APPEND SOURCES "audio_processing/no_audio_processor.cc")
endif()
if(CONFIG_USE_WAKE_WORD_DETECT)
list(APPEND SOURCES "audio_processing/wake_word_detect.cc")
endif()
if(CONFIG_USE_WAKE_WORD_DETECT_NO_AFE)
list(APPEND SOURCES "audio_processing/wake_word_no_afe.cc")
if(CONFIG_USE_AFE_WAKE_WORD)
list(APPEND SOURCES "audio_processing/afe_wake_word.cc")
elseif(CONFIG_USE_ESP_WAKE_WORD)
list(APPEND SOURCES "audio_processing/esp_wake_word.cc")
else()
list(APPEND SOURCES "audio_processing/no_wake_word.cc")
endif()
# Kconfig

View File

@ -30,152 +30,226 @@ choice BOARD_TYPE
Board type. 开发板类型
config BOARD_TYPE_BREAD_COMPACT_WIFI
bool "面包板新版接线WiFi"
depends on IDF_TARGET_ESP32S3
config BOARD_TYPE_BREAD_COMPACT_WIFI_LCD
bool "面包板新版接线WiFi+ LCD"
depends on IDF_TARGET_ESP32S3
config BOARD_TYPE_BREAD_COMPACT_ML307
bool "面包板新版接线ML307 AT"
depends on IDF_TARGET_ESP32S3
config BOARD_TYPE_BREAD_COMPACT_ESP32
bool "面包板WiFi ESP32 DevKit"
depends on IDF_TARGET_ESP32
config BOARD_TYPE_BREAD_COMPACT_ESP32_LCD
bool "面包板WiFi+ LCD ESP32 DevKit"
depends on IDF_TARGET_ESP32
config BOARD_TYPE_XMINI_C3
bool "虾哥 Mini C3"
depends on IDF_TARGET_ESP32C3
config BOARD_TYPE_ESP32S3_KORVO2_V3
bool "ESP32S3_KORVO2_V3开发板"
depends on IDF_TARGET_ESP32S3
config BOARD_TYPE_ESP_SPARKBOT
bool "ESP-SparkBot开发板"
depends on IDF_TARGET_ESP32S3
config BOARD_TYPE_ESP_SPOT_S3
bool "ESP-Spot-S3"
depends on IDF_TARGET_ESP32S3
config BOARD_TYPE_ESP_HI
bool "ESP-HI"
depends on IDF_TARGET_ESP32C3
config BOARD_TYPE_ESP_BOX_3
bool "ESP BOX 3"
depends on IDF_TARGET_ESP32S3
config BOARD_TYPE_ESP_BOX
bool "ESP BOX"
depends on IDF_TARGET_ESP32S3
config BOARD_TYPE_ESP_BOX_LITE
bool "ESP BOX Lite"
depends on IDF_TARGET_ESP32S3
config BOARD_TYPE_KEVIN_BOX_1
bool "Kevin Box 1"
depends on IDF_TARGET_ESP32S3
config BOARD_TYPE_KEVIN_BOX_2
bool "Kevin Box 2"
depends on IDF_TARGET_ESP32S3
config BOARD_TYPE_KEVIN_C3
bool "Kevin C3"
depends on IDF_TARGET_ESP32C3
config BOARD_TYPE_KEVIN_SP_V3_DEV
bool "Kevin SP V3开发板"
depends on IDF_TARGET_ESP32S3
config BOARD_TYPE_KEVIN_SP_V4_DEV
bool "Kevin SP V4开发板"
depends on IDF_TARGET_ESP32S3
config BOARD_TYPE_ESP32_CGC
bool "ESP32 CGC"
depends on IDF_TARGET_ESP32
config BOARD_TYPE_KEVIN_YUYING_313LCD
bool "鱼鹰科技3.13LCD开发板"
depends on IDF_TARGET_ESP32S3
config BOARD_TYPE_LICHUANG_DEV
bool "立创·实战派ESP32-S3开发板"
depends on IDF_TARGET_ESP32S3
config BOARD_TYPE_LICHUANG_C3_DEV
bool "立创·实战派ESP32-C3开发板"
depends on IDF_TARGET_ESP32C3
config BOARD_TYPE_DF_K10
bool "DFRobot 行空板 k10"
depends on IDF_TARGET_ESP32S3
config BOARD_TYPE_DF_S3_AI_CAM
bool "DFRobot ESP32-S3 AI智能摄像头模块"
depends on IDF_TARGET_ESP32S3
config BOARD_TYPE_MAGICLICK_2P4
bool "神奇按钮 Magiclick_2.4"
depends on IDF_TARGET_ESP32S3
config BOARD_TYPE_MAGICLICK_2P5
bool "神奇按钮 Magiclick_2.5"
depends on IDF_TARGET_ESP32S3
config BOARD_TYPE_MAGICLICK_C3
bool "神奇按钮 Magiclick_C3"
depends on IDF_TARGET_ESP32C3
config BOARD_TYPE_MAGICLICK_C3_V2
bool "神奇按钮 Magiclick_C3_v2"
depends on IDF_TARGET_ESP32C3
config BOARD_TYPE_M5STACK_CORE_S3
bool "M5Stack CoreS3"
depends on IDF_TARGET_ESP32S3
config BOARD_TYPE_M5STACK_CORE_TAB5
bool "M5Stack Tab5"
depends on IDF_TARGET_ESP32P4
config BOARD_TYPE_ATOMS3_ECHO_BASE
bool "AtomS3 + Echo Base"
depends on IDF_TARGET_ESP32S3
config BOARD_TYPE_ATOMS3R_ECHO_BASE
bool "AtomS3R + Echo Base"
depends on IDF_TARGET_ESP32S3
config BOARD_TYPE_ATOMS3R_CAM_M12_ECHO_BASE
bool "AtomS3R CAM/M12 + Echo Base"
depends on IDF_TARGET_ESP32S3
config BOARD_TYPE_ATOMMATRIX_ECHO_BASE
bool "AtomMatrix + Echo Base"
depends on IDF_TARGET_ESP32S3
config BOARD_TYPE_ESP32S3_Touch_AMOLED_1_8
bool "Waveshare ESP32-S3-Touch-AMOLED-1.8"
depends on IDF_TARGET_ESP32S3
config BOARD_TYPE_ESP32S3_Touch_AMOLED_1_75
bool "Waveshare ESP32-S3-Touch-AMOLED-1.75"
depends on IDF_TARGET_ESP32S3
config BOARD_TYPE_ESP32S3_Touch_LCD_1_85C
bool "Waveshare ESP32-S3-Touch-LCD-1.85C"
depends on IDF_TARGET_ESP32S3
config BOARD_TYPE_ESP32S3_Touch_LCD_1_85
bool "Waveshare ESP32-S3-Touch-LCD-1.85"
depends on IDF_TARGET_ESP32S3
config BOARD_TYPE_ESP32S3_Touch_LCD_1_46
bool "Waveshare ESP32-S3-Touch-LCD-1.46"
depends on IDF_TARGET_ESP32S3
config BOARD_TYPE_ESP32S3_Touch_LCD_3_5
bool "Waveshare ESP32-S3-Touch-LCD-3.5"
depends on IDF_TARGET_ESP32S3
config BOARD_TYPE_ESP32P4_NANO
bool "Waveshare ESP32-P4-NANO"
depends on IDF_TARGET_ESP32P4
config BOARD_TYPE_ESP32P4_WIFI6_Touch_LCD_4B
bool "Waveshare ESP32-P4-WIFI6-Touch-LCD-4B"
depends on IDF_TARGET_ESP32P4
config BOARD_TYPE_ESP32P4_WIFI6_Touch_LCD_XC
bool "Waveshare ESP32-P4-WIFI6-Touch-LCD-3.4C or ESP32-P4-WIFI6-Touch-LCD-4C"
depends on IDF_TARGET_ESP32P4
config BOARD_TYPE_TUDOUZI
bool "土豆子"
depends on IDF_TARGET_ESP32S3
config BOARD_TYPE_LILYGO_T_CIRCLE_S3
bool "LILYGO T-Circle-S3"
depends on IDF_TARGET_ESP32S3
config BOARD_TYPE_LILYGO_T_CAMERAPLUS_S3_V1_0_V1_1
bool "LILYGO T-CameraPlus-S3_V1_0_V1_1"
depends on IDF_TARGET_ESP32S3
config BOARD_TYPE_LILYGO_T_CAMERAPLUS_S3_V1_2
bool "LILYGO T-CameraPlus-S3_V1_2"
depends on IDF_TARGET_ESP32S3
config BOARD_TYPE_LILYGO_T_DISPLAY_S3_PRO_MVSRLORA
bool "LILYGO T-Display-S3-Pro-MVSRLora"
depends on IDF_TARGET_ESP32S3
config BOARD_TYPE_LILYGO_T_DISPLAY_S3_PRO_MVSRLORA_NO_BATTERY
bool "LILYGO T-Display-S3-Pro-MVSRLora_No_Battery"
depends on IDF_TARGET_ESP32S3
config BOARD_TYPE_MOVECALL_MOJI_ESP32S3
bool "Movecall Moji 小智AI衍生版"
depends on IDF_TARGET_ESP32S3
config BOARD_TYPE_MOVECALL_CUICAN_ESP32S3
bool "Movecall CuiCan 璀璨·AI吊坠"
depends on IDF_TARGET_ESP32S3
config BOARD_TYPE_ATK_DNESP32S3
bool "正点原子DNESP32S3开发板"
depends on IDF_TARGET_ESP32S3
config BOARD_TYPE_ATK_DNESP32S3_BOX
bool "正点原子DNESP32S3-BOX"
depends on IDF_TARGET_ESP32S3
config BOARD_TYPE_ATK_DNESP32S3_BOX0
bool "正点原子DNESP32S3-BOX0"
depends on IDF_TARGET_ESP32S3
config BOARD_TYPE_ATK_DNESP32S3M_WIFI
bool "正点原子DNESP32S3M-WIFI"
depends on IDF_TARGET_ESP32S3
config BOARD_TYPE_ATK_DNESP32S3M_4G
bool "正点原子DNESP32S3M-4G"
depends on IDF_TARGET_ESP32S3
config BOARD_TYPE_DU_CHATX
bool "嘟嘟开发板CHATX(wifi)"
depends on IDF_TARGET_ESP32S3
config BOARD_TYPE_ESP32S3_Taiji_Pi
bool "太极小派esp32s3"
depends on IDF_TARGET_ESP32S3
config BOARD_TYPE_XINGZHI_Cube_0_85TFT_WIFI
bool "无名科技星智0.85(WIFI)"
depends on IDF_TARGET_ESP32S3
config BOARD_TYPE_XINGZHI_Cube_0_85TFT_ML307
bool "无名科技星智0.85(ML307)"
depends on IDF_TARGET_ESP32S3
config BOARD_TYPE_XINGZHI_Cube_0_96OLED_WIFI
bool "无名科技星智0.96(WIFI)"
depends on IDF_TARGET_ESP32S3
config BOARD_TYPE_XINGZHI_Cube_0_96OLED_ML307
bool "无名科技星智0.96(ML307)"
depends on IDF_TARGET_ESP32S3
config BOARD_TYPE_XINGZHI_Cube_1_54TFT_WIFI
bool "无名科技星智1.54(WIFI)"
depends on IDF_TARGET_ESP32S3
config BOARD_TYPE_XINGZHI_Cube_1_54TFT_ML307
bool "无名科技星智1.54(ML307)"
depends on IDF_TARGET_ESP32S3
config BOARD_TYPE_SENSECAP_WATCHER
bool "SenseCAP Watcher"
depends on IDF_TARGET_ESP32S3
config BOARD_TYPE_DOIT_S3_AIBOX
bool "四博智联AI陪伴盒子"
depends on IDF_TARGET_ESP32S3
config BOARD_TYPE_MIXGO_NOVA
bool "元控·青春"
depends on IDF_TARGET_ESP32S3
config BOARD_TYPE_GENJUTECH_S3_1_54TFT
bool "亘具科技1.54(s3)"
depends on IDF_TARGET_ESP32S3
config BOARD_TYPE_ESP_S3_LCD_EV_Board
bool "乐鑫ESP S3 LCD EV Board开发板"
depends on IDF_TARGET_ESP32S3
config BOARD_TYPE_ZHENGCHEN_1_54TFT_WIFI
bool "征辰科技1.54(WIFI)"
depends on IDF_TARGET_ESP32S3
config BOARD_TYPE_ZHENGCHEN_1_54TFT_ML307
bool "征辰科技1.54(ML307)"
depends on IDF_TARGET_ESP32S3
config BOARD_TYPE_MINSI_K08_DUAL
bool "敏思科技K08(DUAL)"
depends on IDF_TARGET_ESP32S3
config BOARD_TYPE_ESP32_S3_1_54_MUMA
bool "Spotpear ESP32-S3-1.54-MUMA"
depends on IDF_TARGET_ESP32S3
config BOARD_TYPE_ESP32_S3_1_28_BOX
bool "Spotpear ESP32-S3-1.28-BOX"
depends on IDF_TARGET_ESP32S3
endchoice
choice ESP_S3_LCD_EV_Board_Version_TYPE
@ -270,24 +344,26 @@ config USE_WECHAT_MESSAGE_STYLE
help
使用微信聊天界面风格
config USE_WAKE_WORD_DETECT_NO_AFE
config USE_ESP_WAKE_WORD
bool "Enable Wake Word Detection (without AFE)"
default y
depends on IDF_TARGET_ESP32C3 || IDF_TARGET_ESP32C5
config USE_WAKE_WORD_DETECT
bool "Enable Wake Word Detection"
default y
depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4 && SPIRAM
help
需要 ESP32 S3 与 AFE 支持
支持 ESP32 C3 与 ESP32 C5
config USE_AFE_WAKE_WORD
bool "Enable Wake Word Detection (AFE)"
default n
depends on (IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4) && SPIRAM
help
需要 ESP32 S3 与 PSRAM 支持
config USE_AUDIO_PROCESSOR
bool "Enable Audio Noise Reduction"
default y
depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4 && SPIRAM
depends on (IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4) && SPIRAM
help
需要 ESP32 S3 与 AFE 支持
需要 ESP32 S3 与 PSRAM 支持
config USE_DEVICE_AEC
bool "Enable Device-Side AEC"
@ -297,7 +373,7 @@ config USE_DEVICE_AEC
因为性能不够,不建议和微信聊天界面风格同时开启
config USE_SERVER_AEC
bool "Enable Server-Side AEC"
bool "Enable Server-Side AEC (Unstable)"
default n
depends on USE_AUDIO_PROCESSOR
help

View File

@ -14,7 +14,15 @@
#if CONFIG_USE_AUDIO_PROCESSOR
#include "afe_audio_processor.h"
#else
#include "dummy_audio_processor.h"
#include "no_audio_processor.h"
#endif
#if CONFIG_USE_AFE_WAKE_WORD
#include "afe_wake_word.h"
#elif CONFIG_USE_ESP_WAKE_WORD
#include "esp_wake_word.h"
#else
#include "no_wake_word.h"
#endif
#include <cstring>
@ -55,7 +63,15 @@ Application::Application() {
#if CONFIG_USE_AUDIO_PROCESSOR
audio_processor_ = std::make_unique<AfeAudioProcessor>();
#else
audio_processor_ = std::make_unique<DummyAudioProcessor>();
audio_processor_ = std::make_unique<NoAudioProcessor>();
#endif
#if CONFIG_USE_AFE_WAKE_WORD
wake_word_ = std::make_unique<AfeWakeWord>();
#elif CONFIG_USE_ESP_WAKE_WORD
wake_word_ = std::make_unique<EspWakeWord>();
#else
wake_word_ = std::make_unique<NoWakeWord>();
#endif
esp_timer_create_args_t clock_timer_args = {
@ -129,9 +145,7 @@ void Application::CheckNewVersion() {
auto& board = Board::GetInstance();
board.SetPowerSaveMode(false);
#if CONFIG_USE_WAKE_WORD_DETECT || CONFIG_USE_WAKE_WORD_DETECT_NO_AFE
wake_word_detect_.StopDetection();
#endif
wake_word_->StopDetection();
// 预先关闭音频输出,避免升级过程有音频操作
auto codec = board.GetAudioCodec();
codec->EnableInput(false);
@ -256,8 +270,6 @@ void Application::PlaySound(const std::string_view& sound) {
}
background_task_->WaitForCompletion();
// The assets are encoded at 16000Hz, 60ms frame duration
SetDecodeSampleRate(16000, 60);
const char* data = sound.data();
size_t size = sound.size();
for (const char* p = data; p < data + size; ) {
@ -266,6 +278,8 @@ void Application::PlaySound(const std::string_view& sound) {
auto payload_size = ntohs(p3->payload_size);
AudioStreamPacket packet;
packet.sample_rate = 16000;
packet.frame_duration = 60;
packet.payload.resize(payload_size);
memcpy(packet.payload.data(), p3->payload, payload_size);
p += payload_size;
@ -432,7 +446,7 @@ void Application::Start() {
});
protocol_->OnIncomingAudio([this](AudioStreamPacket&& packet) {
std::lock_guard<std::mutex> lock(mutex_);
if (audio_decode_queue_.size() < MAX_AUDIO_PACKETS_IN_QUEUE) {
if (device_state_ == kDeviceStateSpeaking && audio_decode_queue_.size() < MAX_AUDIO_PACKETS_IN_QUEUE) {
audio_decode_queue_.emplace_back(std::move(packet));
}
});
@ -442,7 +456,6 @@ void Application::Start() {
ESP_LOGW(TAG, "Server sample rate %d does not match device output sample rate %d, resampling may cause distortion",
protocol_->server_sample_rate(), codec->output_sample_rate());
}
SetDecodeSampleRate(protocol_->server_sample_rate(), protocol_->server_frame_duration());
#if CONFIG_IOT_PROTOCOL_XIAOZHI
auto& thing_manager = iot::ThingManager::GetInstance();
@ -600,28 +613,40 @@ void Application::Start() {
}
});
#if CONFIG_USE_WAKE_WORD_DETECT || CONFIG_USE_WAKE_WORD_DETECT_NO_AFE
wake_word_detect_.Initialize(codec);
#ifdef CONFIG_USE_WAKE_WORD_DETECT
wake_word_detect_.OnWakeWordDetected([this](const std::string& wake_word) {
wake_word_->Initialize(codec);
wake_word_->OnWakeWordDetected([this](const std::string& wake_word) {
Schedule([this, &wake_word]() {
if (device_state_ == kDeviceStateIdle) {
SetDeviceState(kDeviceStateConnecting);
wake_word_detect_.EncodeWakeWordData();
if (!protocol_) {
return;
}
if (!protocol_ || !protocol_->OpenAudioChannel()) {
wake_word_detect_.StartDetection();
return;
if (device_state_ == kDeviceStateIdle) {
wake_word_->EncodeWakeWordData();
if (!protocol_->IsAudioChannelOpened()) {
SetDeviceState(kDeviceStateConnecting);
if (!protocol_->OpenAudioChannel()) {
wake_word_->StartDetection();
return;
}
}
ESP_LOGI(TAG, "Wake word detected: %s", wake_word.c_str());
#if CONFIG_USE_AFE_WAKE_WORD
AudioStreamPacket packet;
// Encode and send the wake word data to the server
while (wake_word_detect_.GetWakeWordOpus(packet.payload)) {
while (wake_word_->GetWakeWordOpus(packet.payload)) {
protocol_->SendAudio(packet);
}
// Set the chat state to wake word detected
protocol_->SendWakeWordDetected(wake_word);
ESP_LOGI(TAG, "Wake word detected: %s", wake_word.c_str());
#else
// Play the pop up sound to indicate the wake word is detected
// And wait 60ms to make sure the queue has been processed by audio task
ResetDecoder();
PlaySound(Lang::Sounds::P3_POPUP);
vTaskDelay(pdMS_TO_TICKS(60));
#endif
SetListeningMode(aec_mode_ == kAecOff ? kListeningModeAutoStop : kListeningModeRealtime);
} else if (device_state_ == kDeviceStateSpeaking) {
AbortSpeaking(kAbortReasonWakeWordDetected);
@ -630,9 +655,7 @@ void Application::Start() {
}
});
});
#endif
wake_word_detect_.StartDetection();
#endif
wake_word_->StartDetection();
// Wait for the new version check to finish
xEventGroupWaitBits(event_group_, CHECK_NEW_VERSION_DONE_EVENT, pdTRUE, pdFALSE, portMAX_DELAY);
@ -751,17 +774,14 @@ void Application::OnAudioOutput() {
return;
}
if (device_state_ == kDeviceStateListening) {
audio_decode_queue_.clear();
audio_decode_cv_.notify_all();
return;
}
auto packet = std::move(audio_decode_queue_.front());
audio_decode_queue_.pop_front();
lock.unlock();
audio_decode_cv_.notify_all();
// Synchronize the sample rate and frame duration
SetDecodeSampleRate(packet.sample_rate, packet.frame_duration);
busy_decoding_audio_ = true;
background_task_->Schedule([this, codec, packet = std::move(packet)]() mutable {
busy_decoding_audio_ = false;
@ -782,45 +802,48 @@ void Application::OnAudioOutput() {
}
codec->OutputData(pcm);
#ifdef CONFIG_USE_SERVER_AEC
std::lock_guard<std::mutex> lock(timestamp_mutex_);
timestamp_queue_.push_back(packet.timestamp);
last_output_timestamp_ = packet.timestamp;
std::lock_guard<std::mutex> lock(timestamp_mutex_);
timestamp_queue_.push_back(packet.timestamp);
#endif
last_output_time_ = std::chrono::steady_clock::now();
});
}
void Application::OnAudioInput() {
#if CONFIG_USE_WAKE_WORD_DETECT || CONFIG_USE_WAKE_WORD_DETECT_NO_AFE
if (wake_word_detect_.IsDetectionRunning()) {
if (wake_word_->IsDetectionRunning()) {
std::vector<int16_t> data;
int samples = wake_word_detect_.GetFeedSize();
int samples = wake_word_->GetFeedSize();
if (samples > 0) {
ReadAudio(data, 16000, samples);
wake_word_detect_.Feed(data);
return;
if (ReadAudio(data, 16000, samples)) {
wake_word_->Feed(data);
return;
}
}
}
#endif
if (audio_processor_->IsRunning()) {
std::vector<int16_t> data;
int samples = audio_processor_->GetFeedSize();
if (samples > 0) {
ReadAudio(data, 16000, samples);
audio_processor_->Feed(data);
return;
if (ReadAudio(data, 16000, samples)) {
audio_processor_->Feed(data);
return;
}
}
}
vTaskDelay(pdMS_TO_TICKS(OPUS_FRAME_DURATION_MS / 2));
}
void Application::ReadAudio(std::vector<int16_t>& data, int sample_rate, int samples) {
bool Application::ReadAudio(std::vector<int16_t>& data, int sample_rate, int samples) {
auto codec = Board::GetInstance().GetAudioCodec();
if (!codec->input_enabled()) {
return false;
}
if (codec->input_sample_rate() != sample_rate) {
data.resize(samples * codec->input_sample_rate() / sample_rate);
if (!codec->InputData(data)) {
return;
return false;
}
if (codec->input_channels() == 2) {
auto mic_channel = std::vector<int16_t>(data.size() / 2);
@ -846,9 +869,10 @@ void Application::ReadAudio(std::vector<int16_t>& data, int sample_rate, int sam
} else {
data.resize(samples);
if (!codec->InputData(data)) {
return;
return false;
}
}
return true;
}
void Application::AbortSpeaking(AbortReason reason) {
@ -884,17 +908,13 @@ void Application::SetDeviceState(DeviceState state) {
display->SetStatus(Lang::Strings::STANDBY);
display->SetEmotion("neutral");
audio_processor_->Stop();
#if CONFIG_USE_WAKE_WORD_DETECT || CONFIG_USE_WAKE_WORD_DETECT_NO_AFE
wake_word_detect_.StartDetection();
#endif
wake_word_->StartDetection();
break;
case kDeviceStateConnecting:
display->SetStatus(Lang::Strings::CONNECTING);
display->SetEmotion("neutral");
display->SetChatMessage("system", "");
timestamp_queue_.clear();
last_output_timestamp_ = 0;
break;
case kDeviceStateListening:
display->SetStatus(Lang::Strings::LISTENING);
@ -909,14 +929,14 @@ void Application::SetDeviceState(DeviceState state) {
// Send the start listening command
protocol_->SendStartListening(listening_mode_);
if (previous_state == kDeviceStateSpeaking) {
audio_decode_queue_.clear();
audio_decode_cv_.notify_all();
// FIXME: Wait for the speaker to empty the buffer
vTaskDelay(pdMS_TO_TICKS(120));
}
opus_encoder_->ResetState();
#if CONFIG_USE_WAKE_WORD_DETECT || CONFIG_USE_WAKE_WORD_DETECT_NO_AFE
wake_word_detect_.StopDetection();
#endif
audio_processor_->Start();
wake_word_->StopDetection();
}
break;
case kDeviceStateSpeaking:
@ -924,8 +944,11 @@ void Application::SetDeviceState(DeviceState state) {
if (listening_mode_ != kListeningModeRealtime) {
audio_processor_->Stop();
#if CONFIG_USE_WAKE_WORD_DETECT || CONFIG_USE_WAKE_WORD_DETECT_NO_AFE
wake_word_detect_.StartDetection();
// Only AFE wake word can be detected in speaking mode
#if CONFIG_USE_AFE_WAKE_WORD
wake_word_->StartDetection();
#else
wake_word_->StopDetection();
#endif
}
ResetDecoder();

View File

@ -21,12 +21,7 @@
#include "ota.h"
#include "background_task.h"
#include "audio_processor.h"
#if CONFIG_USE_WAKE_WORD_DETECT
#include "wake_word_detect.h"
#elif CONFIG_USE_WAKE_WORD_DETECT_NO_AFE
#include "wake_word_no_afe.h"
#endif
#include "wake_word.h"
#define SCHEDULE_EVENT (1 << 0)
#define SEND_AUDIO_EVENT (1 << 1)
@ -83,14 +78,13 @@ public:
void SendMcpMessage(const std::string& payload);
void SetAecMode(AecMode mode);
AecMode GetAecMode() const { return aec_mode_; }
BackgroundTask* GetBackgroundTask() const { return background_task_; }
private:
Application();
~Application();
#if CONFIG_USE_WAKE_WORD_DETECT || CONFIG_USE_WAKE_WORD_DETECT_NO_AFE
WakeWordDetect wake_word_detect_;
#endif
std::unique_ptr<WakeWord> wake_word_;
std::unique_ptr<AudioProcessor> audio_processor_;
Ota ota_;
std::mutex mutex_;
@ -119,7 +113,6 @@ private:
// 新增用于维护音频包的timestamp队列
std::list<uint32_t> timestamp_queue_;
std::mutex timestamp_mutex_;
std::atomic<uint32_t> last_output_timestamp_ = 0;
std::unique_ptr<OpusEncoderWrapper> opus_encoder_;
std::unique_ptr<OpusDecoderWrapper> opus_decoder_;
@ -131,7 +124,7 @@ private:
void MainEventLoop();
void OnAudioInput();
void OnAudioOutput();
void ReadAudio(std::vector<int16_t>& data, int sample_rate, int samples);
bool ReadAudio(std::vector<int16_t>& data, int sample_rate, int samples);
void ResetDecoder();
void SetDecodeSampleRate(int sample_rate, int frame_duration);
void CheckNewVersion();

BIN
main/assets/common/popup.p3 Normal file

Binary file not shown.

View File

@ -3,7 +3,7 @@
#define PROCESSOR_RUNNING 0x01
static const char* TAG = "AfeAudioProcessor";
#define TAG "AfeAudioProcessor"
AfeAudioProcessor::AfeAudioProcessor()
: afe_data_(nullptr) {

View File

@ -1,4 +1,4 @@
#include "wake_word_detect.h"
#include "afe_wake_word.h"
#include "application.h"
#include <esp_log.h>
@ -8,9 +8,9 @@
#define DETECTION_RUNNING_EVENT 1
static const char* TAG = "WakeWordDetect";
#define TAG "AfeWakeWord"
WakeWordDetect::WakeWordDetect()
AfeWakeWord::AfeWakeWord()
: afe_data_(nullptr),
wake_word_pcm_(),
wake_word_opus_() {
@ -18,7 +18,7 @@ WakeWordDetect::WakeWordDetect()
event_group_ = xEventGroupCreate();
}
WakeWordDetect::~WakeWordDetect() {
AfeWakeWord::~AfeWakeWord() {
if (afe_data_ != nullptr) {
afe_iface_->destroy(afe_data_);
}
@ -30,7 +30,7 @@ WakeWordDetect::~WakeWordDetect() {
vEventGroupDelete(event_group_);
}
void WakeWordDetect::Initialize(AudioCodec* codec) {
void AfeWakeWord::Initialize(AudioCodec* codec) {
codec_ = codec;
int ref_num = codec_->input_reference() ? 1 : 0;
@ -67,46 +67,46 @@ void WakeWordDetect::Initialize(AudioCodec* codec) {
afe_data_ = afe_iface_->create_from_config(afe_config);
xTaskCreate([](void* arg) {
auto this_ = (WakeWordDetect*)arg;
auto this_ = (AfeWakeWord*)arg;
this_->AudioDetectionTask();
vTaskDelete(NULL);
}, "audio_detection", 4096, this, 3, nullptr);
}
void WakeWordDetect::OnWakeWordDetected(std::function<void(const std::string& wake_word)> callback) {
void AfeWakeWord::OnWakeWordDetected(std::function<void(const std::string& wake_word)> callback) {
wake_word_detected_callback_ = callback;
}
void WakeWordDetect::StartDetection() {
void AfeWakeWord::StartDetection() {
xEventGroupSetBits(event_group_, DETECTION_RUNNING_EVENT);
}
void WakeWordDetect::StopDetection() {
void AfeWakeWord::StopDetection() {
xEventGroupClearBits(event_group_, DETECTION_RUNNING_EVENT);
if (afe_data_ != nullptr) {
afe_iface_->reset_buffer(afe_data_);
}
}
bool WakeWordDetect::IsDetectionRunning() {
bool AfeWakeWord::IsDetectionRunning() {
return xEventGroupGetBits(event_group_) & DETECTION_RUNNING_EVENT;
}
void WakeWordDetect::Feed(const std::vector<int16_t>& data) {
void AfeWakeWord::Feed(const std::vector<int16_t>& data) {
if (afe_data_ == nullptr) {
return;
}
afe_iface_->feed(afe_data_, data.data());
}
size_t WakeWordDetect::GetFeedSize() {
size_t AfeWakeWord::GetFeedSize() {
if (afe_data_ == nullptr) {
return 0;
}
return afe_iface_->get_feed_chunksize(afe_data_) * codec_->input_channels();
}
void WakeWordDetect::AudioDetectionTask() {
void AfeWakeWord::AudioDetectionTask() {
auto fetch_size = afe_iface_->get_fetch_chunksize(afe_data_);
auto feed_size = afe_iface_->get_feed_chunksize(afe_data_);
ESP_LOGI(TAG, "Audio detection task started, feed size: %d fetch size: %d",
@ -121,7 +121,7 @@ void WakeWordDetect::AudioDetectionTask() {
}
// Store the wake word data for voice recognition, like who is speaking
StoreWakeWordData((uint16_t*)res->data, res->data_size / sizeof(uint16_t));
StoreWakeWordData(res->data, res->data_size / sizeof(int16_t));
if (res->wakeup_state == WAKENET_DETECTED) {
StopDetection();
@ -134,7 +134,7 @@ void WakeWordDetect::AudioDetectionTask() {
}
}
void WakeWordDetect::StoreWakeWordData(uint16_t* data, size_t samples) {
void AfeWakeWord::StoreWakeWordData(const int16_t* data, size_t samples) {
// store audio data to wake_word_pcm_
wake_word_pcm_.emplace_back(std::vector<int16_t>(data, data + samples));
// keep about 2 seconds of data, detect duration is 30ms (sample_rate == 16000, chunksize == 512)
@ -143,13 +143,13 @@ void WakeWordDetect::StoreWakeWordData(uint16_t* data, size_t samples) {
}
}
void WakeWordDetect::EncodeWakeWordData() {
void AfeWakeWord::EncodeWakeWordData() {
wake_word_opus_.clear();
if (wake_word_encode_task_stack_ == nullptr) {
wake_word_encode_task_stack_ = (StackType_t*)heap_caps_malloc(4096 * 8, MALLOC_CAP_SPIRAM);
}
wake_word_encode_task_ = xTaskCreateStatic([](void* arg) {
auto this_ = (WakeWordDetect*)arg;
auto this_ = (AfeWakeWord*)arg;
{
auto start_time = esp_timer_get_time();
auto encoder = std::make_unique<OpusEncoderWrapper>(16000, 1, OPUS_FRAME_DURATION_MS);
@ -176,7 +176,7 @@ void WakeWordDetect::EncodeWakeWordData() {
}, "encode_detect_packets", 4096 * 8, this, 2, wake_word_encode_task_stack_, &wake_word_encode_task_buffer_);
}
bool WakeWordDetect::GetWakeWordOpus(std::vector<uint8_t>& opus) {
bool AfeWakeWord::GetWakeWordOpus(std::vector<uint8_t>& opus) {
std::unique_lock<std::mutex> lock(wake_word_mutex_);
wake_word_cv_.wait(lock, [this]() {
return !wake_word_opus_.empty();

View File

@ -1,5 +1,5 @@
#ifndef WAKE_WORD_DETECT_H
#define WAKE_WORD_DETECT_H
#ifndef AFE_WAKE_WORD_H
#define AFE_WAKE_WORD_H
#include <freertos/FreeRTOS.h>
#include <freertos/task.h>
@ -16,11 +16,12 @@
#include <condition_variable>
#include "audio_codec.h"
#include "wake_word.h"
class WakeWordDetect {
class AfeWakeWord : public WakeWord {
public:
WakeWordDetect();
~WakeWordDetect();
AfeWakeWord();
~AfeWakeWord();
void Initialize(AudioCodec* codec);
void Feed(const std::vector<int16_t>& data);
@ -51,7 +52,7 @@ private:
std::mutex wake_word_mutex_;
std::condition_variable wake_word_cv_;
void StoreWakeWordData(uint16_t* data, size_t size);
void StoreWakeWordData(const int16_t* data, size_t size);
void AudioDetectionTask();
};

View File

@ -1,4 +1,4 @@
#include "wake_word_no_afe.h"
#include "esp_wake_word.h"
#include "application.h"
#include <esp_log.h>
@ -8,13 +8,13 @@
#define DETECTION_RUNNING_EVENT 1
static const char* TAG = "WakeWordDetect";
#define TAG "EspWakeWord"
WakeWordDetect::WakeWordDetect() {
EspWakeWord::EspWakeWord() {
event_group_ = xEventGroupCreate();
}
WakeWordDetect::~WakeWordDetect() {
EspWakeWord::~EspWakeWord() {
if (wakenet_data_ != nullptr) {
wakenet_iface_->destroy(wakenet_data_);
esp_srmodel_deinit(wakenet_model_);
@ -23,13 +23,16 @@ WakeWordDetect::~WakeWordDetect() {
vEventGroupDelete(event_group_);
}
void WakeWordDetect::Initialize(AudioCodec* codec) {
void EspWakeWord::Initialize(AudioCodec* codec) {
codec_ = codec;
wakenet_model_ = esp_srmodel_init("model");
if(wakenet_model_->num > 1) {
ESP_LOGW(TAG, "More than one model found, using the first one");
} else if (wakenet_model_->num == 0) {
ESP_LOGE(TAG, "No model found");
return;
}
char *model_name = wakenet_model_->model_name[0];
wakenet_iface_ = (esp_wn_iface_t*)esp_wn_handle_from_name(model_name);
@ -40,28 +43,46 @@ void WakeWordDetect::Initialize(AudioCodec* codec) {
ESP_LOGI(TAG, "Wake word(%s),freq: %d, chunksize: %d", model_name, frequency, audio_chunksize);
}
void WakeWordDetect::StartDetection() {
void EspWakeWord::OnWakeWordDetected(std::function<void(const std::string& wake_word)> callback) {
wake_word_detected_callback_ = callback;
}
void EspWakeWord::StartDetection() {
ESP_LOGI(TAG, "Start wake word detection");
xEventGroupSetBits(event_group_, DETECTION_RUNNING_EVENT);
}
void WakeWordDetect::StopDetection() {
void EspWakeWord::StopDetection() {
ESP_LOGI(TAG, "Stop wake word detection");
xEventGroupClearBits(event_group_, DETECTION_RUNNING_EVENT);
}
bool WakeWordDetect::IsDetectionRunning() {
bool EspWakeWord::IsDetectionRunning() {
return xEventGroupGetBits(event_group_) & DETECTION_RUNNING_EVENT;
}
void WakeWordDetect::Feed(const std::vector<int16_t>& data) {
void EspWakeWord::Feed(const std::vector<int16_t>& data) {
int res = wakenet_iface_->detect(wakenet_data_, (int16_t *)data.data());
if (res > 0) {
ESP_LOGI(TAG, "Wake word detected");
auto& app = Application::GetInstance();
app.ToggleChatState();
StopDetection();
last_detected_wake_word_ = wakenet_iface_->get_word_name(wakenet_data_, res);
if (wake_word_detected_callback_) {
wake_word_detected_callback_(last_detected_wake_word_);
}
}
}
size_t WakeWordDetect::GetFeedSize() {
size_t EspWakeWord::GetFeedSize() {
if (wakenet_data_ == nullptr) {
return 0;
}
return wakenet_iface_->get_samp_chunksize(wakenet_data_) * codec_->input_channels();
}
void EspWakeWord::EncodeWakeWordData() {
}
bool EspWakeWord::GetWakeWordOpus(std::vector<uint8_t>& opus) {
return false;
}

View File

@ -1,13 +1,13 @@
#ifndef WAKE_WORD_DETECT_H
#define WAKE_WORD_DETECT_H
#ifndef ESP_WAKE_WORD_H
#define ESP_WAKE_WORD_H
#include <freertos/FreeRTOS.h>
#include <freertos/task.h>
#include <freertos/event_groups.h>
#include "model_path.h"
#include "esp_wn_iface.h"
#include "esp_wn_models.h"
#include <esp_wn_iface.h>
#include <esp_wn_models.h>
#include <model_path.h>
#include <list>
#include <string>
@ -17,19 +17,23 @@
#include <condition_variable>
#include "audio_codec.h"
#include <model_path.h>
#include "wake_word.h"
class WakeWordDetect {
class EspWakeWord : public WakeWord {
public:
WakeWordDetect();
~WakeWordDetect();
EspWakeWord();
~EspWakeWord();
void Initialize(AudioCodec* codec);
void Feed(const std::vector<int16_t>& data);
void OnWakeWordDetected(std::function<void(const std::string& wake_word)> callback);
void StartDetection();
void StopDetection();
bool IsDetectionRunning();
size_t GetFeedSize();
void EncodeWakeWordData();
bool GetWakeWordOpus(std::vector<uint8_t>& opus);
const std::string& GetLastDetectedWakeWord() const { return last_detected_wake_word_; }
private:
esp_wn_iface_t *wakenet_iface_ = nullptr;
@ -37,6 +41,9 @@ private:
srmodel_list_t *wakenet_model_ = nullptr;
EventGroupHandle_t event_group_;
AudioCodec* codec_ = nullptr;
std::function<void(const std::string& wake_word)> wake_word_detected_callback_;
std::string last_detected_wake_word_;
};
#endif

View File

@ -1,13 +1,13 @@
#include "dummy_audio_processor.h"
#include "no_audio_processor.h"
#include <esp_log.h>
#define TAG "DummyAudioProcessor"
#define TAG "NoAudioProcessor"
void DummyAudioProcessor::Initialize(AudioCodec* codec) {
void NoAudioProcessor::Initialize(AudioCodec* codec) {
codec_ = codec;
}
void DummyAudioProcessor::Feed(const std::vector<int16_t>& data) {
void NoAudioProcessor::Feed(const std::vector<int16_t>& data) {
if (!is_running_ || !output_callback_) {
return;
}
@ -15,27 +15,27 @@ void DummyAudioProcessor::Feed(const std::vector<int16_t>& data) {
output_callback_(std::vector<int16_t>(data));
}
void DummyAudioProcessor::Start() {
void NoAudioProcessor::Start() {
is_running_ = true;
}
void DummyAudioProcessor::Stop() {
void NoAudioProcessor::Stop() {
is_running_ = false;
}
bool DummyAudioProcessor::IsRunning() {
bool NoAudioProcessor::IsRunning() {
return is_running_;
}
void DummyAudioProcessor::OnOutput(std::function<void(std::vector<int16_t>&& data)> callback) {
void NoAudioProcessor::OnOutput(std::function<void(std::vector<int16_t>&& data)> callback) {
output_callback_ = callback;
}
void DummyAudioProcessor::OnVadStateChange(std::function<void(bool speaking)> callback) {
void NoAudioProcessor::OnVadStateChange(std::function<void(bool speaking)> callback) {
vad_state_change_callback_ = callback;
}
size_t DummyAudioProcessor::GetFeedSize() {
size_t NoAudioProcessor::GetFeedSize() {
if (!codec_) {
return 0;
}
@ -43,7 +43,7 @@ size_t DummyAudioProcessor::GetFeedSize() {
return 30 * codec_->input_sample_rate() / 1000;
}
void DummyAudioProcessor::EnableDeviceAec(bool enable) {
void NoAudioProcessor::EnableDeviceAec(bool enable) {
if (enable) {
ESP_LOGE(TAG, "Device AEC is not supported");
}

View File

@ -7,10 +7,10 @@
#include "audio_processor.h"
#include "audio_codec.h"
class DummyAudioProcessor : public AudioProcessor {
class NoAudioProcessor : public AudioProcessor {
public:
DummyAudioProcessor() = default;
~DummyAudioProcessor() = default;
NoAudioProcessor() = default;
~NoAudioProcessor() = default;
void Initialize(AudioCodec* codec) override;
void Feed(const std::vector<int16_t>& data) override;

View File

@ -0,0 +1,45 @@
#include "no_wake_word.h"
#include <esp_log.h>
#define TAG "NoWakeWord"
// Remembers the codec handed in by the application. This stub performs no
// model loading or task creation; it only keeps the pointer.
void NoWakeWord::Initialize(AudioCodec* codec) {
    this->codec_ = codec;
}
// Accepts a chunk of PCM samples and discards it.
// The stub never inspects `data`, so no detection callback can result from a
// call to this function.
void NoWakeWord::Feed(const std::vector<int16_t>& data) {
// Intentionally empty: wake word processing is disabled in this build.
}
// Registers a detection callback -- in this stub the callback is dropped.
// `callback` is neither stored nor invoked by this function.
void NoWakeWord::OnWakeWordDetected(std::function<void(const std::string& wake_word)> callback) {
// Intentionally empty: there is no detector that could ever fire the callback.
}
// Request to start wake word detection; a no-op for this stub.
// IsDetectionRunning() keeps returning false afterwards.
void NoWakeWord::StartDetection() {
// Intentionally empty: there is no detector state to switch on.
}
// Request to stop wake word detection; a no-op for this stub.
void NoWakeWord::StopDetection() {
// Intentionally empty: there is no detector state to switch off.
}
// Always reports false, because detection never runs in this implementation.
bool NoWakeWord::IsDetectionRunning() {
    constexpr bool kDetectionRunning = false;
    return kDetectionRunning;
}
// Always returns 0: this implementation has no feed size requirement.
size_t NoWakeWord::GetFeedSize() {
    constexpr size_t kFeedSize = 0;
    return kFeedSize;
}
// No-op: no wake word audio is captured, so there is nothing to encode.
void NoWakeWord::EncodeWakeWordData() {
    return;
}
// Empties the caller's buffer and returns false, signalling that no opus
// data is available from this implementation.
bool NoWakeWord::GetWakeWordOpus(std::vector<uint8_t>& opus) {
    constexpr bool kHasOpusData = false;
    opus.clear();
    return kHasOpusData;
}
// Returns the last detected wake word. This implementation never detects
// anything, so the result is always an empty string.
//
// The function returns a reference, so the referenced object must outlive
// the call. `return "";` would bind the reference to a temporary std::string
// that is destroyed when the function returns, leaving callers with a
// dangling reference. A function-local static gives the empty string static
// storage duration instead.
const std::string& NoWakeWord::GetLastDetectedWakeWord() const {
    static const std::string kNoWakeWord;
    return kNoWakeWord;
}

View File

@ -0,0 +1,31 @@
#ifndef NO_WAKE_WORD_H
#define NO_WAKE_WORD_H
#include <vector>
#include <functional>
#include <string>
#include "wake_word.h"
#include "audio_codec.h"
// WakeWord implementation that performs no wake word processing.
// Per the build script, no_wake_word.cc is compiled when neither
// CONFIG_USE_AFE_WAKE_WORD nor CONFIG_USE_ESP_WAKE_WORD is enabled.
class NoWakeWord : public WakeWord {
public:
NoWakeWord() = default;
~NoWakeWord() = default;
// Stores the codec pointer; it is not used for any processing.
void Initialize(AudioCodec* codec) override;
// Ignores the supplied samples.
void Feed(const std::vector<int16_t>& data) override;
// Ignores the callback; it is never invoked by this implementation.
void OnWakeWordDetected(std::function<void(const std::string& wake_word)> callback) override;
// No-op.
void StartDetection() override;
// No-op.
void StopDetection() override;
// Always returns false.
bool IsDetectionRunning() override;
// Always returns 0.
size_t GetFeedSize() override;
// No-op.
void EncodeWakeWordData() override;
// Clears `opus` and returns false (no opus data available).
bool GetWakeWordOpus(std::vector<uint8_t>& opus) override;
// Reports the last detected wake word; this implementation has none to report.
const std::string& GetLastDetectedWakeWord() const override;
private:
// Codec passed to Initialize(); nullptr until then.
AudioCodec* codec_ = nullptr;
};
#endif

View File

@ -0,0 +1,26 @@
#ifndef WAKE_WORD_H
#define WAKE_WORD_H
#include <string>
#include <vector>
#include <functional>
#include "audio_codec.h"
// Abstract interface for wake word engines. Every operation is pure virtual;
// NoWakeWord is the no-op implementation of this interface.
class WakeWord {
public:
// Virtual so implementations can be destroyed through a WakeWord pointer.
virtual ~WakeWord() = default;
// Hands the audio codec to the implementation.
virtual void Initialize(AudioCodec* codec) = 0;
// Supplies a buffer of 16-bit audio samples to the implementation.
virtual void Feed(const std::vector<int16_t>& data) = 0;
// Registers a callback taking the wake word string.
// NOTE(review): presumably invoked once per detection; confirm against implementations.
virtual void OnWakeWordDetected(std::function<void(const std::string& wake_word)> callback) = 0;
virtual void StartDetection() = 0;
virtual void StopDetection() = 0;
virtual bool IsDetectionRunning() = 0;
// NOTE(review): presumably the number of samples expected per Feed() call; confirm.
// The no-op implementation returns 0.
virtual size_t GetFeedSize() = 0;
virtual void EncodeWakeWordData() = 0;
// Writes opus data into `opus`; the bool result indicates whether data was available
// (the no-op implementation clears `opus` and returns false).
virtual bool GetWakeWordOpus(std::vector<uint8_t>& opus) = 0;
// Returns by reference: implementations must return a string that outlives
// the call (a member or static), never a temporary.
virtual const std::string& GetLastDetectedWakeWord() const = 0;
};
#endif

View File

@ -30,7 +30,8 @@
"CONFIG_MBEDTLS_DYNAMIC_FREE_CONFIG_DATA=y",
"CONFIG_NEWLIB_NANO_FORMAT=y",
"CONFIG_MMAP_FILE_NAME_LENGTH=25",
"CONFIG_ESP_CONSOLE_NONE=y"
"CONFIG_ESP_CONSOLE_NONE=y",
"CONFIG_IOT_PROTOCOL_XIAOZHI=y"
]
}
]

View File

@ -70,7 +70,7 @@ private:
}
void InitializePowerSaveTimer() {
power_save_timer_ = new PowerSaveTimer(160, 60);
power_save_timer_ = new PowerSaveTimer(240, 60);
power_save_timer_->OnEnterSleepMode([this]() {
ESP_LOGI(TAG, "Enabling sleep mode");
auto display = GetDisplay();

View File

@ -5,7 +5,9 @@
"name": "lichuang-c3-dev",
"sdkconfig_append": [
"CONFIG_ESPTOOLPY_FLASHSIZE_8MB=y",
"CONFIG_PARTITION_TABLE_CUSTOM_FILENAME=\"partitions_8M.csv\""
"CONFIG_PARTITION_TABLE_CUSTOM_FILENAME=\"partitions_8M.csv\"",
"CONFIG_ESP_WIFI_ENTERPRISE_SUPPORT=n",
"CONFIG_LWIP_IPV6=n"
]
}
]

View File

@ -5,7 +5,8 @@
"name": "magiclick-c3-v2",
"sdkconfig_append": [
"CONFIG_PM_ENABLE=y",
"CONFIG_FREERTOS_USE_TICKLESS_IDLE=y"
"CONFIG_FREERTOS_USE_TICKLESS_IDLE=y",
"CONFIG_USE_ESP_WAKE_WORD=n"
]
}
]

View File

@ -5,7 +5,8 @@
"name": "magiclick-c3",
"sdkconfig_append": [
"CONFIG_PM_ENABLE=y",
"CONFIG_FREERTOS_USE_TICKLESS_IDLE=y"
"CONFIG_FREERTOS_USE_TICKLESS_IDLE=y",
"CONFIG_USE_ESP_WAKE_WORD=n"
]
}
]

View File

@ -5,7 +5,8 @@
"name": "xmini-c3",
"sdkconfig_append": [
"CONFIG_PM_ENABLE=y",
"CONFIG_FREERTOS_USE_TICKLESS_IDLE=y"
"CONFIG_FREERTOS_USE_TICKLESS_IDLE=y",
"CONFIG_USE_ESP_WAKE_WORD=y"
]
}
]

View File

@ -30,10 +30,10 @@ private:
Display* display_ = nullptr;
Button boot_button_;
bool press_to_talk_enabled_ = false;
PowerSaveTimer* power_save_timer_;
PowerSaveTimer* power_save_timer_ = nullptr;
void InitializePowerSaveTimer() {
power_save_timer_ = new PowerSaveTimer(160, 60);
power_save_timer_ = new PowerSaveTimer(160, 600);
power_save_timer_->OnEnterSleepMode([this]() {
ESP_LOGI(TAG, "Enabling sleep mode");
auto display = GetDisplay();
@ -130,7 +130,9 @@ private:
}
});
boot_button_.OnPressDown([this]() {
power_save_timer_->WakeUp();
if (power_save_timer_) {
power_save_timer_->WakeUp();
}
if (press_to_talk_enabled_) {
Application::GetInstance().StartListening();
}

View File

@ -227,6 +227,8 @@ bool MqttProtocol::OpenAudioChannel() {
auto nonce = (uint8_t*)data.data();
auto encrypted = (uint8_t*)data.data() + aes_nonce_.size();
AudioStreamPacket packet;
packet.sample_rate = server_sample_rate_;
packet.frame_duration = server_frame_duration_;
packet.timestamp = timestamp;
packet.payload.resize(decrypted_size);
int ret = mbedtls_aes_crypt_ctr(&aes_ctx_, decrypted_size, &nc_off, nonce, stream_block, encrypted, (uint8_t*)packet.payload.data());

View File

@ -8,6 +8,8 @@
#include <vector>
// One packet of streamed audio together with its stream parameters.
// Member order matters: callers build packets with designated initializers,
// which must follow declaration order.
struct AudioStreamPacket {
    // Sample rate of the audio in this packet; the protocol receive paths fill
    // it from the server's reported sample rate. 0 when unset.
    int sample_rate{0};
    // Frame duration reported by the server (presumably milliseconds; confirm
    // against the protocol handshake). 0 when unset.
    int frame_duration{0};
    // Packet timestamp; 0 when the wire format carries none.
    uint32_t timestamp{0};
    // Raw payload bytes.
    std::vector<uint8_t> payload;
};

View File

@ -124,6 +124,8 @@ bool WebsocketProtocol::OpenAudioChannel() {
bp2->payload_size = ntohl(bp2->payload_size);
auto payload = (uint8_t*)bp2->payload;
on_incoming_audio_(AudioStreamPacket{
.sample_rate = server_sample_rate_,
.frame_duration = server_frame_duration_,
.timestamp = bp2->timestamp,
.payload = std::vector<uint8_t>(payload, payload + bp2->payload_size)
});
@ -133,11 +135,15 @@ bool WebsocketProtocol::OpenAudioChannel() {
bp3->payload_size = ntohs(bp3->payload_size);
auto payload = (uint8_t*)bp3->payload;
on_incoming_audio_(AudioStreamPacket{
.sample_rate = server_sample_rate_,
.frame_duration = server_frame_duration_,
.timestamp = 0,
.payload = std::vector<uint8_t>(payload, payload + bp3->payload_size)
});
} else {
on_incoming_audio_(AudioStreamPacket{
.sample_rate = server_sample_rate_,
.frame_duration = server_frame_duration_,
.timestamp = 0,
.payload = std::vector<uint8_t>((uint8_t*)data, (uint8_t*)data + len)
});

View File

@ -1,2 +1,3 @@
CONFIG_ESPTOOLPY_FLASHSIZE_16MB=y
CONFIG_SR_WN_WN9S_NIHAOXIAOZHI=y