Add wake word to xmini-c3 (#730)

* esp-hi: MCP protocol is not ready yet

* Add wake word to xmini-c3
This commit is contained in:
Xiaoxia
2025-05-31 22:21:03 +08:00
committed by GitHub
parent 6cb025859f
commit ae57131c15
27 changed files with 399 additions and 155 deletions

View File

@@ -14,7 +14,15 @@
#if CONFIG_USE_AUDIO_PROCESSOR
#include "afe_audio_processor.h"
#else
#include "dummy_audio_processor.h"
#include "no_audio_processor.h"
#endif
#if CONFIG_USE_AFE_WAKE_WORD
#include "afe_wake_word.h"
#elif CONFIG_USE_ESP_WAKE_WORD
#include "esp_wake_word.h"
#else
#include "no_wake_word.h"
#endif
#include <cstring>
@@ -55,7 +63,15 @@ Application::Application() {
#if CONFIG_USE_AUDIO_PROCESSOR
audio_processor_ = std::make_unique<AfeAudioProcessor>();
#else
audio_processor_ = std::make_unique<DummyAudioProcessor>();
audio_processor_ = std::make_unique<NoAudioProcessor>();
#endif
#if CONFIG_USE_AFE_WAKE_WORD
wake_word_ = std::make_unique<AfeWakeWord>();
#elif CONFIG_USE_ESP_WAKE_WORD
wake_word_ = std::make_unique<EspWakeWord>();
#else
wake_word_ = std::make_unique<NoWakeWord>();
#endif
esp_timer_create_args_t clock_timer_args = {
@@ -129,9 +145,7 @@ void Application::CheckNewVersion() {
auto& board = Board::GetInstance();
board.SetPowerSaveMode(false);
#if CONFIG_USE_WAKE_WORD_DETECT || CONFIG_USE_WAKE_WORD_DETECT_NO_AFE
wake_word_detect_.StopDetection();
#endif
wake_word_->StopDetection();
// Turn off audio output in advance to avoid audio operations during the upgrade process
auto codec = board.GetAudioCodec();
codec->EnableInput(false);
@@ -256,8 +270,6 @@ void Application::PlaySound(const std::string_view& sound) {
}
background_task_->WaitForCompletion();
// The assets are encoded at 16000Hz, 60ms frame duration
SetDecodeSampleRate(16000, 60);
const char* data = sound.data();
size_t size = sound.size();
for (const char* p = data; p < data + size; ) {
@@ -266,6 +278,8 @@ void Application::PlaySound(const std::string_view& sound) {
auto payload_size = ntohs(p3->payload_size);
AudioStreamPacket packet;
packet.sample_rate = 16000;
packet.frame_duration = 60;
packet.payload.resize(payload_size);
memcpy(packet.payload.data(), p3->payload, payload_size);
p += payload_size;
@@ -432,7 +446,7 @@ void Application::Start() {
});
protocol_->OnIncomingAudio([this](AudioStreamPacket&& packet) {
std::lock_guard<std::mutex> lock(mutex_);
if (audio_decode_queue_.size() < MAX_AUDIO_PACKETS_IN_QUEUE) {
if (device_state_ == kDeviceStateSpeaking && audio_decode_queue_.size() < MAX_AUDIO_PACKETS_IN_QUEUE) {
audio_decode_queue_.emplace_back(std::move(packet));
}
});
@@ -442,7 +456,6 @@ void Application::Start() {
ESP_LOGW(TAG, "Server sample rate %d does not match device output sample rate %d, resampling may cause distortion",
protocol_->server_sample_rate(), codec->output_sample_rate());
}
SetDecodeSampleRate(protocol_->server_sample_rate(), protocol_->server_frame_duration());
#if CONFIG_IOT_PROTOCOL_XIAOZHI
auto& thing_manager = iot::ThingManager::GetInstance();
@@ -600,28 +613,40 @@ void Application::Start() {
}
});
#if CONFIG_USE_WAKE_WORD_DETECT || CONFIG_USE_WAKE_WORD_DETECT_NO_AFE
wake_word_detect_.Initialize(codec);
#ifdef CONFIG_USE_WAKE_WORD_DETECT
wake_word_detect_.OnWakeWordDetected([this](const std::string& wake_word) {
wake_word_->Initialize(codec);
wake_word_->OnWakeWordDetected([this](const std::string& wake_word) {
// Copy wake_word into the scheduled task so the task does not depend on the
// caller's string staying alive (or unchanged) after this callback returns.
Schedule([this, wake_word]() {
if (device_state_ == kDeviceStateIdle) {
SetDeviceState(kDeviceStateConnecting);
wake_word_detect_.EncodeWakeWordData();
if (!protocol_) {
return;
}
if (!protocol_ || !protocol_->OpenAudioChannel()) {
wake_word_detect_.StartDetection();
return;
if (device_state_ == kDeviceStateIdle) {
wake_word_->EncodeWakeWordData();
if (!protocol_->IsAudioChannelOpened()) {
SetDeviceState(kDeviceStateConnecting);
if (!protocol_->OpenAudioChannel()) {
wake_word_->StartDetection();
return;
}
}
ESP_LOGI(TAG, "Wake word detected: %s", wake_word.c_str());
#if CONFIG_USE_AFE_WAKE_WORD
AudioStreamPacket packet;
// Encode and send the wake word data to the server
while (wake_word_detect_.GetWakeWordOpus(packet.payload)) {
while (wake_word_->GetWakeWordOpus(packet.payload)) {
protocol_->SendAudio(packet);
}
// Set the chat state to wake word detected
protocol_->SendWakeWordDetected(wake_word);
ESP_LOGI(TAG, "Wake word detected: %s", wake_word.c_str());
#else
// Play the pop up sound to indicate the wake word is detected
// And wait 60ms to make sure the queue has been processed by audio task
ResetDecoder();
PlaySound(Lang::Sounds::P3_POPUP);
vTaskDelay(pdMS_TO_TICKS(60));
#endif
SetListeningMode(aec_mode_ == kAecOff ? kListeningModeAutoStop : kListeningModeRealtime);
} else if (device_state_ == kDeviceStateSpeaking) {
AbortSpeaking(kAbortReasonWakeWordDetected);
@@ -630,9 +655,7 @@ void Application::Start() {
}
});
});
#endif
wake_word_detect_.StartDetection();
#endif
wake_word_->StartDetection();
// Wait for the new version check to finish
xEventGroupWaitBits(event_group_, CHECK_NEW_VERSION_DONE_EVENT, pdTRUE, pdFALSE, portMAX_DELAY);
@@ -751,17 +774,14 @@ void Application::OnAudioOutput() {
return;
}
if (device_state_ == kDeviceStateListening) {
audio_decode_queue_.clear();
audio_decode_cv_.notify_all();
return;
}
auto packet = std::move(audio_decode_queue_.front());
audio_decode_queue_.pop_front();
lock.unlock();
audio_decode_cv_.notify_all();
// Synchronize the sample rate and frame duration
SetDecodeSampleRate(packet.sample_rate, packet.frame_duration);
busy_decoding_audio_ = true;
background_task_->Schedule([this, codec, packet = std::move(packet)]() mutable {
busy_decoding_audio_ = false;
@@ -782,45 +802,48 @@ void Application::OnAudioOutput() {
}
codec->OutputData(pcm);
#ifdef CONFIG_USE_SERVER_AEC
std::lock_guard<std::mutex> lock(timestamp_mutex_);
timestamp_queue_.push_back(packet.timestamp);
last_output_timestamp_ = packet.timestamp;
std::lock_guard<std::mutex> lock(timestamp_mutex_);
timestamp_queue_.push_back(packet.timestamp);
#endif
last_output_time_ = std::chrono::steady_clock::now();
});
}
void Application::OnAudioInput() {
#if CONFIG_USE_WAKE_WORD_DETECT || CONFIG_USE_WAKE_WORD_DETECT_NO_AFE
if (wake_word_detect_.IsDetectionRunning()) {
if (wake_word_->IsDetectionRunning()) {
std::vector<int16_t> data;
int samples = wake_word_detect_.GetFeedSize();
int samples = wake_word_->GetFeedSize();
if (samples > 0) {
ReadAudio(data, 16000, samples);
wake_word_detect_.Feed(data);
return;
if (ReadAudio(data, 16000, samples)) {
wake_word_->Feed(data);
return;
}
}
}
#endif
if (audio_processor_->IsRunning()) {
std::vector<int16_t> data;
int samples = audio_processor_->GetFeedSize();
if (samples > 0) {
ReadAudio(data, 16000, samples);
audio_processor_->Feed(data);
return;
if (ReadAudio(data, 16000, samples)) {
audio_processor_->Feed(data);
return;
}
}
}
vTaskDelay(pdMS_TO_TICKS(OPUS_FRAME_DURATION_MS / 2));
}
void Application::ReadAudio(std::vector<int16_t>& data, int sample_rate, int samples) {
bool Application::ReadAudio(std::vector<int16_t>& data, int sample_rate, int samples) {
auto codec = Board::GetInstance().GetAudioCodec();
if (!codec->input_enabled()) {
return false;
}
if (codec->input_sample_rate() != sample_rate) {
data.resize(samples * codec->input_sample_rate() / sample_rate);
if (!codec->InputData(data)) {
return;
return false;
}
if (codec->input_channels() == 2) {
auto mic_channel = std::vector<int16_t>(data.size() / 2);
@@ -846,9 +869,10 @@ void Application::ReadAudio(std::vector<int16_t>& data, int sample_rate, int sam
} else {
data.resize(samples);
if (!codec->InputData(data)) {
return;
return false;
}
}
return true;
}
void Application::AbortSpeaking(AbortReason reason) {
@@ -884,17 +908,13 @@ void Application::SetDeviceState(DeviceState state) {
display->SetStatus(Lang::Strings::STANDBY);
display->SetEmotion("neutral");
audio_processor_->Stop();
#if CONFIG_USE_WAKE_WORD_DETECT || CONFIG_USE_WAKE_WORD_DETECT_NO_AFE
wake_word_detect_.StartDetection();
#endif
wake_word_->StartDetection();
break;
case kDeviceStateConnecting:
display->SetStatus(Lang::Strings::CONNECTING);
display->SetEmotion("neutral");
display->SetChatMessage("system", "");
timestamp_queue_.clear();
last_output_timestamp_ = 0;
break;
case kDeviceStateListening:
display->SetStatus(Lang::Strings::LISTENING);
@@ -909,14 +929,14 @@ void Application::SetDeviceState(DeviceState state) {
// Send the start listening command
protocol_->SendStartListening(listening_mode_);
if (previous_state == kDeviceStateSpeaking) {
audio_decode_queue_.clear();
audio_decode_cv_.notify_all();
// FIXME: Wait for the speaker to empty the buffer
vTaskDelay(pdMS_TO_TICKS(120));
}
opus_encoder_->ResetState();
#if CONFIG_USE_WAKE_WORD_DETECT || CONFIG_USE_WAKE_WORD_DETECT_NO_AFE
wake_word_detect_.StopDetection();
#endif
audio_processor_->Start();
wake_word_->StopDetection();
}
break;
case kDeviceStateSpeaking:
@@ -924,8 +944,11 @@ void Application::SetDeviceState(DeviceState state) {
if (listening_mode_ != kListeningModeRealtime) {
audio_processor_->Stop();
#if CONFIG_USE_WAKE_WORD_DETECT || CONFIG_USE_WAKE_WORD_DETECT_NO_AFE
wake_word_detect_.StartDetection();
// Only AFE wake word can be detected in speaking mode
#if CONFIG_USE_AFE_WAKE_WORD
wake_word_->StartDetection();
#else
wake_word_->StopDetection();
#endif
}
ResetDecoder();