forked from xiaozhi/xiaozhi-esp32
send wake word audio to the server
This commit is contained in:
@@ -40,6 +40,16 @@ Application::~Application() {
|
|||||||
esp_afe_vc_v1.destroy(afe_communication_data_);
|
esp_afe_vc_v1.destroy(afe_communication_data_);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (wake_word_encode_task_stack_ != nullptr) {
|
||||||
|
free(wake_word_encode_task_stack_);
|
||||||
|
}
|
||||||
|
for (auto& pcm : wake_word_pcm_) {
|
||||||
|
free(pcm.iov_base);
|
||||||
|
}
|
||||||
|
for (auto& opus : wake_word_opus_) {
|
||||||
|
free(opus.iov_base);
|
||||||
|
}
|
||||||
|
|
||||||
if (opus_decoder_ != nullptr) {
|
if (opus_decoder_ != nullptr) {
|
||||||
opus_decoder_destroy(opus_decoder_);
|
opus_decoder_destroy(opus_decoder_);
|
||||||
}
|
}
|
||||||
@@ -56,6 +66,7 @@ Application::~Application() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
void Application::Start() {
|
void Application::Start() {
|
||||||
|
// Initialize the audio device
|
||||||
audio_device_.Start(CONFIG_AUDIO_INPUT_SAMPLE_RATE, CONFIG_AUDIO_OUTPUT_SAMPLE_RATE);
|
audio_device_.Start(CONFIG_AUDIO_INPUT_SAMPLE_RATE, CONFIG_AUDIO_OUTPUT_SAMPLE_RATE);
|
||||||
audio_device_.OnStateChanged([this]() {
|
audio_device_.OnStateChanged([this]() {
|
||||||
if (audio_device_.playing()) {
|
if (audio_device_.playing()) {
|
||||||
@@ -83,11 +94,17 @@ void Application::Start() {
|
|||||||
app->AudioDecodeTask();
|
app->AudioDecodeTask();
|
||||||
}, "opus_decode", opus_stack_size, this, 1, audio_decode_task_stack_, &audio_decode_task_buffer_);
|
}, "opus_decode", opus_stack_size, this, 1, audio_decode_task_stack_, &audio_decode_task_buffer_);
|
||||||
|
|
||||||
|
// Blink the LED to indicate the device is connecting
|
||||||
|
builtin_led_.SetBlue();
|
||||||
|
builtin_led_.BlinkOnce();
|
||||||
wifi_station_.Start();
|
wifi_station_.Start();
|
||||||
|
|
||||||
StartCommunication();
|
StartCommunication();
|
||||||
StartDetection();
|
StartDetection();
|
||||||
|
|
||||||
|
// Blink the LED to indicate the device is running
|
||||||
|
builtin_led_.SetGreen();
|
||||||
|
builtin_led_.BlinkOnce();
|
||||||
xEventGroupSetBits(event_group_, DETECTION_RUNNING);
|
xEventGroupSetBits(event_group_, DETECTION_RUNNING);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -113,13 +130,19 @@ void Application::SetChatState(ChatState state) {
|
|||||||
builtin_led_.SetGreen();
|
builtin_led_.SetGreen();
|
||||||
builtin_led_.TurnOn();
|
builtin_led_.TurnOn();
|
||||||
break;
|
break;
|
||||||
|
case kChatStateWakeWordDetected:
|
||||||
|
ESP_LOGI(TAG, "Chat state: wake word detected");
|
||||||
|
builtin_led_.SetBlue();
|
||||||
|
builtin_led_.TurnOn();
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const char* state_str[] = { "idle", "connecting", "listening", "speaking", "wake_word_detected", "unknown" };
|
||||||
std::lock_guard<std::recursive_mutex> lock(mutex_);
|
std::lock_guard<std::recursive_mutex> lock(mutex_);
|
||||||
if (ws_client_ && ws_client_->IsConnected()) {
|
if (ws_client_ && ws_client_->IsConnected()) {
|
||||||
cJSON* root = cJSON_CreateObject();
|
cJSON* root = cJSON_CreateObject();
|
||||||
cJSON_AddStringToObject(root, "type", "state");
|
cJSON_AddStringToObject(root, "type", "state");
|
||||||
cJSON_AddStringToObject(root, "state", chat_state_ == kChatStateListening ? "listening" : "speaking");
|
cJSON_AddStringToObject(root, "state", state_str[chat_state_]);
|
||||||
char* json = cJSON_PrintUnformatted(root);
|
char* json = cJSON_PrintUnformatted(root);
|
||||||
ws_client_->Send(json);
|
ws_client_->Send(json);
|
||||||
cJSON_Delete(root);
|
cJSON_Delete(root);
|
||||||
@@ -232,6 +255,55 @@ void Application::AudioFeedTask() {
|
|||||||
vTaskDelete(NULL);
|
vTaskDelete(NULL);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void Application::StoreWakeWordData(uint8_t* data, size_t size) {
|
||||||
|
// store audio data to detect_packets_
|
||||||
|
auto iov = (iovec){
|
||||||
|
.iov_base = heap_caps_malloc(size, MALLOC_CAP_SPIRAM),
|
||||||
|
.iov_len = size
|
||||||
|
};
|
||||||
|
memcpy(iov.iov_base, data, size);
|
||||||
|
wake_word_pcm_.push_back(iov);
|
||||||
|
// remove the oldest packet if the size is larger than 50, about 2 seconds
|
||||||
|
if (wake_word_pcm_.size() > 50) {
|
||||||
|
heap_caps_free(wake_word_pcm_.front().iov_base);
|
||||||
|
wake_word_pcm_.pop_front();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void Application::EncodeWakeWordData() {
|
||||||
|
wake_word_opus_.clear();
|
||||||
|
if (wake_word_encode_task_stack_ == nullptr) {
|
||||||
|
wake_word_encode_task_stack_ = (StackType_t*)malloc(4096 * 8);
|
||||||
|
}
|
||||||
|
wake_word_encode_task_ = xTaskCreateStatic([](void* arg) {
|
||||||
|
Application* app = (Application*)arg;
|
||||||
|
// encode detect packets
|
||||||
|
for (auto& pcm : app->wake_word_pcm_) {
|
||||||
|
app->opus_encoder_.Encode(pcm, [app](const iovec opus) {
|
||||||
|
// append the opus data to the packet
|
||||||
|
iovec iov = {
|
||||||
|
.iov_base = heap_caps_malloc(opus.iov_len, MALLOC_CAP_SPIRAM),
|
||||||
|
.iov_len = opus.iov_len
|
||||||
|
};
|
||||||
|
memcpy(iov.iov_base, opus.iov_base, opus.iov_len);
|
||||||
|
app->wake_word_opus_.push_back(iov);
|
||||||
|
});
|
||||||
|
free(pcm.iov_base);
|
||||||
|
}
|
||||||
|
app->wake_word_pcm_.clear();
|
||||||
|
xEventGroupSetBits(app->event_group_, DETECT_PACKETS_ENCODED);
|
||||||
|
vTaskDelete(NULL);
|
||||||
|
}, "encode_detect_packets", 4096 * 8, this, 1, wake_word_encode_task_stack_, &wake_word_encode_task_buffer_);
|
||||||
|
}
|
||||||
|
|
||||||
|
void Application::SendWakeWordData() {
|
||||||
|
for (auto& opus: wake_word_opus_) {
|
||||||
|
ws_client_->Send(opus.iov_base, opus.iov_len, true);
|
||||||
|
heap_caps_free(opus.iov_base);
|
||||||
|
}
|
||||||
|
wake_word_opus_.clear();
|
||||||
|
}
|
||||||
|
|
||||||
void Application::AudioDetectionTask() {
|
void Application::AudioDetectionTask() {
|
||||||
auto chunk_size = esp_afe_sr_v1.get_fetch_chunksize(afe_detection_data_);
|
auto chunk_size = esp_afe_sr_v1.get_fetch_chunksize(afe_detection_data_);
|
||||||
ESP_LOGI(TAG, "Audio detection task started, chunk size: %d", chunk_size);
|
ESP_LOGI(TAG, "Audio detection task started, chunk size: %d", chunk_size);
|
||||||
@@ -248,17 +320,31 @@ void Application::AudioDetectionTask() {
|
|||||||
continue;;
|
continue;;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (res->wakeup_state == WAKENET_DETECTED) {
|
// Store the wake word data for voice recognition, like who is speaking
|
||||||
ESP_LOGI(TAG, "Wakenet detected");
|
StoreWakeWordData((uint8_t*)res->data, res->data_size);
|
||||||
|
|
||||||
|
if (res->wakeup_state == WAKENET_DETECTED) {
|
||||||
xEventGroupClearBits(event_group_, DETECTION_RUNNING);
|
xEventGroupClearBits(event_group_, DETECTION_RUNNING);
|
||||||
SetChatState(kChatStateConnecting);
|
SetChatState(kChatStateConnecting);
|
||||||
|
|
||||||
|
// Encode the wake word data and start websocket client at the same time
|
||||||
|
// They both consume a lot of time (700ms), so we can do them in parallel
|
||||||
|
EncodeWakeWordData();
|
||||||
StartWebSocketClient();
|
StartWebSocketClient();
|
||||||
|
|
||||||
|
// Here the websocket is done, and we also wait for the wake word data to be encoded
|
||||||
|
xEventGroupWaitBits(event_group_, DETECT_PACKETS_ENCODED, pdFALSE, pdTRUE, portMAX_DELAY);
|
||||||
|
|
||||||
std::lock_guard<std::recursive_mutex> lock(mutex_);
|
std::lock_guard<std::recursive_mutex> lock(mutex_);
|
||||||
if (ws_client_ && ws_client_->IsConnected()) {
|
if (ws_client_ && ws_client_->IsConnected()) {
|
||||||
|
// Send the wake word data to the server
|
||||||
|
SendWakeWordData();
|
||||||
|
// Send a ready message to indicate the server that the wake word data is sent
|
||||||
|
SetChatState(kChatStateWakeWordDetected);
|
||||||
// If connected, the hello message is already sent, so we can start communication
|
// If connected, the hello message is already sent, so we can start communication
|
||||||
xEventGroupSetBits(event_group_, COMMUNICATION_RUNNING);
|
xEventGroupSetBits(event_group_, COMMUNICATION_RUNNING);
|
||||||
|
|
||||||
|
ESP_LOGI(TAG, "Start communication after wake word detected");
|
||||||
} else {
|
} else {
|
||||||
SetChatState(kChatStateIdle);
|
SetChatState(kChatStateIdle);
|
||||||
xEventGroupSetBits(event_group_, DETECTION_RUNNING);
|
xEventGroupSetBits(event_group_, DETECTION_RUNNING);
|
||||||
@@ -319,6 +405,11 @@ void Application::AudioEncodeTask() {
|
|||||||
iovec pcm;
|
iovec pcm;
|
||||||
xQueueReceive(audio_encode_queue_, &pcm, portMAX_DELAY);
|
xQueueReceive(audio_encode_queue_, &pcm, portMAX_DELAY);
|
||||||
|
|
||||||
|
if (pcm.iov_len == 0) {
|
||||||
|
ESP_LOGE(TAG, "Empty audio data");
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
// Encode audio data
|
// Encode audio data
|
||||||
opus_encoder_.Encode(pcm, [this](const iovec opus) {
|
opus_encoder_.Encode(pcm, [this](const iovec opus) {
|
||||||
std::lock_guard<std::recursive_mutex> lock(mutex_);
|
std::lock_guard<std::recursive_mutex> lock(mutex_);
|
||||||
|
|||||||
@@ -15,9 +15,11 @@
|
|||||||
#include "esp_afe_sr_models.h"
|
#include "esp_afe_sr_models.h"
|
||||||
#include "esp_nsn_models.h"
|
#include "esp_nsn_models.h"
|
||||||
#include <mutex>
|
#include <mutex>
|
||||||
|
#include <list>
|
||||||
|
|
||||||
#define DETECTION_RUNNING 1
|
#define DETECTION_RUNNING 1
|
||||||
#define COMMUNICATION_RUNNING 2
|
#define COMMUNICATION_RUNNING 2
|
||||||
|
#define DETECT_PACKETS_ENCODED 4
|
||||||
|
|
||||||
|
|
||||||
enum ChatState {
|
enum ChatState {
|
||||||
@@ -25,6 +27,7 @@ enum ChatState {
|
|||||||
kChatStateConnecting,
|
kChatStateConnecting,
|
||||||
kChatStateListening,
|
kChatStateListening,
|
||||||
kChatStateSpeaking,
|
kChatStateSpeaking,
|
||||||
|
kChatStateWakeWordDetected
|
||||||
};
|
};
|
||||||
|
|
||||||
class Application {
|
class Application {
|
||||||
@@ -65,11 +68,20 @@ private:
|
|||||||
int opus_decode_sample_rate_ = CONFIG_AUDIO_OUTPUT_SAMPLE_RATE;
|
int opus_decode_sample_rate_ = CONFIG_AUDIO_OUTPUT_SAMPLE_RATE;
|
||||||
silk_resampler_state_struct resampler_state_;
|
silk_resampler_state_struct resampler_state_;
|
||||||
|
|
||||||
|
TaskHandle_t wake_word_encode_task_ = nullptr;
|
||||||
|
StaticTask_t wake_word_encode_task_buffer_;
|
||||||
|
StackType_t* wake_word_encode_task_stack_ = nullptr;
|
||||||
|
std::list<iovec> wake_word_pcm_;
|
||||||
|
std::vector<iovec> wake_word_opus_;
|
||||||
|
|
||||||
void SetDecodeSampleRate(int sample_rate);
|
void SetDecodeSampleRate(int sample_rate);
|
||||||
void SetChatState(ChatState state);
|
void SetChatState(ChatState state);
|
||||||
void StartDetection();
|
void StartDetection();
|
||||||
void StartCommunication();
|
void StartCommunication();
|
||||||
void StartWebSocketClient();
|
void StartWebSocketClient();
|
||||||
|
void StoreWakeWordData(uint8_t* data, size_t size);
|
||||||
|
void EncodeWakeWordData();
|
||||||
|
void SendWakeWordData();
|
||||||
|
|
||||||
void AudioFeedTask();
|
void AudioFeedTask();
|
||||||
void AudioDetectionTask();
|
void AudioDetectionTask();
|
||||||
|
|||||||
@@ -83,7 +83,7 @@ void BuiltinLed::Blink(int times, int interval_ms) {
|
|||||||
delete args;
|
delete args;
|
||||||
this_->blink_task_ = nullptr;
|
this_->blink_task_ = nullptr;
|
||||||
vTaskDelete(NULL);
|
vTaskDelete(NULL);
|
||||||
}, "blink", 1024, args, tskIDLE_PRIORITY, &blink_task_);
|
}, "blink", 4096, args, tskIDLE_PRIORITY, &blink_task_);
|
||||||
|
|
||||||
xSemaphoreGive(mutex_);
|
xSemaphoreGive(mutex_);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -58,8 +58,8 @@ extern "C" void app_main(void)
|
|||||||
|
|
||||||
// Dump CPU usage every 1 second
|
// Dump CPU usage every 1 second
|
||||||
while (true) {
|
while (true) {
|
||||||
vTaskDelay(2000 / portTICK_PERIOD_MS);
|
vTaskDelay(5000 / portTICK_PERIOD_MS);
|
||||||
SystemInfo::PrintRealTimeStats(STATS_TICKS);
|
// SystemInfo::PrintRealTimeStats(STATS_TICKS);
|
||||||
int free_sram = heap_caps_get_minimum_free_size(MALLOC_CAP_INTERNAL);
|
int free_sram = heap_caps_get_minimum_free_size(MALLOC_CAP_INTERNAL);
|
||||||
ESP_LOGI(TAG, "Free heap size: %u minimal internal: %u", SystemInfo::GetFreeHeapSize(), free_sram);
|
ESP_LOGI(TAG, "Free heap size: %u minimal internal: %u", SystemInfo::GetFreeHeapSize(), free_sram);
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user