Add dummy audio processor

Terrence
2025-04-29 18:17:08 +08:00
parent c201e5955f
commit f6334246ca
8 changed files with 171 additions and 93 deletions

CMakeLists.txt

@@ -157,7 +157,9 @@ file(GLOB BOARD_SOURCES
 list(APPEND SOURCES ${BOARD_SOURCES})
 if(CONFIG_USE_AUDIO_PROCESSOR)
-    list(APPEND SOURCES "audio_processing/audio_processor.cc")
+    list(APPEND SOURCES "audio_processing/afe_audio_processor.cc")
+else()
+    list(APPEND SOURCES "audio_processing/dummy_audio_processor.cc")
 endif()

 if(CONFIG_USE_WAKE_WORD_DETECT)
     list(APPEND SOURCES "audio_processing/wake_word_detect.cc")

application.cc

@@ -10,6 +10,12 @@
 #include "iot/thing_manager.h"
 #include "assets/lang_config.h"
+
+#if CONFIG_USE_AUDIO_PROCESSOR
+#include "afe_audio_processor.h"
+#else
+#include "dummy_audio_processor.h"
+#endif

 #include <cstring>
 #include <esp_log.h>
 #include <cJSON.h>
@@ -37,6 +43,12 @@ Application::Application() {
     event_group_ = xEventGroupCreate();
     background_task_ = new BackgroundTask(4096 * 8);

+#if CONFIG_USE_AUDIO_PROCESSOR
+    audio_processor_ = std::make_unique<AfeAudioProcessor>();
+#else
+    audio_processor_ = std::make_unique<DummyAudioProcessor>();
+#endif
+
     esp_timer_create_args_t clock_timer_args = {
         .callback = [](void* arg) {
             Application* app = (Application*)arg;
@@ -502,9 +514,8 @@ void Application::Start() {
     });

     bool protocol_started = protocol_->Start();
-#if CONFIG_USE_AUDIO_PROCESSOR
-    audio_processor_.Initialize(codec, realtime_chat_enabled_);
-    audio_processor_.OnOutput([this](std::vector<int16_t>&& data) {
+    audio_processor_->Initialize(codec, realtime_chat_enabled_);
+    audio_processor_->OnOutput([this](std::vector<int16_t>&& data) {
         background_task_->Schedule([this, data = std::move(data)]() mutable {
             if (protocol_->IsAudioChannelBusy()) {
                 return;
@@ -520,7 +531,7 @@ void Application::Start() {
             });
         });
     });
-    audio_processor_.OnVadStateChange([this](bool speaking) {
+    audio_processor_->OnVadStateChange([this](bool speaking) {
         if (device_state_ == kDeviceStateListening) {
             Schedule([this, speaking]() {
                 if (speaking) {
@@ -533,7 +544,6 @@ void Application::Start() {
             });
         }
     });
-#endif

 #if CONFIG_USE_WAKE_WORD_DETECT
     wake_word_detect_.Initialize(codec);
@@ -716,37 +726,16 @@ void Application::OnAudioInput() {
         }
     }
 #endif
-#if CONFIG_USE_AUDIO_PROCESSOR
-    if (audio_processor_.IsRunning()) {
+
+    if (audio_processor_->IsRunning()) {
         std::vector<int16_t> data;
-        int samples = audio_processor_.GetFeedSize();
+        int samples = audio_processor_->GetFeedSize();
         if (samples > 0) {
             ReadAudio(data, 16000, samples);
-            audio_processor_.Feed(data);
+            audio_processor_->Feed(data);
             return;
         }
     }
-#else
-    if (device_state_ == kDeviceStateListening) {
-        std::vector<int16_t> data;
-        ReadAudio(data, 16000, 30 * 16000 / 1000);
-        background_task_->Schedule([this, data = std::move(data)]() mutable {
-            if (protocol_->IsAudioChannelBusy()) {
-                return;
-            }
-            opus_encoder_->Encode(std::move(data), [this](std::vector<uint8_t>&& opus) {
-                AudioStreamPacket packet;
-                packet.payload = std::move(opus);
-                packet.timestamp = last_output_timestamp_;
-                last_output_timestamp_ = 0;
-                Schedule([this, packet = std::move(packet)]() {
-                    protocol_->SendAudio(packet);
-                });
-            });
-        });
-        return;
-    }
-#endif

     vTaskDelay(pdMS_TO_TICKS(30));
 }
@@ -818,9 +807,7 @@ void Application::SetDeviceState(DeviceState state) {
         case kDeviceStateIdle:
             display->SetStatus(Lang::Strings::STANDBY);
             display->SetEmotion("neutral");
-#if CONFIG_USE_AUDIO_PROCESSOR
-            audio_processor_.Stop();
-#endif
+            audio_processor_->Stop();
 #if CONFIG_USE_WAKE_WORD_DETECT
             wake_word_detect_.StartDetection();
 #endif
@@ -838,11 +825,7 @@ void Application::SetDeviceState(DeviceState state) {
             UpdateIotStates();

             // Make sure the audio processor is running
-#if CONFIG_USE_AUDIO_PROCESSOR
-            if (!audio_processor_.IsRunning()) {
-#else
-            if (true) {
-#endif
+            if (!audio_processor_->IsRunning()) {
                 // Send the start listening command
                 protocol_->SendStartListening(listening_mode_);
                 if (listening_mode_ == kListeningModeAutoStop && previous_state == kDeviceStateSpeaking) {
@@ -853,18 +836,14 @@ void Application::SetDeviceState(DeviceState state) {
 #if CONFIG_USE_WAKE_WORD_DETECT
                 wake_word_detect_.StopDetection();
 #endif
-#if CONFIG_USE_AUDIO_PROCESSOR
-                audio_processor_.Start();
-#endif
+                audio_processor_->Start();
             }
             break;

         case kDeviceStateSpeaking:
             display->SetStatus(Lang::Strings::SPEAKING);
             if (listening_mode_ != kListeningModeRealtime) {
-#if CONFIG_USE_AUDIO_PROCESSOR
-                audio_processor_.Stop();
-#endif
+                audio_processor_->Stop();
 #if CONFIG_USE_WAKE_WORD_DETECT
                 wake_word_detect_.StartDetection();
 #endif

application.h

@@ -11,6 +11,7 @@
 #include <list>
 #include <vector>
 #include <condition_variable>
+#include <memory>

 #include <opus_encoder.h>
 #include <opus_decoder.h>
@@ -19,13 +20,11 @@
 #include "protocol.h"
 #include "ota.h"
 #include "background_task.h"
+#include "audio_processor.h"

 #if CONFIG_USE_WAKE_WORD_DETECT
 #include "wake_word_detect.h"
 #endif
-#if CONFIG_USE_AUDIO_PROCESSOR
-#include "audio_processor.h"
-#endif

 #define SCHEDULE_EVENT (1 << 0)
 #define AUDIO_INPUT_READY_EVENT (1 << 1)
@@ -81,9 +80,7 @@ private:
 #if CONFIG_USE_WAKE_WORD_DETECT
     WakeWordDetect wake_word_detect_;
 #endif
-#if CONFIG_USE_AUDIO_PROCESSOR
-    AudioProcessor audio_processor_;
-#endif
+    std::unique_ptr<AudioProcessor> audio_processor_;
     Ota ota_;
     std::mutex mutex_;
     std::list<std::function<void()>> main_tasks_;

audio_processing/audio_processor.cc → audio_processing/afe_audio_processor.cc (renamed)

@@ -1,16 +1,16 @@
-#include "audio_processor.h"
+#include "afe_audio_processor.h"
 #include <esp_log.h>

 #define PROCESSOR_RUNNING 0x01

-static const char* TAG = "AudioProcessor";
+static const char* TAG = "AfeAudioProcessor";

-AudioProcessor::AudioProcessor()
+AfeAudioProcessor::AfeAudioProcessor()
     : afe_data_(nullptr) {
     event_group_ = xEventGroupCreate();
 }

-void AudioProcessor::Initialize(AudioCodec* codec, bool realtime_chat) {
+void AfeAudioProcessor::Initialize(AudioCodec* codec, bool realtime_chat) {
     codec_ = codec;
     int ref_num = codec_->input_reference() ? 1 : 0;
@@ -51,57 +51,57 @@ void AudioProcessor::Initialize(AudioCodec* codec, bool realtime_chat) {
     afe_data_ = afe_iface_->create_from_config(afe_config);

     xTaskCreate([](void* arg) {
-        auto this_ = (AudioProcessor*)arg;
+        auto this_ = (AfeAudioProcessor*)arg;
         this_->AudioProcessorTask();
         vTaskDelete(NULL);
     }, "audio_communication", 4096, this, 3, NULL);
 }

-AudioProcessor::~AudioProcessor() {
+AfeAudioProcessor::~AfeAudioProcessor() {
     if (afe_data_ != nullptr) {
         afe_iface_->destroy(afe_data_);
     }
     vEventGroupDelete(event_group_);
 }

-size_t AudioProcessor::GetFeedSize() {
+size_t AfeAudioProcessor::GetFeedSize() {
     if (afe_data_ == nullptr) {
         return 0;
     }
     return afe_iface_->get_feed_chunksize(afe_data_) * codec_->input_channels();
 }

-void AudioProcessor::Feed(const std::vector<int16_t>& data) {
+void AfeAudioProcessor::Feed(const std::vector<int16_t>& data) {
     if (afe_data_ == nullptr) {
         return;
     }
     afe_iface_->feed(afe_data_, data.data());
 }

-void AudioProcessor::Start() {
+void AfeAudioProcessor::Start() {
     xEventGroupSetBits(event_group_, PROCESSOR_RUNNING);
 }

-void AudioProcessor::Stop() {
+void AfeAudioProcessor::Stop() {
     xEventGroupClearBits(event_group_, PROCESSOR_RUNNING);
     if (afe_data_ != nullptr) {
         afe_iface_->reset_buffer(afe_data_);
     }
 }

-bool AudioProcessor::IsRunning() {
+bool AfeAudioProcessor::IsRunning() {
     return xEventGroupGetBits(event_group_) & PROCESSOR_RUNNING;
 }

-void AudioProcessor::OnOutput(std::function<void(std::vector<int16_t>&& data)> callback) {
+void AfeAudioProcessor::OnOutput(std::function<void(std::vector<int16_t>&& data)> callback) {
     output_callback_ = callback;
 }

-void AudioProcessor::OnVadStateChange(std::function<void(bool speaking)> callback) {
+void AfeAudioProcessor::OnVadStateChange(std::function<void(bool speaking)> callback) {
     vad_state_change_callback_ = callback;
 }

-void AudioProcessor::AudioProcessorTask() {
+void AfeAudioProcessor::AudioProcessorTask() {
     auto fetch_size = afe_iface_->get_fetch_chunksize(afe_data_);
     auto feed_size = afe_iface_->get_feed_chunksize(afe_data_);
     ESP_LOGI(TAG, "Audio communication task started, feed size: %d fetch size: %d",

audio_processing/afe_audio_processor.h (new file)

@@ -0,0 +1,42 @@
+#ifndef AFE_AUDIO_PROCESSOR_H
+#define AFE_AUDIO_PROCESSOR_H
+
+#include <esp_afe_sr_models.h>
+#include <freertos/FreeRTOS.h>
+#include <freertos/task.h>
+#include <freertos/event_groups.h>
+
+#include <string>
+#include <vector>
+#include <functional>
+
+#include "audio_processor.h"
+#include "audio_codec.h"
+
+class AfeAudioProcessor : public AudioProcessor {
+public:
+    AfeAudioProcessor();
+    ~AfeAudioProcessor();
+
+    void Initialize(AudioCodec* codec, bool realtime_chat) override;
+    void Feed(const std::vector<int16_t>& data) override;
+    void Start() override;
+    void Stop() override;
+    bool IsRunning() override;
+    void OnOutput(std::function<void(std::vector<int16_t>&& data)> callback) override;
+    void OnVadStateChange(std::function<void(bool speaking)> callback) override;
+    size_t GetFeedSize() override;
+
+private:
+    EventGroupHandle_t event_group_ = nullptr;
+    esp_afe_sr_iface_t* afe_iface_ = nullptr;
+    esp_afe_sr_data_t* afe_data_ = nullptr;
+    std::function<void(std::vector<int16_t>&& data)> output_callback_;
+    std::function<void(bool speaking)> vad_state_change_callback_;
+    AudioCodec* codec_ = nullptr;
+    bool is_speaking_ = false;
+
+    void AudioProcessorTask();
+};
+
+#endif

audio_processing/audio_processor.h

@@ -1,11 +1,6 @@
 #ifndef AUDIO_PROCESSOR_H
 #define AUDIO_PROCESSOR_H

-#include <esp_afe_sr_models.h>
-#include <freertos/FreeRTOS.h>
-#include <freertos/task.h>
-#include <freertos/event_groups.h>
-
 #include <string>
 #include <vector>
 #include <functional>
@@ -14,28 +9,16 @@
 class AudioProcessor {
 public:
-    AudioProcessor();
-    ~AudioProcessor();
+    virtual ~AudioProcessor() = default;

-    void Initialize(AudioCodec* codec, bool realtime_chat);
-    void Feed(const std::vector<int16_t>& data);
-    void Start();
-    void Stop();
-    bool IsRunning();
-    void OnOutput(std::function<void(std::vector<int16_t>&& data)> callback);
-    void OnVadStateChange(std::function<void(bool speaking)> callback);
-    size_t GetFeedSize();
+    virtual void Initialize(AudioCodec* codec, bool realtime_chat) = 0;
+    virtual void Feed(const std::vector<int16_t>& data) = 0;
+    virtual void Start() = 0;
+    virtual void Stop() = 0;
+    virtual bool IsRunning() = 0;
+    virtual void OnOutput(std::function<void(std::vector<int16_t>&& data)> callback) = 0;
+    virtual void OnVadStateChange(std::function<void(bool speaking)> callback) = 0;
+    virtual size_t GetFeedSize() = 0;

-private:
-    EventGroupHandle_t event_group_ = nullptr;
-    esp_afe_sr_iface_t* afe_iface_ = nullptr;
-    esp_afe_sr_data_t* afe_data_ = nullptr;
-    std::function<void(std::vector<int16_t>&& data)> output_callback_;
-    std::function<void(bool speaking)> vad_state_change_callback_;
-    AudioCodec* codec_ = nullptr;
-    bool is_speaking_ = false;
-
-    void AudioProcessorTask();
 };

 #endif

audio_processing/dummy_audio_processor.cc (new file)

@@ -0,0 +1,44 @@
+#include "dummy_audio_processor.h"
+#include <esp_log.h>
+
+static const char* TAG = "DummyAudioProcessor";
+
+void DummyAudioProcessor::Initialize(AudioCodec* codec, bool realtime_chat) {
+    codec_ = codec;
+}
+
+void DummyAudioProcessor::Feed(const std::vector<int16_t>& data) {
+    if (!is_running_ || !output_callback_) {
+        return;
+    }
+    // Pass the input data straight through to the output callback
+    output_callback_(std::vector<int16_t>(data));
+}
+
+void DummyAudioProcessor::Start() {
+    is_running_ = true;
+}
+
+void DummyAudioProcessor::Stop() {
+    is_running_ = false;
+}
+
+bool DummyAudioProcessor::IsRunning() {
+    return is_running_;
+}
+
+void DummyAudioProcessor::OnOutput(std::function<void(std::vector<int16_t>&& data)> callback) {
+    output_callback_ = callback;
+}
+
+void DummyAudioProcessor::OnVadStateChange(std::function<void(bool speaking)> callback) {
+    vad_state_change_callback_ = callback;
+}
+
+size_t DummyAudioProcessor::GetFeedSize() {
+    if (!codec_) {
+        return 0;
+    }
+    // Return a fixed frame size, e.g. 30 ms of data
+    return 30 * codec_->input_sample_rate() / 1000;
+}

audio_processing/dummy_audio_processor.h (new file)

@@ -0,0 +1,31 @@
+#ifndef DUMMY_AUDIO_PROCESSOR_H
+#define DUMMY_AUDIO_PROCESSOR_H
+
+#include <vector>
+#include <functional>
+
+#include "audio_processor.h"
+#include "audio_codec.h"
+
+class DummyAudioProcessor : public AudioProcessor {
+public:
+    DummyAudioProcessor() = default;
+    ~DummyAudioProcessor() = default;
+
+    void Initialize(AudioCodec* codec, bool realtime_chat) override;
+    void Feed(const std::vector<int16_t>& data) override;
+    void Start() override;
+    void Stop() override;
+    bool IsRunning() override;
+    void OnOutput(std::function<void(std::vector<int16_t>&& data)> callback) override;
+    void OnVadStateChange(std::function<void(bool speaking)> callback) override;
+    size_t GetFeedSize() override;
+
+private:
+    AudioCodec* codec_ = nullptr;
+    std::function<void(std::vector<int16_t>&& data)> output_callback_;
+    std::function<void(bool speaking)> vad_state_change_callback_;
+    bool is_running_ = false;
+};
+
+#endif
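
For reference, a minimal usage sketch of the AudioProcessor interface driven through the new DummyAudioProcessor. This is an illustrative host-side snippet, not part of the commit; it assumes dummy_audio_processor.h and the audio_codec.h it includes are on the include path. No AudioCodec is attached, so GetFeedSize() would return 0 and only the pass-through Feed() path is exercised.

#include <cstdint>
#include <cstdio>
#include <vector>
#include "dummy_audio_processor.h"

int main() {
    DummyAudioProcessor processor;
    // Register the output callback first; Feed() is a no-op without it.
    processor.OnOutput([](std::vector<int16_t>&& data) {
        std::printf("received %zu samples\n", data.size());
    });
    processor.Start();                   // only sets the is_running_ flag
    std::vector<int16_t> frame(480, 0);  // 30 ms of silence at 16 kHz
    processor.Feed(frame);               // forwarded unchanged to the callback
    processor.Stop();
    return 0;
}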