forked from xiaozhi/xiaozhi-esp32
Add dummy audio processor
This commit is contained in:
@@ -157,7 +157,9 @@ file(GLOB BOARD_SOURCES
|
||||
list(APPEND SOURCES ${BOARD_SOURCES})
|
||||
|
||||
# Select the audio processor implementation at build time:
# the real AFE-based processor when enabled, otherwise the pass-through dummy.
if(CONFIG_USE_AUDIO_PROCESSOR)
    list(APPEND SOURCES "audio_processing/afe_audio_processor.cc")
else()
    list(APPEND SOURCES "audio_processing/dummy_audio_processor.cc")
endif()
|
||||
if(CONFIG_USE_WAKE_WORD_DETECT)
|
||||
list(APPEND SOURCES "audio_processing/wake_word_detect.cc")
|
||||
|
||||
@@ -10,6 +10,12 @@
|
||||
#include "iot/thing_manager.h"
|
||||
#include "assets/lang_config.h"
|
||||
|
||||
#if CONFIG_USE_AUDIO_PROCESSOR
|
||||
#include "afe_audio_processor.h"
|
||||
#else
|
||||
#include "dummy_audio_processor.h"
|
||||
#endif
|
||||
|
||||
#include <cstring>
|
||||
#include <esp_log.h>
|
||||
#include <cJSON.h>
|
||||
@@ -37,6 +43,12 @@ Application::Application() {
|
||||
event_group_ = xEventGroupCreate();
|
||||
background_task_ = new BackgroundTask(4096 * 8);
|
||||
|
||||
#if CONFIG_USE_AUDIO_PROCESSOR
|
||||
audio_processor_ = std::make_unique<AfeAudioProcessor>();
|
||||
#else
|
||||
audio_processor_ = std::make_unique<DummyAudioProcessor>();
|
||||
#endif
|
||||
|
||||
esp_timer_create_args_t clock_timer_args = {
|
||||
.callback = [](void* arg) {
|
||||
Application* app = (Application*)arg;
|
||||
@@ -502,9 +514,8 @@ void Application::Start() {
|
||||
});
|
||||
bool protocol_started = protocol_->Start();
|
||||
|
||||
#if CONFIG_USE_AUDIO_PROCESSOR
|
||||
audio_processor_.Initialize(codec, realtime_chat_enabled_);
|
||||
audio_processor_.OnOutput([this](std::vector<int16_t>&& data) {
|
||||
audio_processor_->Initialize(codec, realtime_chat_enabled_);
|
||||
audio_processor_->OnOutput([this](std::vector<int16_t>&& data) {
|
||||
background_task_->Schedule([this, data = std::move(data)]() mutable {
|
||||
if (protocol_->IsAudioChannelBusy()) {
|
||||
return;
|
||||
@@ -520,7 +531,7 @@ void Application::Start() {
|
||||
});
|
||||
});
|
||||
});
|
||||
audio_processor_.OnVadStateChange([this](bool speaking) {
|
||||
audio_processor_->OnVadStateChange([this](bool speaking) {
|
||||
if (device_state_ == kDeviceStateListening) {
|
||||
Schedule([this, speaking]() {
|
||||
if (speaking) {
|
||||
@@ -533,7 +544,6 @@ void Application::Start() {
|
||||
});
|
||||
}
|
||||
});
|
||||
#endif
|
||||
|
||||
#if CONFIG_USE_WAKE_WORD_DETECT
|
||||
wake_word_detect_.Initialize(codec);
|
||||
@@ -716,37 +726,16 @@ void Application::OnAudioInput() {
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if CONFIG_USE_AUDIO_PROCESSOR
|
||||
if (audio_processor_.IsRunning()) {
|
||||
if (audio_processor_->IsRunning()) {
|
||||
std::vector<int16_t> data;
|
||||
int samples = audio_processor_.GetFeedSize();
|
||||
int samples = audio_processor_->GetFeedSize();
|
||||
if (samples > 0) {
|
||||
ReadAudio(data, 16000, samples);
|
||||
audio_processor_.Feed(data);
|
||||
audio_processor_->Feed(data);
|
||||
return;
|
||||
}
|
||||
}
|
||||
#else
|
||||
if (device_state_ == kDeviceStateListening) {
|
||||
std::vector<int16_t> data;
|
||||
ReadAudio(data, 16000, 30 * 16000 / 1000);
|
||||
background_task_->Schedule([this, data = std::move(data)]() mutable {
|
||||
if (protocol_->IsAudioChannelBusy()) {
|
||||
return;
|
||||
}
|
||||
opus_encoder_->Encode(std::move(data), [this](std::vector<uint8_t>&& opus) {
|
||||
AudioStreamPacket packet;
|
||||
packet.payload = std::move(opus);
|
||||
packet.timestamp = last_output_timestamp_;
|
||||
last_output_timestamp_ = 0;
|
||||
Schedule([this, packet = std::move(packet)]() {
|
||||
protocol_->SendAudio(packet);
|
||||
});
|
||||
});
|
||||
});
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
vTaskDelay(pdMS_TO_TICKS(30));
|
||||
}
|
||||
|
||||
@@ -818,9 +807,7 @@ void Application::SetDeviceState(DeviceState state) {
|
||||
case kDeviceStateIdle:
|
||||
display->SetStatus(Lang::Strings::STANDBY);
|
||||
display->SetEmotion("neutral");
|
||||
#if CONFIG_USE_AUDIO_PROCESSOR
|
||||
audio_processor_.Stop();
|
||||
#endif
|
||||
audio_processor_->Stop();
|
||||
#if CONFIG_USE_WAKE_WORD_DETECT
|
||||
wake_word_detect_.StartDetection();
|
||||
#endif
|
||||
@@ -838,11 +825,7 @@ void Application::SetDeviceState(DeviceState state) {
|
||||
UpdateIotStates();
|
||||
|
||||
// Make sure the audio processor is running
|
||||
#if CONFIG_USE_AUDIO_PROCESSOR
|
||||
if (!audio_processor_.IsRunning()) {
|
||||
#else
|
||||
if (true) {
|
||||
#endif
|
||||
if (!audio_processor_->IsRunning()) {
|
||||
// Send the start listening command
|
||||
protocol_->SendStartListening(listening_mode_);
|
||||
if (listening_mode_ == kListeningModeAutoStop && previous_state == kDeviceStateSpeaking) {
|
||||
@@ -853,18 +836,14 @@ void Application::SetDeviceState(DeviceState state) {
|
||||
#if CONFIG_USE_WAKE_WORD_DETECT
|
||||
wake_word_detect_.StopDetection();
|
||||
#endif
|
||||
#if CONFIG_USE_AUDIO_PROCESSOR
|
||||
audio_processor_.Start();
|
||||
#endif
|
||||
audio_processor_->Start();
|
||||
}
|
||||
break;
|
||||
case kDeviceStateSpeaking:
|
||||
display->SetStatus(Lang::Strings::SPEAKING);
|
||||
|
||||
if (listening_mode_ != kListeningModeRealtime) {
|
||||
#if CONFIG_USE_AUDIO_PROCESSOR
|
||||
audio_processor_.Stop();
|
||||
#endif
|
||||
audio_processor_->Stop();
|
||||
#if CONFIG_USE_WAKE_WORD_DETECT
|
||||
wake_word_detect_.StartDetection();
|
||||
#endif
|
||||
|
||||
@@ -11,6 +11,7 @@
|
||||
#include <list>
|
||||
#include <vector>
|
||||
#include <condition_variable>
|
||||
#include <memory>
|
||||
|
||||
#include <opus_encoder.h>
|
||||
#include <opus_decoder.h>
|
||||
@@ -19,13 +20,11 @@
|
||||
#include "protocol.h"
|
||||
#include "ota.h"
|
||||
#include "background_task.h"
|
||||
#include "audio_processor.h"
|
||||
|
||||
#if CONFIG_USE_WAKE_WORD_DETECT
|
||||
#include "wake_word_detect.h"
|
||||
#endif
|
||||
#if CONFIG_USE_AUDIO_PROCESSOR
|
||||
#include "audio_processor.h"
|
||||
#endif
|
||||
|
||||
#define SCHEDULE_EVENT (1 << 0)
|
||||
#define AUDIO_INPUT_READY_EVENT (1 << 1)
|
||||
@@ -81,9 +80,7 @@ private:
|
||||
#if CONFIG_USE_WAKE_WORD_DETECT
|
||||
WakeWordDetect wake_word_detect_;
|
||||
#endif
|
||||
#if CONFIG_USE_AUDIO_PROCESSOR
|
||||
AudioProcessor audio_processor_;
|
||||
#endif
|
||||
std::unique_ptr<AudioProcessor> audio_processor_;
|
||||
Ota ota_;
|
||||
std::mutex mutex_;
|
||||
std::list<std::function<void()>> main_tasks_;
|
||||
|
||||
@@ -1,16 +1,16 @@
|
||||
#include "audio_processor.h"
|
||||
#include "afe_audio_processor.h"
|
||||
#include <esp_log.h>
|
||||
|
||||
#define PROCESSOR_RUNNING 0x01
|
||||
|
||||
static const char* TAG = "AudioProcessor";
|
||||
static const char* TAG = "AfeAudioProcessor";
|
||||
|
||||
AudioProcessor::AudioProcessor()
|
||||
AfeAudioProcessor::AfeAudioProcessor()
|
||||
: afe_data_(nullptr) {
|
||||
event_group_ = xEventGroupCreate();
|
||||
}
|
||||
|
||||
void AudioProcessor::Initialize(AudioCodec* codec, bool realtime_chat) {
|
||||
void AfeAudioProcessor::Initialize(AudioCodec* codec, bool realtime_chat) {
|
||||
codec_ = codec;
|
||||
int ref_num = codec_->input_reference() ? 1 : 0;
|
||||
|
||||
@@ -51,57 +51,57 @@ void AudioProcessor::Initialize(AudioCodec* codec, bool realtime_chat) {
|
||||
afe_data_ = afe_iface_->create_from_config(afe_config);
|
||||
|
||||
xTaskCreate([](void* arg) {
|
||||
auto this_ = (AudioProcessor*)arg;
|
||||
auto this_ = (AfeAudioProcessor*)arg;
|
||||
this_->AudioProcessorTask();
|
||||
vTaskDelete(NULL);
|
||||
}, "audio_communication", 4096, this, 3, NULL);
|
||||
}
|
||||
|
||||
AudioProcessor::~AudioProcessor() {
|
||||
AfeAudioProcessor::~AfeAudioProcessor() {
|
||||
if (afe_data_ != nullptr) {
|
||||
afe_iface_->destroy(afe_data_);
|
||||
}
|
||||
vEventGroupDelete(event_group_);
|
||||
}
|
||||
|
||||
size_t AudioProcessor::GetFeedSize() {
|
||||
size_t AfeAudioProcessor::GetFeedSize() {
|
||||
if (afe_data_ == nullptr) {
|
||||
return 0;
|
||||
}
|
||||
return afe_iface_->get_feed_chunksize(afe_data_) * codec_->input_channels();
|
||||
}
|
||||
|
||||
void AudioProcessor::Feed(const std::vector<int16_t>& data) {
|
||||
void AfeAudioProcessor::Feed(const std::vector<int16_t>& data) {
|
||||
if (afe_data_ == nullptr) {
|
||||
return;
|
||||
}
|
||||
afe_iface_->feed(afe_data_, data.data());
|
||||
}
|
||||
|
||||
void AudioProcessor::Start() {
|
||||
void AfeAudioProcessor::Start() {
|
||||
xEventGroupSetBits(event_group_, PROCESSOR_RUNNING);
|
||||
}
|
||||
|
||||
void AudioProcessor::Stop() {
|
||||
void AfeAudioProcessor::Stop() {
|
||||
xEventGroupClearBits(event_group_, PROCESSOR_RUNNING);
|
||||
if (afe_data_ != nullptr) {
|
||||
afe_iface_->reset_buffer(afe_data_);
|
||||
}
|
||||
}
|
||||
|
||||
bool AudioProcessor::IsRunning() {
|
||||
bool AfeAudioProcessor::IsRunning() {
|
||||
return xEventGroupGetBits(event_group_) & PROCESSOR_RUNNING;
|
||||
}
|
||||
|
||||
void AudioProcessor::OnOutput(std::function<void(std::vector<int16_t>&& data)> callback) {
|
||||
void AfeAudioProcessor::OnOutput(std::function<void(std::vector<int16_t>&& data)> callback) {
|
||||
output_callback_ = callback;
|
||||
}
|
||||
|
||||
void AudioProcessor::OnVadStateChange(std::function<void(bool speaking)> callback) {
|
||||
void AfeAudioProcessor::OnVadStateChange(std::function<void(bool speaking)> callback) {
|
||||
vad_state_change_callback_ = callback;
|
||||
}
|
||||
|
||||
void AudioProcessor::AudioProcessorTask() {
|
||||
void AfeAudioProcessor::AudioProcessorTask() {
|
||||
auto fetch_size = afe_iface_->get_fetch_chunksize(afe_data_);
|
||||
auto feed_size = afe_iface_->get_feed_chunksize(afe_data_);
|
||||
ESP_LOGI(TAG, "Audio communication task started, feed size: %d fetch size: %d",
|
||||
@@ -136,4 +136,4 @@ void AudioProcessor::AudioProcessorTask() {
|
||||
output_callback_(std::vector<int16_t>(res->data, res->data + res->data_size / sizeof(int16_t)));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
42
main/audio_processing/afe_audio_processor.h
Normal file
42
main/audio_processing/afe_audio_processor.h
Normal file
@@ -0,0 +1,42 @@
|
||||
#ifndef AFE_AUDIO_PROCESSOR_H
#define AFE_AUDIO_PROCESSOR_H

#include <esp_afe_sr_models.h>
#include <freertos/FreeRTOS.h>
#include <freertos/task.h>
#include <freertos/event_groups.h>

#include <string>
#include <vector>
#include <functional>

#include "audio_processor.h"
#include "audio_codec.h"

// AudioProcessor implementation backed by Espressif's AFE (audio front end).
// Captured PCM is fed into the AFE pipeline; a dedicated FreeRTOS task
// fetches processed frames and reports them (and VAD state changes) through
// the registered callbacks.
class AfeAudioProcessor : public AudioProcessor {
public:
    AfeAudioProcessor();
    // `override` ensures the base class destructor is virtual, so deleting
    // through an AudioProcessor* is well-defined.
    ~AfeAudioProcessor() override;

    void Initialize(AudioCodec* codec, bool realtime_chat) override;
    // Expects GetFeedSize() samples per call.
    void Feed(const std::vector<int16_t>& data) override;
    void Start() override;
    void Stop() override;
    bool IsRunning() override;
    void OnOutput(std::function<void(std::vector<int16_t>&& data)> callback) override;
    void OnVadStateChange(std::function<void(bool speaking)> callback) override;
    size_t GetFeedSize() override;

private:
    EventGroupHandle_t event_group_ = nullptr;      // carries PROCESSOR_RUNNING bit
    esp_afe_sr_iface_t* afe_iface_ = nullptr;       // AFE vtable
    esp_afe_sr_data_t* afe_data_ = nullptr;         // AFE instance, created in Initialize()
    std::function<void(std::vector<int16_t>&& data)> output_callback_;
    std::function<void(bool speaking)> vad_state_change_callback_;
    AudioCodec* codec_ = nullptr;                   // non-owning
    bool is_speaking_ = false;                      // last reported VAD state

    // Worker loop: fetches processed frames from the AFE and fires callbacks.
    void AudioProcessorTask();
};

#endif
|
||||
@@ -1,11 +1,6 @@
|
||||
#ifndef AUDIO_PROCESSOR_H
#define AUDIO_PROCESSOR_H

#include <string>
#include <vector>
#include <functional>

class AudioCodec;  // methods only take AudioCodec*, so a forward declaration suffices

// Abstract interface for audio pre-processing between the codec and the
// protocol layer. Known implementations: AfeAudioProcessor (esp-sr AFE
// pipeline) and DummyAudioProcessor (pass-through).
class AudioProcessor {
public:
    virtual ~AudioProcessor() = default;

    virtual void Initialize(AudioCodec* codec, bool realtime_chat) = 0;
    virtual void Feed(const std::vector<int16_t>& data) = 0;
    virtual void Start() = 0;
    virtual void Stop() = 0;
    virtual bool IsRunning() = 0;
    virtual void OnOutput(std::function<void(std::vector<int16_t>&& data)> callback) = 0;
    virtual void OnVadStateChange(std::function<void(bool speaking)> callback) = 0;
    // Number of samples one Feed() call should supply.
    virtual size_t GetFeedSize() = 0;
};

#endif
|
||||
|
||||
44
main/audio_processing/dummy_audio_processor.cc
Normal file
44
main/audio_processing/dummy_audio_processor.cc
Normal file
@@ -0,0 +1,44 @@
|
||||
#include "dummy_audio_processor.h"
|
||||
#include <esp_log.h>
|
||||
|
||||
static const char* TAG = "DummyAudioProcessor";
|
||||
|
||||
// Store the codec pointer; realtime_chat has no effect in the dummy
// implementation.
void DummyAudioProcessor::Initialize(AudioCodec* codec, bool realtime_chat) {
    codec_ = codec;
}
|
||||
|
||||
void DummyAudioProcessor::Feed(const std::vector<int16_t>& data) {
|
||||
if (!is_running_ || !output_callback_) {
|
||||
return;
|
||||
}
|
||||
// 直接将输入数据传递给输出回调
|
||||
output_callback_(std::vector<int16_t>(data));
|
||||
}
|
||||
|
||||
// Enable forwarding; Feed() only delivers samples while running.
void DummyAudioProcessor::Start() {
    is_running_ = true;
}
|
||||
|
||||
// Disable forwarding; subsequent Feed() calls are dropped.
void DummyAudioProcessor::Stop() {
    is_running_ = false;
}
|
||||
|
||||
bool DummyAudioProcessor::IsRunning() {
|
||||
return is_running_;
|
||||
}
|
||||
|
||||
// Register the consumer that receives forwarded audio frames.
void DummyAudioProcessor::OnOutput(std::function<void(std::vector<int16_t>&& data)> callback) {
    output_callback_ = callback;
}
|
||||
|
||||
// Register the VAD listener. Stored for interface completeness only — the
// dummy processor performs no voice activity detection and never invokes it.
void DummyAudioProcessor::OnVadStateChange(std::function<void(bool speaking)> callback) {
    vad_state_change_callback_ = callback;
}
|
||||
|
||||
size_t DummyAudioProcessor::GetFeedSize() {
|
||||
if (!codec_) {
|
||||
return 0;
|
||||
}
|
||||
// 返回一个固定的帧大小,比如 30ms 的数据
|
||||
return 30 * codec_->input_sample_rate() / 1000;
|
||||
}
|
||||
31
main/audio_processing/dummy_audio_processor.h
Normal file
31
main/audio_processing/dummy_audio_processor.h
Normal file
@@ -0,0 +1,31 @@
|
||||
#ifndef DUMMY_AUDIO_PROCESSOR_H
#define DUMMY_AUDIO_PROCESSOR_H

#include <vector>
#include <functional>

#include "audio_processor.h"
#include "audio_codec.h"

// No-op AudioProcessor used when CONFIG_USE_AUDIO_PROCESSOR is disabled.
// Forwards captured PCM straight to the output callback without any AFE
// processing, and never reports VAD state changes.
class DummyAudioProcessor : public AudioProcessor {
public:
    DummyAudioProcessor() = default;
    // `override` documents (and compile-time checks) that the base class
    // destructor is virtual, so deletion via AudioProcessor* is safe.
    ~DummyAudioProcessor() override = default;

    void Initialize(AudioCodec* codec, bool realtime_chat) override;
    void Feed(const std::vector<int16_t>& data) override;
    void Start() override;
    void Stop() override;
    bool IsRunning() override;
    void OnOutput(std::function<void(std::vector<int16_t>&& data)> callback) override;
    void OnVadStateChange(std::function<void(bool speaking)> callback) override;
    size_t GetFeedSize() override;

private:
    AudioCodec* codec_ = nullptr;   // non-owning; set in Initialize()
    std::function<void(std::vector<int16_t>&& data)> output_callback_;
    std::function<void(bool speaking)> vad_state_change_callback_;  // stored but never invoked
    bool is_running_ = false;       // gated by Start()/Stop()
};

#endif
|
||||
Reference in New Issue
Block a user