forked from xiaozhi/xiaozhi-esp32
Fix Server AEC
This commit is contained in:
@@ -11,8 +11,8 @@ class AudioProcessor {
|
|||||||
public:
|
public:
|
||||||
virtual ~AudioProcessor() = default;
|
virtual ~AudioProcessor() = default;
|
||||||
|
|
||||||
virtual void Initialize(AudioCodec* codec) = 0;
|
virtual void Initialize(AudioCodec* codec, int frame_duration_ms) = 0;
|
||||||
virtual void Feed(const std::vector<int16_t>& data) = 0;
|
virtual void Feed(std::vector<int16_t>&& data) = 0;
|
||||||
virtual void Start() = 0;
|
virtual void Start() = 0;
|
||||||
virtual void Stop() = 0;
|
virtual void Stop() = 0;
|
||||||
virtual bool IsRunning() = 0;
|
virtual bool IsRunning() = 0;
|
||||||
|
|||||||
@@ -43,7 +43,6 @@ void AudioService::Initialize(AudioCodec* codec) {
|
|||||||
reference_resampler_.Configure(codec->input_sample_rate(), 16000);
|
reference_resampler_.Configure(codec->input_sample_rate(), 16000);
|
||||||
}
|
}
|
||||||
|
|
||||||
audio_debugger_ = std::make_unique<AudioDebugger>();
|
|
||||||
#if CONFIG_USE_AUDIO_PROCESSOR
|
#if CONFIG_USE_AUDIO_PROCESSOR
|
||||||
audio_processor_ = std::make_unique<AfeAudioProcessor>();
|
audio_processor_ = std::make_unique<AfeAudioProcessor>();
|
||||||
#else
|
#else
|
||||||
@@ -118,7 +117,7 @@ void AudioService::Start() {
|
|||||||
AudioService* audio_service = (AudioService*)arg;
|
AudioService* audio_service = (AudioService*)arg;
|
||||||
audio_service->AudioOutputTask();
|
audio_service->AudioOutputTask();
|
||||||
vTaskDelete(NULL);
|
vTaskDelete(NULL);
|
||||||
}, "audio_output", 2048, this, 3, &audio_output_task_handle_);
|
}, "audio_output", 4096, this, 3, &audio_output_task_handle_);
|
||||||
|
|
||||||
/* Start the opus codec task */
|
/* Start the opus codec task */
|
||||||
xTaskCreate([](void* arg) {
|
xTaskCreate([](void* arg) {
|
||||||
@@ -186,10 +185,13 @@ bool AudioService::ReadAudioData(std::vector<int16_t>& data, int sample_rate, in
|
|||||||
last_input_time_ = std::chrono::steady_clock::now();
|
last_input_time_ = std::chrono::steady_clock::now();
|
||||||
debug_statistics_.input_count++;
|
debug_statistics_.input_count++;
|
||||||
|
|
||||||
|
#if CONFIG_USE_AUDIO_DEBUGGER
|
||||||
// 音频调试:发送原始音频数据
|
// 音频调试:发送原始音频数据
|
||||||
if (audio_debugger_) {
|
if (audio_debugger_ == nullptr) {
|
||||||
audio_debugger_->Feed(data);
|
audio_debugger_ = std::make_unique<AudioDebugger>();
|
||||||
}
|
}
|
||||||
|
audio_debugger_->Feed(data);
|
||||||
|
#endif
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
@@ -250,7 +252,7 @@ void AudioService::AudioInputTask() {
|
|||||||
int samples = audio_processor_->GetFeedSize();
|
int samples = audio_processor_->GetFeedSize();
|
||||||
if (samples > 0) {
|
if (samples > 0) {
|
||||||
if (ReadAudioData(data, 16000, samples)) {
|
if (ReadAudioData(data, 16000, samples)) {
|
||||||
audio_processor_->Feed(data);
|
audio_processor_->Feed(std::move(data));
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -285,6 +287,12 @@ void AudioService::AudioOutputTask() {
|
|||||||
/* Update the last output time */
|
/* Update the last output time */
|
||||||
last_output_time_ = std::chrono::steady_clock::now();
|
last_output_time_ = std::chrono::steady_clock::now();
|
||||||
debug_statistics_.playback_count++;
|
debug_statistics_.playback_count++;
|
||||||
|
|
||||||
|
/* Record the timestamp */
|
||||||
|
if (task->timestamp > 0) {
|
||||||
|
lock.lock();
|
||||||
|
timestamp_queue_.push_back(task->timestamp);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
ESP_LOGW(TAG, "Audio output task stopped");
|
ESP_LOGW(TAG, "Audio output task stopped");
|
||||||
@@ -311,6 +319,7 @@ void AudioService::OpusCodecTask() {
|
|||||||
|
|
||||||
auto task = std::make_unique<AudioTask>();
|
auto task = std::make_unique<AudioTask>();
|
||||||
task->type = kAudioTaskTypeDecodeToPlaybackQueue;
|
task->type = kAudioTaskTypeDecodeToPlaybackQueue;
|
||||||
|
task->timestamp = packet->timestamp;
|
||||||
|
|
||||||
SetDecodeSampleRate(packet->sample_rate, packet->frame_duration);
|
SetDecodeSampleRate(packet->sample_rate, packet->frame_duration);
|
||||||
if (opus_decoder_->Decode(std::move(packet->payload), task->pcm)) {
|
if (opus_decoder_->Decode(std::move(packet->payload), task->pcm)) {
|
||||||
@@ -338,11 +347,15 @@ void AudioService::OpusCodecTask() {
|
|||||||
audio_encode_queue_.pop_front();
|
audio_encode_queue_.pop_front();
|
||||||
audio_queue_cv_.notify_all();
|
audio_queue_cv_.notify_all();
|
||||||
lock.unlock();
|
lock.unlock();
|
||||||
opus_encoder_->Encode(std::move(task->pcm), [this, &task](std::vector<uint8_t>&& opus) {
|
|
||||||
auto packet = std::make_unique<AudioStreamPacket>();
|
auto packet = std::make_unique<AudioStreamPacket>();
|
||||||
packet->payload = std::move(opus);
|
|
||||||
packet->frame_duration = OPUS_FRAME_DURATION_MS;
|
packet->frame_duration = OPUS_FRAME_DURATION_MS;
|
||||||
packet->sample_rate = 16000;
|
packet->sample_rate = 16000;
|
||||||
|
packet->timestamp = task->timestamp;
|
||||||
|
if (!opus_encoder_->Encode(std::move(task->pcm), packet->payload)) {
|
||||||
|
ESP_LOGE(TAG, "Failed to encode audio");
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
if (task->type == kAudioTaskTypeEncodeToSendQueue) {
|
if (task->type == kAudioTaskTypeEncodeToSendQueue) {
|
||||||
{
|
{
|
||||||
@@ -356,7 +369,6 @@ void AudioService::OpusCodecTask() {
|
|||||||
std::lock_guard<std::mutex> lock(audio_queue_mutex_);
|
std::lock_guard<std::mutex> lock(audio_queue_mutex_);
|
||||||
audio_testing_queue_.push_back(std::move(packet));
|
audio_testing_queue_.push_back(std::move(packet));
|
||||||
}
|
}
|
||||||
});
|
|
||||||
debug_statistics_.encode_count++;
|
debug_statistics_.encode_count++;
|
||||||
lock.lock();
|
lock.lock();
|
||||||
}
|
}
|
||||||
@@ -387,6 +399,17 @@ void AudioService::PushTaskToEncodeQueue(AudioTaskType type, std::vector<int16_t
|
|||||||
|
|
||||||
/* Push the task to the encode queue */
|
/* Push the task to the encode queue */
|
||||||
std::unique_lock<std::mutex> lock(audio_queue_mutex_);
|
std::unique_lock<std::mutex> lock(audio_queue_mutex_);
|
||||||
|
|
||||||
|
/* If the task is to send queue, we need to set the timestamp */
|
||||||
|
if (type == kAudioTaskTypeEncodeToSendQueue && !timestamp_queue_.empty()) {
|
||||||
|
if (timestamp_queue_.size() <= MAX_TIMESTAMPS_IN_QUEUE) {
|
||||||
|
task->timestamp = timestamp_queue_.front();
|
||||||
|
} else {
|
||||||
|
ESP_LOGW(TAG, "Timestamp queue is full, dropping timestamp");
|
||||||
|
}
|
||||||
|
timestamp_queue_.pop_front();
|
||||||
|
}
|
||||||
|
|
||||||
audio_queue_cv_.wait(lock, [this]() { return audio_encode_queue_.size() < MAX_ENCODE_TASKS_IN_QUEUE; });
|
audio_queue_cv_.wait(lock, [this]() { return audio_encode_queue_.size() < MAX_ENCODE_TASKS_IN_QUEUE; });
|
||||||
audio_encode_queue_.push_back(std::move(task));
|
audio_encode_queue_.push_back(std::move(task));
|
||||||
audio_queue_cv_.notify_all();
|
audio_queue_cv_.notify_all();
|
||||||
@@ -458,7 +481,7 @@ void AudioService::EnableVoiceProcessing(bool enable) {
|
|||||||
ESP_LOGD(TAG, "%s voice processing", enable ? "Enabling" : "Disabling");
|
ESP_LOGD(TAG, "%s voice processing", enable ? "Enabling" : "Disabling");
|
||||||
if (enable) {
|
if (enable) {
|
||||||
if (!audio_processor_initialized_) {
|
if (!audio_processor_initialized_) {
|
||||||
audio_processor_->Initialize(codec_);
|
audio_processor_->Initialize(codec_, OPUS_FRAME_DURATION_MS);
|
||||||
audio_processor_initialized_ = true;
|
audio_processor_initialized_ = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -522,6 +545,7 @@ bool AudioService::IsIdle() {
|
|||||||
void AudioService::ResetDecoder() {
|
void AudioService::ResetDecoder() {
|
||||||
std::lock_guard<std::mutex> lock(audio_queue_mutex_);
|
std::lock_guard<std::mutex> lock(audio_queue_mutex_);
|
||||||
opus_decoder_->ResetState();
|
opus_decoder_->ResetState();
|
||||||
|
timestamp_queue_.clear();
|
||||||
audio_decode_queue_.clear();
|
audio_decode_queue_.clear();
|
||||||
audio_playback_queue_.clear();
|
audio_playback_queue_.clear();
|
||||||
audio_testing_queue_.clear();
|
audio_testing_queue_.clear();
|
||||||
|
|||||||
@@ -40,6 +40,7 @@
|
|||||||
#define MAX_DECODE_PACKETS_IN_QUEUE (2400 / OPUS_FRAME_DURATION_MS)
|
#define MAX_DECODE_PACKETS_IN_QUEUE (2400 / OPUS_FRAME_DURATION_MS)
|
||||||
#define MAX_SEND_PACKETS_IN_QUEUE (2400 / OPUS_FRAME_DURATION_MS)
|
#define MAX_SEND_PACKETS_IN_QUEUE (2400 / OPUS_FRAME_DURATION_MS)
|
||||||
#define AUDIO_TESTING_MAX_DURATION_MS 10000
|
#define AUDIO_TESTING_MAX_DURATION_MS 10000
|
||||||
|
#define MAX_TIMESTAMPS_IN_QUEUE 3
|
||||||
|
|
||||||
#define AUDIO_POWER_TIMEOUT_MS 15000
|
#define AUDIO_POWER_TIMEOUT_MS 15000
|
||||||
#define AUDIO_POWER_CHECK_INTERVAL_MS 1000
|
#define AUDIO_POWER_CHECK_INTERVAL_MS 1000
|
||||||
@@ -67,6 +68,7 @@ enum AudioTaskType {
|
|||||||
struct AudioTask {
|
struct AudioTask {
|
||||||
AudioTaskType type;
|
AudioTaskType type;
|
||||||
std::vector<int16_t> pcm;
|
std::vector<int16_t> pcm;
|
||||||
|
uint32_t timestamp;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct DebugStatistics {
|
struct DebugStatistics {
|
||||||
|
|||||||
@@ -10,8 +10,14 @@ AfeAudioProcessor::AfeAudioProcessor()
|
|||||||
event_group_ = xEventGroupCreate();
|
event_group_ = xEventGroupCreate();
|
||||||
}
|
}
|
||||||
|
|
||||||
void AfeAudioProcessor::Initialize(AudioCodec* codec) {
|
void AfeAudioProcessor::Initialize(AudioCodec* codec, int frame_duration_ms) {
|
||||||
codec_ = codec;
|
codec_ = codec;
|
||||||
|
frame_duration_ms_ = frame_duration_ms;
|
||||||
|
frame_samples_ = frame_duration_ms_ * codec_->input_sample_rate() / 1000;
|
||||||
|
|
||||||
|
// Pre-allocate output buffer capacity
|
||||||
|
output_buffer_.reserve(frame_samples_);
|
||||||
|
|
||||||
int ref_num = codec_->input_reference() ? 1 : 0;
|
int ref_num = codec_->input_reference() ? 1 : 0;
|
||||||
|
|
||||||
std::string input_format;
|
std::string input_format;
|
||||||
@@ -79,7 +85,7 @@ size_t AfeAudioProcessor::GetFeedSize() {
|
|||||||
return afe_iface_->get_feed_chunksize(afe_data_) * codec_->input_channels();
|
return afe_iface_->get_feed_chunksize(afe_data_) * codec_->input_channels();
|
||||||
}
|
}
|
||||||
|
|
||||||
void AfeAudioProcessor::Feed(const std::vector<int16_t>& data) {
|
void AfeAudioProcessor::Feed(std::vector<int16_t>&& data) {
|
||||||
if (afe_data_ == nullptr) {
|
if (afe_data_ == nullptr) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
@@ -141,7 +147,24 @@ void AfeAudioProcessor::AudioProcessorTask() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (output_callback_) {
|
if (output_callback_) {
|
||||||
output_callback_(std::vector<int16_t>(res->data, res->data + res->data_size / sizeof(int16_t)));
|
size_t samples = res->data_size / sizeof(int16_t);
|
||||||
|
|
||||||
|
// Add data to buffer
|
||||||
|
output_buffer_.insert(output_buffer_.end(), res->data, res->data + samples);
|
||||||
|
|
||||||
|
// Output complete frames when buffer has enough data
|
||||||
|
while (output_buffer_.size() >= frame_samples_) {
|
||||||
|
if (output_buffer_.size() == frame_samples_) {
|
||||||
|
// If buffer size equals frame size, move the entire buffer
|
||||||
|
output_callback_(std::move(output_buffer_));
|
||||||
|
output_buffer_.clear();
|
||||||
|
output_buffer_.reserve(frame_samples_);
|
||||||
|
} else {
|
||||||
|
// If buffer size exceeds frame size, copy one frame and remove it
|
||||||
|
output_callback_(std::vector<int16_t>(output_buffer_.begin(), output_buffer_.begin() + frame_samples_));
|
||||||
|
output_buffer_.erase(output_buffer_.begin(), output_buffer_.begin() + frame_samples_);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -18,8 +18,8 @@ public:
|
|||||||
AfeAudioProcessor();
|
AfeAudioProcessor();
|
||||||
~AfeAudioProcessor();
|
~AfeAudioProcessor();
|
||||||
|
|
||||||
void Initialize(AudioCodec* codec) override;
|
void Initialize(AudioCodec* codec, int frame_duration_ms) override;
|
||||||
void Feed(const std::vector<int16_t>& data) override;
|
void Feed(std::vector<int16_t>&& data) override;
|
||||||
void Start() override;
|
void Start() override;
|
||||||
void Stop() override;
|
void Stop() override;
|
||||||
bool IsRunning() override;
|
bool IsRunning() override;
|
||||||
@@ -35,7 +35,10 @@ private:
|
|||||||
std::function<void(std::vector<int16_t>&& data)> output_callback_;
|
std::function<void(std::vector<int16_t>&& data)> output_callback_;
|
||||||
std::function<void(bool speaking)> vad_state_change_callback_;
|
std::function<void(bool speaking)> vad_state_change_callback_;
|
||||||
AudioCodec* codec_ = nullptr;
|
AudioCodec* codec_ = nullptr;
|
||||||
|
int frame_duration_ms_ = 0;
|
||||||
|
int frame_samples_ = 0;
|
||||||
bool is_speaking_ = false;
|
bool is_speaking_ = false;
|
||||||
|
std::vector<int16_t> output_buffer_;
|
||||||
|
|
||||||
void AudioProcessorTask();
|
void AudioProcessorTask();
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -3,16 +3,32 @@
|
|||||||
|
|
||||||
#define TAG "NoAudioProcessor"
|
#define TAG "NoAudioProcessor"
|
||||||
|
|
||||||
void NoAudioProcessor::Initialize(AudioCodec* codec) {
|
void NoAudioProcessor::Initialize(AudioCodec* codec, int frame_duration_ms) :
|
||||||
codec_ = codec;
|
codec_(codec),
|
||||||
|
frame_duration_ms_(frame_duration_ms) {
|
||||||
|
frame_samples_ = frame_duration_ms_ * codec_->input_sample_rate() / 1000;
|
||||||
}
|
}
|
||||||
|
|
||||||
void NoAudioProcessor::Feed(const std::vector<int16_t>& data) {
|
void NoAudioProcessor::Feed(std::vector<int16_t>&& data) {
|
||||||
if (!is_running_ || !output_callback_) {
|
if (!is_running_ || !output_callback_) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
// 直接将输入数据传递给输出回调
|
|
||||||
output_callback_(std::vector<int16_t>(data));
|
if (data.size() != frame_samples_) {
|
||||||
|
ESP_LOGE(TAG, "Feed data size is not equal to frame size, feed size: %u, frame size: %u", data.size(), frame_samples_);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (codec_->input_channels() == 2) {
|
||||||
|
// If input channels is 2, we need to fetch the left channel data
|
||||||
|
auto mono_data = std::vector<int16_t>(data.size() / 2);
|
||||||
|
for (size_t i = 0, j = 0; i < mono_data.size(); ++i, j += 2) {
|
||||||
|
mono_data[i] = data[j];
|
||||||
|
}
|
||||||
|
output_callback_(std::move(mono_data));
|
||||||
|
} else {
|
||||||
|
output_callback_(std::move(data));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void NoAudioProcessor::Start() {
|
void NoAudioProcessor::Start() {
|
||||||
@@ -39,8 +55,7 @@ size_t NoAudioProcessor::GetFeedSize() {
|
|||||||
if (!codec_) {
|
if (!codec_) {
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
// 返回一个固定的帧大小,比如 30ms 的数据
|
return frame_samples_;
|
||||||
return 30 * codec_->input_sample_rate() / 1000;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void NoAudioProcessor::EnableDeviceAec(bool enable) {
|
void NoAudioProcessor::EnableDeviceAec(bool enable) {
|
||||||
|
|||||||
@@ -12,8 +12,8 @@ public:
|
|||||||
NoAudioProcessor() = default;
|
NoAudioProcessor() = default;
|
||||||
~NoAudioProcessor() = default;
|
~NoAudioProcessor() = default;
|
||||||
|
|
||||||
void Initialize(AudioCodec* codec) override;
|
void Initialize(AudioCodec* codec, int frame_duration_ms) override;
|
||||||
void Feed(const std::vector<int16_t>& data) override;
|
void Feed(std::vector<int16_t>&& data) override;
|
||||||
void Start() override;
|
void Start() override;
|
||||||
void Stop() override;
|
void Stop() override;
|
||||||
bool IsRunning() override;
|
bool IsRunning() override;
|
||||||
@@ -24,6 +24,8 @@ public:
|
|||||||
|
|
||||||
private:
|
private:
|
||||||
AudioCodec* codec_ = nullptr;
|
AudioCodec* codec_ = nullptr;
|
||||||
|
int frame_duration_ms_ = 0;
|
||||||
|
int frame_samples_ = 0;
|
||||||
std::function<void(std::vector<int16_t>&& data)> output_callback_;
|
std::function<void(std::vector<int16_t>&& data)> output_callback_;
|
||||||
std::function<void(bool speaking)> vad_state_change_callback_;
|
std::function<void(bool speaking)> vad_state_change_callback_;
|
||||||
bool is_running_ = false;
|
bool is_running_ = false;
|
||||||
|
|||||||
Reference in New Issue
Block a user