forked from xiaozhi/xiaozhi-esp32
feat(audio): Use esp_audio_codec and esp_audio_effects to replace 78opus (#1632)
This commit is contained in:
@@ -2,6 +2,26 @@
|
||||
#include <esp_log.h>
|
||||
#include <cstring>
|
||||
|
||||
#define RATE_CVT_CFG(_src_rate, _dest_rate, _channel) \
|
||||
(esp_ae_rate_cvt_cfg_t) \
|
||||
{ \
|
||||
.src_rate = (uint32_t)(_src_rate), \
|
||||
.dest_rate = (uint32_t)(_dest_rate), \
|
||||
.channel = (uint8_t)(_channel), \
|
||||
.bits_per_sample = ESP_AUDIO_BIT16, \
|
||||
.complexity = 2, \
|
||||
.perf_type = ESP_AE_RATE_CVT_PERF_TYPE_SPEED, \
|
||||
}
|
||||
|
||||
#define OPUS_DEC_CFG(_sample_rate, _frame_duration_ms) \
|
||||
(esp_opus_dec_cfg_t) \
|
||||
{ \
|
||||
.sample_rate = (uint32_t)(_sample_rate), \
|
||||
.channel = ESP_AUDIO_MONO, \
|
||||
.frame_duration = (esp_opus_dec_frame_duration_t)AS_OPUS_GET_FRAME_DRU_ENUM(_frame_duration_ms), \
|
||||
.self_delimited = false, \
|
||||
}
|
||||
|
||||
#if CONFIG_USE_AUDIO_PROCESSOR
|
||||
#include "processors/afe_audio_processor.h"
|
||||
#else
|
||||
@@ -17,7 +37,6 @@
|
||||
|
||||
#define TAG "AudioService"
|
||||
|
||||
|
||||
AudioService::AudioService() {
|
||||
event_group_ = xEventGroupCreate();
|
||||
}
|
||||
@@ -26,21 +45,51 @@ AudioService::~AudioService() {
|
||||
if (event_group_ != nullptr) {
|
||||
vEventGroupDelete(event_group_);
|
||||
}
|
||||
if (opus_encoder_ != nullptr) {
|
||||
esp_opus_enc_close(opus_encoder_);
|
||||
}
|
||||
if (opus_decoder_ != nullptr) {
|
||||
esp_opus_dec_close(opus_decoder_);
|
||||
}
|
||||
if (input_resampler_ != nullptr) {
|
||||
esp_ae_rate_cvt_close(input_resampler_);
|
||||
}
|
||||
if (output_resampler_ != nullptr) {
|
||||
esp_ae_rate_cvt_close(output_resampler_);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void AudioService::Initialize(AudioCodec* codec) {
|
||||
codec_ = codec;
|
||||
codec_->Start();
|
||||
|
||||
/* Setup the audio codec */
|
||||
opus_decoder_ = std::make_unique<OpusDecoderWrapper>(codec->output_sample_rate(), 1, OPUS_FRAME_DURATION_MS);
|
||||
opus_encoder_ = std::make_unique<OpusEncoderWrapper>(16000, 1, OPUS_FRAME_DURATION_MS);
|
||||
opus_encoder_->SetComplexity(0);
|
||||
esp_opus_dec_cfg_t opus_dec_cfg = OPUS_DEC_CFG(codec->output_sample_rate(), OPUS_FRAME_DURATION_MS);
|
||||
auto ret = esp_opus_dec_open(&opus_dec_cfg, sizeof(esp_opus_dec_cfg_t), &opus_decoder_);
|
||||
if (opus_decoder_ == nullptr) {
|
||||
ESP_LOGE(TAG, "Failed to create audio decoder, error code: %d", ret);
|
||||
} else {
|
||||
decoder_sample_rate_ = codec->output_sample_rate();
|
||||
decoder_duration_ms_ = OPUS_FRAME_DURATION_MS;
|
||||
decoder_frame_size_ = decoder_sample_rate_ / 1000 * OPUS_FRAME_DURATION_MS;
|
||||
}
|
||||
esp_opus_enc_config_t opus_enc_cfg = AS_OPUS_ENC_CONFIG();
|
||||
ret = esp_opus_enc_open(&opus_enc_cfg, sizeof(esp_opus_enc_config_t), &opus_encoder_);
|
||||
if (opus_encoder_ == nullptr) {
|
||||
ESP_LOGE(TAG, "Failed to create audio encoder, error code: %d", ret);
|
||||
} else {
|
||||
encoder_sample_rate_ = 16000;
|
||||
encoder_duration_ms_ = OPUS_FRAME_DURATION_MS;
|
||||
esp_opus_enc_get_frame_size(opus_encoder_, &encoder_frame_size_, &encoder_outbuf_size_);
|
||||
encoder_frame_size_ = encoder_frame_size_ / sizeof(int16_t);
|
||||
}
|
||||
|
||||
if (codec->input_sample_rate() != 16000) {
|
||||
input_resampler_.Configure(codec->input_sample_rate(), 16000);
|
||||
reference_resampler_.Configure(codec->input_sample_rate(), 16000);
|
||||
esp_ae_rate_cvt_cfg_t input_resampler_cfg = RATE_CVT_CFG(
|
||||
codec->input_sample_rate(), ESP_AUDIO_SAMPLE_RATE_16K, codec->input_channels());
|
||||
auto resampler_ret = esp_ae_rate_cvt_open(&input_resampler_cfg, &input_resampler_);
|
||||
if (input_resampler_ == nullptr) {
|
||||
ESP_LOGE(TAG, "Failed to create input resampler, error code: %d", resampler_ret);
|
||||
}
|
||||
}
|
||||
|
||||
#if CONFIG_USE_AUDIO_PROCESSOR
|
||||
@@ -114,7 +163,7 @@ void AudioService::Start() {
|
||||
AudioService* audio_service = (AudioService*)arg;
|
||||
audio_service->OpusCodecTask();
|
||||
vTaskDelete(NULL);
|
||||
}, "opus_codec", 2048 * 13, this, 2, &opus_codec_task_handle_);
|
||||
}, "opus_codec", 2048 * 12, this, 2, &opus_codec_task_handle_);
|
||||
}
|
||||
|
||||
void AudioService::Stop() {
|
||||
@@ -144,25 +193,15 @@ bool AudioService::ReadAudioData(std::vector<int16_t>& data, int sample_rate, in
|
||||
if (!codec_->InputData(data)) {
|
||||
return false;
|
||||
}
|
||||
if (codec_->input_channels() == 2) {
|
||||
auto mic_channel = std::vector<int16_t>(data.size() / 2);
|
||||
auto reference_channel = std::vector<int16_t>(data.size() / 2);
|
||||
for (size_t i = 0, j = 0; i < mic_channel.size(); ++i, j += 2) {
|
||||
mic_channel[i] = data[j];
|
||||
reference_channel[i] = data[j + 1];
|
||||
}
|
||||
auto resampled_mic = std::vector<int16_t>(input_resampler_.GetOutputSamples(mic_channel.size()));
|
||||
auto resampled_reference = std::vector<int16_t>(reference_resampler_.GetOutputSamples(reference_channel.size()));
|
||||
input_resampler_.Process(mic_channel.data(), mic_channel.size(), resampled_mic.data());
|
||||
reference_resampler_.Process(reference_channel.data(), reference_channel.size(), resampled_reference.data());
|
||||
data.resize(resampled_mic.size() + resampled_reference.size());
|
||||
for (size_t i = 0, j = 0; i < resampled_mic.size(); ++i, j += 2) {
|
||||
data[j] = resampled_mic[i];
|
||||
data[j + 1] = resampled_reference[i];
|
||||
}
|
||||
} else {
|
||||
auto resampled = std::vector<int16_t>(input_resampler_.GetOutputSamples(data.size()));
|
||||
input_resampler_.Process(data.data(), data.size(), resampled.data());
|
||||
if (input_resampler_ != nullptr) {
|
||||
uint32_t in_sample_num = data.size() / codec_->input_channels();
|
||||
uint32_t output_samples = 0;
|
||||
esp_ae_rate_cvt_get_max_out_sample_num(input_resampler_, in_sample_num, &output_samples);
|
||||
auto resampled = std::vector<int16_t>(output_samples * codec_->input_channels());
|
||||
uint32_t actual_output = output_samples;
|
||||
esp_ae_rate_cvt_process(input_resampler_, (esp_ae_sample_t)data.data(), in_sample_num,
|
||||
(esp_ae_sample_t)resampled.data(), &actual_output);
|
||||
resampled.resize(actual_output * codec_->input_channels());
|
||||
data = std::move(resampled);
|
||||
}
|
||||
} else {
|
||||
@@ -316,25 +355,49 @@ void AudioService::OpusCodecTask() {
|
||||
task->timestamp = packet->timestamp;
|
||||
|
||||
SetDecodeSampleRate(packet->sample_rate, packet->frame_duration);
|
||||
if (opus_decoder_->Decode(std::move(packet->payload), task->pcm)) {
|
||||
// Resample if the sample rate is different
|
||||
if (opus_decoder_->sample_rate() != codec_->output_sample_rate()) {
|
||||
int target_size = output_resampler_.GetOutputSamples(task->pcm.size());
|
||||
std::vector<int16_t> resampled(target_size);
|
||||
output_resampler_.Process(task->pcm.data(), task->pcm.size(), resampled.data());
|
||||
task->pcm = std::move(resampled);
|
||||
if (opus_decoder_ != nullptr) {
|
||||
task->pcm.resize(decoder_frame_size_);
|
||||
esp_audio_dec_in_raw_t raw = {
|
||||
.buffer = (uint8_t *)(packet->payload.data()),
|
||||
.len = (uint32_t)(packet->payload.size()),
|
||||
.consumed = 0,
|
||||
.frame_recover = ESP_AUDIO_DEC_RECOVERY_NONE,
|
||||
};
|
||||
esp_audio_dec_out_frame_t out_frame = {
|
||||
.buffer = (uint8_t *)(task->pcm.data()),
|
||||
.len = (uint32_t)(task->pcm.size() * sizeof(int16_t)),
|
||||
.decoded_size = 0,
|
||||
};
|
||||
esp_audio_dec_info_t dec_info = {};
|
||||
std::unique_lock<std::mutex> decoder_lock(decoder_mutex_);
|
||||
auto ret = esp_opus_dec_decode(opus_decoder_, &raw, &out_frame, &dec_info);
|
||||
decoder_lock.unlock();
|
||||
if (ret == ESP_AUDIO_ERR_OK) {
|
||||
task->pcm.resize(out_frame.decoded_size / sizeof(int16_t));
|
||||
if (decoder_sample_rate_ != codec_->output_sample_rate() && output_resampler_ != nullptr) {
|
||||
uint32_t target_size = 0;
|
||||
esp_ae_rate_cvt_get_max_out_sample_num(output_resampler_, task->pcm.size(), &target_size);
|
||||
std::vector<int16_t> resampled(target_size);
|
||||
uint32_t actual_output = target_size;
|
||||
esp_ae_rate_cvt_process(output_resampler_, (esp_ae_sample_t)task->pcm.data(), task->pcm.size(),
|
||||
(esp_ae_sample_t)resampled.data(), &actual_output);
|
||||
resampled.resize(actual_output);
|
||||
task->pcm = std::move(resampled);
|
||||
}
|
||||
lock.lock();
|
||||
audio_playback_queue_.push_back(std::move(task));
|
||||
audio_queue_cv_.notify_all();
|
||||
debug_statistics_.decode_count++;
|
||||
} else {
|
||||
ESP_LOGE(TAG, "Failed to decode audio after resize, error code: %d", ret);
|
||||
lock.lock();
|
||||
}
|
||||
|
||||
lock.lock();
|
||||
audio_playback_queue_.push_back(std::move(task));
|
||||
audio_queue_cv_.notify_all();
|
||||
} else {
|
||||
ESP_LOGE(TAG, "Failed to decode audio");
|
||||
ESP_LOGE(TAG, "Audio decoder is not configured");
|
||||
lock.lock();
|
||||
}
|
||||
debug_statistics_.decode_count++;
|
||||
}
|
||||
|
||||
/* Encode the audio to send queue */
|
||||
if (!audio_encode_queue_.empty() && audio_send_queue_.size() < MAX_SEND_PACKETS_IN_QUEUE) {
|
||||
auto task = std::move(audio_encode_queue_.front());
|
||||
@@ -346,24 +409,42 @@ void AudioService::OpusCodecTask() {
|
||||
packet->frame_duration = OPUS_FRAME_DURATION_MS;
|
||||
packet->sample_rate = 16000;
|
||||
packet->timestamp = task->timestamp;
|
||||
if (!opus_encoder_->Encode(std::move(task->pcm), packet->payload)) {
|
||||
ESP_LOGE(TAG, "Failed to encode audio");
|
||||
continue;
|
||||
}
|
||||
|
||||
if (task->type == kAudioTaskTypeEncodeToSendQueue) {
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(audio_queue_mutex_);
|
||||
audio_send_queue_.push_back(std::move(packet));
|
||||
if (opus_encoder_ != nullptr && task->pcm.size() == encoder_frame_size_) {
|
||||
std::vector<uint8_t> buf(encoder_outbuf_size_);
|
||||
esp_audio_enc_in_frame_t in = {
|
||||
.buffer = (uint8_t *)(task->pcm.data()),
|
||||
.len = (uint32_t)(encoder_frame_size_ * sizeof(int16_t)),
|
||||
};
|
||||
esp_audio_enc_out_frame_t out = {
|
||||
.buffer = buf.data(),
|
||||
.len = (uint32_t)encoder_outbuf_size_,
|
||||
.encoded_bytes = 0,
|
||||
};
|
||||
auto ret = esp_opus_enc_process(opus_encoder_, &in, &out);
|
||||
if (ret == ESP_AUDIO_ERR_OK) {
|
||||
packet->payload.assign(buf.data(), buf.data() + out.encoded_bytes);
|
||||
|
||||
if (task->type == kAudioTaskTypeEncodeToSendQueue) {
|
||||
{
|
||||
std::lock_guard<std::mutex> lock2(audio_queue_mutex_);
|
||||
audio_send_queue_.push_back(std::move(packet));
|
||||
}
|
||||
if (callbacks_.on_send_queue_available) {
|
||||
callbacks_.on_send_queue_available();
|
||||
}
|
||||
} else if (task->type == kAudioTaskTypeEncodeToTestingQueue) {
|
||||
std::lock_guard<std::mutex> lock2(audio_queue_mutex_);
|
||||
audio_testing_queue_.push_back(std::move(packet));
|
||||
}
|
||||
debug_statistics_.encode_count++;
|
||||
} else {
|
||||
ESP_LOGE(TAG, "Failed to encode audio, error code: %d", ret);
|
||||
}
|
||||
if (callbacks_.on_send_queue_available) {
|
||||
callbacks_.on_send_queue_available();
|
||||
}
|
||||
} else if (task->type == kAudioTaskTypeEncodeToTestingQueue) {
|
||||
std::lock_guard<std::mutex> lock(audio_queue_mutex_);
|
||||
audio_testing_queue_.push_back(std::move(packet));
|
||||
} else {
|
||||
ESP_LOGE(TAG, "Failed to encode audio: encoder not configured or invalid frame size (got %u, expected %u)",
|
||||
task->pcm.size(), encoder_frame_size_);
|
||||
}
|
||||
debug_statistics_.encode_count++;
|
||||
lock.lock();
|
||||
}
|
||||
}
|
||||
@@ -372,17 +453,38 @@ void AudioService::OpusCodecTask() {
|
||||
}
|
||||
|
||||
void AudioService::SetDecodeSampleRate(int sample_rate, int frame_duration) {
|
||||
if (opus_decoder_->sample_rate() == sample_rate && opus_decoder_->duration_ms() == frame_duration) {
|
||||
if (decoder_sample_rate_ == sample_rate && decoder_duration_ms_ == frame_duration) {
|
||||
return;
|
||||
}
|
||||
|
||||
opus_decoder_.reset();
|
||||
opus_decoder_ = std::make_unique<OpusDecoderWrapper>(sample_rate, 1, frame_duration);
|
||||
std::unique_lock<std::mutex> decoder_lock(decoder_mutex_);
|
||||
if (opus_decoder_ != nullptr) {
|
||||
esp_opus_dec_close(opus_decoder_);
|
||||
opus_decoder_ = nullptr;
|
||||
}
|
||||
decoder_lock.unlock();
|
||||
esp_opus_dec_cfg_t opus_dec_cfg = OPUS_DEC_CFG(sample_rate, frame_duration);
|
||||
auto ret = esp_opus_dec_open(&opus_dec_cfg, sizeof(esp_opus_dec_cfg_t), &opus_decoder_);
|
||||
if (opus_decoder_ == nullptr) {
|
||||
ESP_LOGE(TAG, "Failed to create audio decoder, error code: %d", ret);
|
||||
return;
|
||||
}
|
||||
decoder_sample_rate_ = sample_rate;
|
||||
decoder_duration_ms_ = frame_duration;
|
||||
decoder_frame_size_ = decoder_sample_rate_ / 1000 * frame_duration;
|
||||
|
||||
auto codec = Board::GetInstance().GetAudioCodec();
|
||||
if (opus_decoder_->sample_rate() != codec->output_sample_rate()) {
|
||||
ESP_LOGI(TAG, "Resampling audio from %d to %d", opus_decoder_->sample_rate(), codec->output_sample_rate());
|
||||
output_resampler_.Configure(opus_decoder_->sample_rate(), codec->output_sample_rate());
|
||||
if (decoder_sample_rate_ != codec->output_sample_rate()) {
|
||||
ESP_LOGI(TAG, "Resampling audio from %d to %d", decoder_sample_rate_, codec->output_sample_rate());
|
||||
if (output_resampler_ != nullptr) {
|
||||
esp_ae_rate_cvt_close(output_resampler_);
|
||||
output_resampler_ = nullptr;
|
||||
}
|
||||
esp_ae_rate_cvt_cfg_t output_resampler_cfg = RATE_CVT_CFG(
|
||||
decoder_sample_rate_, codec->output_sample_rate(), ESP_AUDIO_MONO);
|
||||
auto resampler_ret = esp_ae_rate_cvt_open(&output_resampler_cfg, &output_resampler_);
|
||||
if (output_resampler_ == nullptr) {
|
||||
ESP_LOGE(TAG, "Failed to create output resampler, error code: %d", resampler_ret);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -390,7 +492,6 @@ void AudioService::PushTaskToEncodeQueue(AudioTaskType type, std::vector<int16_t
|
||||
auto task = std::make_unique<AudioTask>();
|
||||
task->type = type;
|
||||
task->pcm = std::move(pcm);
|
||||
|
||||
/* Push the task to the encode queue */
|
||||
std::unique_lock<std::mutex> lock(audio_queue_mutex_);
|
||||
|
||||
@@ -580,18 +681,16 @@ void AudioService::PlaySound(const std::string_view& ogg) {
|
||||
// 解析OpusHead包
|
||||
if (pkt_len >= 19 && std::memcmp(pkt_ptr, "OpusHead", 8) == 0) {
|
||||
seen_head = true;
|
||||
|
||||
// OpusHead结构:[0-7] "OpusHead", [8] version, [9] channel_count, [10-11] pre_skip
|
||||
// [12-15] input_sample_rate, [16-17] output_gain, [18] mapping_family
|
||||
if (pkt_len >= 12) {
|
||||
uint8_t version = pkt_ptr[8];
|
||||
uint8_t channel_count = pkt_ptr[9];
|
||||
|
||||
if (pkt_len >= 16) {
|
||||
// 读取输入采样率 (little-endian)
|
||||
sample_rate = pkt_ptr[12] | (pkt_ptr[13] << 8) |
|
||||
sample_rate = pkt_ptr[12] | (pkt_ptr[13] << 8) |
|
||||
(pkt_ptr[14] << 16) | (pkt_ptr[15] << 24);
|
||||
ESP_LOGI(TAG, "OpusHead: version=%d, channels=%d, sample_rate=%d",
|
||||
ESP_LOGI(TAG, "OpusHead: version=%d, channels=%d, sample_rate=%d",
|
||||
version, channel_count, sample_rate);
|
||||
}
|
||||
}
|
||||
@@ -626,7 +725,11 @@ bool AudioService::IsIdle() {
|
||||
|
||||
void AudioService::ResetDecoder() {
|
||||
std::lock_guard<std::mutex> lock(audio_queue_mutex_);
|
||||
opus_decoder_->ResetState();
|
||||
std::unique_lock<std::mutex> decoder_lock(decoder_mutex_);
|
||||
if (opus_decoder_ != nullptr) {
|
||||
esp_opus_dec_reset(opus_decoder_);
|
||||
}
|
||||
decoder_lock.unlock();
|
||||
timestamp_queue_.clear();
|
||||
audio_decode_queue_.clear();
|
||||
audio_playback_queue_.clear();
|
||||
|
||||
@@ -12,10 +12,11 @@
|
||||
#include <freertos/event_groups.h>
|
||||
#include <esp_timer.h>
|
||||
#include <model_path.h>
|
||||
|
||||
#include <opus_encoder.h>
|
||||
#include <opus_decoder.h>
|
||||
#include <opus_resampler.h>
|
||||
#include "esp_audio_enc.h"
|
||||
#include "esp_opus_enc.h"
|
||||
#include "esp_opus_dec.h"
|
||||
#include "esp_ae_rate_cvt.h"
|
||||
#include "esp_audio_types.h"
|
||||
|
||||
#include "audio_codec.h"
|
||||
#include "audio_processor.h"
|
||||
@@ -46,12 +47,34 @@
|
||||
#define AUDIO_POWER_TIMEOUT_MS 15000
|
||||
#define AUDIO_POWER_CHECK_INTERVAL_MS 1000
|
||||
|
||||
|
||||
#define AS_EVENT_AUDIO_TESTING_RUNNING (1 << 0)
|
||||
#define AS_EVENT_WAKE_WORD_RUNNING (1 << 1)
|
||||
#define AS_EVENT_AUDIO_PROCESSOR_RUNNING (1 << 2)
|
||||
#define AS_EVENT_PLAYBACK_NOT_EMPTY (1 << 3)
|
||||
|
||||
#define AS_OPUS_GET_FRAME_DRU_ENUM(duration_ms) \
|
||||
((duration_ms) == 5 ? ESP_OPUS_ENC_FRAME_DURATION_5_MS : \
|
||||
(duration_ms) == 10 ? ESP_OPUS_ENC_FRAME_DURATION_10_MS : \
|
||||
(duration_ms) == 20 ? ESP_OPUS_ENC_FRAME_DURATION_20_MS : \
|
||||
(duration_ms) == 40 ? ESP_OPUS_ENC_FRAME_DURATION_40_MS : \
|
||||
(duration_ms) == 60 ? ESP_OPUS_ENC_FRAME_DURATION_60_MS : \
|
||||
(duration_ms) == 80 ? ESP_OPUS_ENC_FRAME_DURATION_80_MS : \
|
||||
(duration_ms) == 100 ? ESP_OPUS_ENC_FRAME_DURATION_100_MS : \
|
||||
(duration_ms) == 120 ? ESP_OPUS_ENC_FRAME_DURATION_120_MS : -1)
|
||||
|
||||
#define AS_OPUS_ENC_CONFIG() { \
|
||||
.sample_rate = ESP_AUDIO_SAMPLE_RATE_16K, \
|
||||
.channel = ESP_AUDIO_MONO, \
|
||||
.bits_per_sample = ESP_AUDIO_BIT16, \
|
||||
.bitrate = ESP_OPUS_BITRATE_AUTO, \
|
||||
.frame_duration = (esp_opus_enc_frame_duration_t)AS_OPUS_GET_FRAME_DRU_ENUM(OPUS_FRAME_DURATION_MS), \
|
||||
.application_mode = ESP_OPUS_ENC_APPLICATION_AUDIO, \
|
||||
.complexity = 0, \
|
||||
.enable_fec = false, \
|
||||
.enable_dtx = true, \
|
||||
.enable_vbr = true, \
|
||||
}
|
||||
|
||||
struct AudioServiceCallbacks {
|
||||
std::function<void(void)> on_send_queue_available;
|
||||
std::function<void(const std::string&)> on_wake_word_detected;
|
||||
@@ -116,11 +139,20 @@ private:
|
||||
std::unique_ptr<AudioProcessor> audio_processor_;
|
||||
std::unique_ptr<WakeWord> wake_word_;
|
||||
std::unique_ptr<AudioDebugger> audio_debugger_;
|
||||
std::unique_ptr<OpusEncoderWrapper> opus_encoder_;
|
||||
std::unique_ptr<OpusDecoderWrapper> opus_decoder_;
|
||||
OpusResampler input_resampler_;
|
||||
OpusResampler reference_resampler_;
|
||||
OpusResampler output_resampler_;
|
||||
void* opus_encoder_ = nullptr;
|
||||
void* opus_decoder_ = nullptr;
|
||||
std::mutex decoder_mutex_;
|
||||
esp_ae_rate_cvt_handle_t input_resampler_ = nullptr;
|
||||
esp_ae_rate_cvt_handle_t output_resampler_ = nullptr;
|
||||
|
||||
// Encoder/Decoder state
|
||||
int encoder_sample_rate_ = 16000;
|
||||
int encoder_duration_ms_ = OPUS_FRAME_DURATION_MS;
|
||||
int encoder_frame_size_ = 0;
|
||||
int encoder_outbuf_size_ = 0;
|
||||
int decoder_sample_rate_ = 0;
|
||||
int decoder_duration_ms_ = OPUS_FRAME_DURATION_MS;
|
||||
int decoder_frame_size_ = 0;
|
||||
DebugStatistics debug_statistics_;
|
||||
srmodel_list_t* models_list_ = nullptr;
|
||||
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
#include "afe_wake_word.h"
|
||||
#include "audio_service.h"
|
||||
|
||||
#include <esp_log.h>
|
||||
#include <sstream>
|
||||
|
||||
@@ -157,7 +156,7 @@ void AfeWakeWord::StoreWakeWordData(const int16_t* data, size_t samples) {
|
||||
}
|
||||
|
||||
void AfeWakeWord::EncodeWakeWordData() {
|
||||
const size_t stack_size = 4096 * 7;
|
||||
const size_t stack_size = 4096 * 6;
|
||||
wake_word_opus_.clear();
|
||||
if (wake_word_encode_task_stack_ == nullptr) {
|
||||
wake_word_encode_task_stack_ = (StackType_t*)heap_caps_malloc(stack_size, MALLOC_CAP_SPIRAM);
|
||||
@@ -172,20 +171,62 @@ void AfeWakeWord::EncodeWakeWordData() {
|
||||
auto this_ = (AfeWakeWord*)arg;
|
||||
{
|
||||
auto start_time = esp_timer_get_time();
|
||||
auto encoder = std::make_unique<OpusEncoderWrapper>(16000, 1, OPUS_FRAME_DURATION_MS);
|
||||
encoder->SetComplexity(0); // 0 is the fastest
|
||||
|
||||
// Create encoder
|
||||
esp_opus_enc_config_t opus_enc_cfg = AS_OPUS_ENC_CONFIG();
|
||||
void* encoder_handle = nullptr;
|
||||
auto ret = esp_opus_enc_open(&opus_enc_cfg, sizeof(esp_opus_enc_config_t), &encoder_handle);
|
||||
if (encoder_handle == nullptr) {
|
||||
ESP_LOGE(TAG, "Failed to create audio encoder, error code: %d", ret);
|
||||
std::lock_guard<std::mutex> lock(this_->wake_word_mutex_);
|
||||
this_->wake_word_opus_.push_back(std::vector<uint8_t>());
|
||||
this_->wake_word_cv_.notify_all();
|
||||
return;
|
||||
}
|
||||
|
||||
// Get frame size
|
||||
int frame_size = 0;
|
||||
int outbuf_size = 0;
|
||||
esp_opus_enc_get_frame_size(encoder_handle, &frame_size, &outbuf_size);
|
||||
frame_size = frame_size / sizeof(int16_t);
|
||||
|
||||
// Encode all PCM data
|
||||
int packets = 0;
|
||||
std::vector<int16_t> in_buffer;
|
||||
esp_audio_enc_in_frame_t in = {};
|
||||
esp_audio_enc_out_frame_t out = {};
|
||||
|
||||
for (auto& pcm: this_->wake_word_pcm_) {
|
||||
encoder->Encode(std::move(pcm), [this_](std::vector<uint8_t>&& opus) {
|
||||
std::lock_guard<std::mutex> lock(this_->wake_word_mutex_);
|
||||
this_->wake_word_opus_.emplace_back(std::move(opus));
|
||||
this_->wake_word_cv_.notify_all();
|
||||
});
|
||||
packets++;
|
||||
if (in_buffer.empty()) {
|
||||
in_buffer = std::move(pcm);
|
||||
} else {
|
||||
in_buffer.reserve(in_buffer.size() + pcm.size());
|
||||
in_buffer.insert(in_buffer.end(), pcm.begin(), pcm.end());
|
||||
}
|
||||
|
||||
while (in_buffer.size() >= frame_size) {
|
||||
std::vector<uint8_t> opus_buf(outbuf_size);
|
||||
in.buffer = (uint8_t *)(in_buffer.data());
|
||||
in.len = (uint32_t)(frame_size * sizeof(int16_t));
|
||||
out.buffer = opus_buf.data();
|
||||
out.len = outbuf_size;
|
||||
out.encoded_bytes = 0;
|
||||
|
||||
ret = esp_opus_enc_process(encoder_handle, &in, &out);
|
||||
if (ret == ESP_AUDIO_ERR_OK) {
|
||||
std::lock_guard<std::mutex> lock(this_->wake_word_mutex_);
|
||||
this_->wake_word_opus_.emplace_back(opus_buf.data(), opus_buf.data() + out.encoded_bytes);
|
||||
this_->wake_word_cv_.notify_all();
|
||||
packets++;
|
||||
} else {
|
||||
ESP_LOGE(TAG, "Failed to encode audio, error code: %d", ret);
|
||||
}
|
||||
|
||||
in_buffer.erase(in_buffer.begin(), in_buffer.begin() + frame_size);
|
||||
}
|
||||
}
|
||||
this_->wake_word_pcm_.clear();
|
||||
|
||||
// Close encoder
|
||||
esp_opus_enc_close(encoder_handle);
|
||||
auto end_time = esp_timer_get_time();
|
||||
ESP_LOGI(TAG, "Encode wake word opus %d packets in %ld ms", packets, (long)((end_time - start_time) / 1000));
|
||||
|
||||
|
||||
@@ -9,10 +9,8 @@
|
||||
#include <esp_mn_speech_commands.h>
|
||||
#include <cJSON.h>
|
||||
|
||||
|
||||
#define TAG "CustomWakeWord"
|
||||
|
||||
|
||||
CustomWakeWord::CustomWakeWord()
|
||||
: wake_word_pcm_(), wake_word_opus_() {
|
||||
}
|
||||
@@ -218,20 +216,56 @@ void CustomWakeWord::EncodeWakeWordData() {
|
||||
auto this_ = (CustomWakeWord*)arg;
|
||||
{
|
||||
auto start_time = esp_timer_get_time();
|
||||
auto encoder = std::make_unique<OpusEncoderWrapper>(16000, 1, OPUS_FRAME_DURATION_MS);
|
||||
encoder->SetComplexity(0); // 0 is the fastest
|
||||
|
||||
// Create encoder
|
||||
esp_opus_enc_config_t opus_enc_cfg = AS_OPUS_ENC_CONFIG();
|
||||
void* encoder_handle = nullptr;
|
||||
auto ret = esp_opus_enc_open(&opus_enc_cfg, sizeof(esp_opus_enc_config_t), &encoder_handle);
|
||||
if (encoder_handle == nullptr) {
|
||||
ESP_LOGE(TAG, "Failed to create audio encoder, error code: %d", ret);
|
||||
std::lock_guard<std::mutex> lock(this_->wake_word_mutex_);
|
||||
this_->wake_word_opus_.push_back(std::vector<uint8_t>());
|
||||
this_->wake_word_cv_.notify_all();
|
||||
return;
|
||||
}
|
||||
// Get frame size
|
||||
int frame_size = 0;
|
||||
int outbuf_size = 0;
|
||||
esp_opus_enc_get_frame_size(encoder_handle, &frame_size, &outbuf_size);
|
||||
frame_size = frame_size / sizeof(int16_t);
|
||||
// Encode all PCM data
|
||||
int packets = 0;
|
||||
std::vector<int16_t> in_buffer;
|
||||
esp_audio_enc_in_frame_t in = {};
|
||||
esp_audio_enc_out_frame_t out = {};
|
||||
for (auto& pcm: this_->wake_word_pcm_) {
|
||||
encoder->Encode(std::move(pcm), [this_](std::vector<uint8_t>&& opus) {
|
||||
std::lock_guard<std::mutex> lock(this_->wake_word_mutex_);
|
||||
this_->wake_word_opus_.emplace_back(std::move(opus));
|
||||
this_->wake_word_cv_.notify_all();
|
||||
});
|
||||
packets++;
|
||||
if (in_buffer.empty()) {
|
||||
in_buffer = std::move(pcm);
|
||||
} else {
|
||||
in_buffer.reserve(in_buffer.size() + pcm.size());
|
||||
in_buffer.insert(in_buffer.end(), pcm.begin(), pcm.end());
|
||||
}
|
||||
while (in_buffer.size() >= frame_size) {
|
||||
std::vector<uint8_t> opus_buf(outbuf_size);
|
||||
in.buffer = (uint8_t *)(in_buffer.data());
|
||||
in.len = (uint32_t)(frame_size * sizeof(int16_t));
|
||||
out.buffer = opus_buf.data();
|
||||
out.len = outbuf_size;
|
||||
out.encoded_bytes = 0;
|
||||
ret = esp_opus_enc_process(encoder_handle, &in, &out);
|
||||
if (ret == ESP_AUDIO_ERR_OK) {
|
||||
std::lock_guard<std::mutex> lock(this_->wake_word_mutex_);
|
||||
this_->wake_word_opus_.emplace_back(opus_buf.data(), opus_buf.data() + out.encoded_bytes);
|
||||
this_->wake_word_cv_.notify_all();
|
||||
packets++;
|
||||
} else {
|
||||
ESP_LOGE(TAG, "Failed to encode audio, error code: %d", ret);
|
||||
}
|
||||
in_buffer.erase(in_buffer.begin(), in_buffer.begin() + frame_size);
|
||||
}
|
||||
}
|
||||
this_->wake_word_pcm_.clear();
|
||||
|
||||
// Close encoder
|
||||
esp_opus_enc_close(encoder_handle);
|
||||
auto end_time = esp_timer_get_time();
|
||||
ESP_LOGI(TAG, "Encode wake word opus %d packets in %ld ms", packets, (long)((end_time - start_time) / 1000));
|
||||
|
||||
|
||||
@@ -9,7 +9,6 @@
|
||||
#include <freertos/FreeRTOS.h>
|
||||
#include <freertos/task.h>
|
||||
#include <font_awesome.h>
|
||||
#include <opus_encoder.h>
|
||||
#include <utility>
|
||||
|
||||
static const char *TAG = "Ml307Board";
|
||||
|
||||
@@ -19,7 +19,8 @@ dependencies:
|
||||
espressif/esp_lcd_panel_io_additions: ^1.0.1
|
||||
78/esp_lcd_nv3023: ~1.0.0
|
||||
78/esp-wifi-connect: ~3.0.2
|
||||
78/esp-opus-encoder: ~2.4.1
|
||||
espressif/esp_audio_effects: ~1.2.0
|
||||
espressif/esp_audio_codec: ~2.4.0
|
||||
78/esp-ml307: ~3.5.3
|
||||
78/xiaozhi-fonts: ~1.5.5
|
||||
espressif/led_strip: ~3.0.1
|
||||
|
||||
@@ -278,7 +278,7 @@ if __name__ == "__main__":
|
||||
|
||||
# Compile mode
|
||||
board_type_input: str = args.board
|
||||
name_filter: str | None = args.name
|
||||
name_filter: Optional[str] = args.name
|
||||
|
||||
# Check board_type in CMakeLists
|
||||
if board_type_input != "all" and not _board_type_exists(board_type_input):
|
||||
|
||||
Reference in New Issue
Block a user