ESP-BOX-3 / LichuangDev enable AEC to support realtime chat (#429)

* read frame duration from server * fit wechat style emoji size * Make Wechat UI look better * Add Realtime Chat to ESP-BOX-3/LichuangDev * disable debug log * Fix Sh1106 Compile Error Bug. IDF 5.3.2 Not supporting sh1106-esp-idf. (#424) * Fix ESP32 Board Led No Light Bug (#427) * add board esp32-s3-touch-lcd-3.5 (#415) * add board esp32-s3-touch-lcd-3.5 * add axp2101 --------- Co-authored-by: flyingtjy <flyingtjy@gmail.com> --------- Co-authored-by: ooxxU <71391474@qq.com> Co-authored-by: flying1425 <79792003+flying1425@users.noreply.github.com> Co-authored-by: flyingtjy <flyingtjy@gmail.com>
2025-03-30 09:07:08 +08:00
parent e4c76eaa46
commit fa899a310e
20 changed files with 271 additions and 243 deletions
--- a/main/application.cc
+++ b/main/application.cc
@@ -203,6 +203,7 @@ void Application::Alert(const char* status, const char* message, const char* emo
    display->SetEmotion(emotion);
    display->SetChatMessage("system", message);
    if (!sound.empty()) {
+        ResetDecoder();
        PlaySound(sound);
    }
 }
@@ -217,9 +218,8 @@ void Application::DismissAlert() {
 }

 void Application::PlaySound(const std::string_view& sound) {
-    auto codec = Board::GetInstance().GetAudioCodec();
-    codec->EnableOutput(true);
-    SetDecodeSampleRate(16000);
+    // The assets are encoded at 16000Hz, 60ms frame duration
+    SetDecodeSampleRate(16000, 60);
    const char* data = sound.data();
    size_t size = sound.size();
    for (const char* p = data; p < data + size; ) {
@@ -255,9 +255,7 @@ void Application::ToggleChatState() {
                return;
            }

-            keep_listening_ = true;
-            protocol_->SendStartListening(kListeningModeAutoStop);
-            SetDeviceState(kDeviceStateListening);
+            SetListeningMode(realtime_chat_enabled_ ? kListeningModeRealtime : kListeningModeAutoStop);
        });
    } else if (device_state_ == kDeviceStateSpeaking) {
        Schedule([this]() {
@@ -281,7 +279,6 @@ void Application::StartListening() {
        return;
    }
    
-    keep_listening_ = false;
    if (device_state_ == kDeviceStateIdle) {
        Schedule([this]() {
            if (!protocol_->IsAudioChannelOpened()) {
@@ -290,14 +287,13 @@ void Application::StartListening() {
                    return;
                }
            }
-            protocol_->SendStartListening(kListeningModeManualStop);
-            SetDeviceState(kDeviceStateListening);
+
+            SetListeningMode(kListeningModeManualStop);
        });
    } else if (device_state_ == kDeviceStateSpeaking) {
        Schedule([this]() {
            AbortSpeaking(kAbortReasonNone);
-            protocol_->SendStartListening(kListeningModeManualStop);
-            SetDeviceState(kDeviceStateListening);
+            SetListeningMode(kListeningModeManualStop);
        });
    }
 }
@@ -320,12 +316,12 @@ void Application::Start() {

    /* Setup the audio codec */
    auto codec = board.GetAudioCodec();
-    opus_decode_sample_rate_ = codec->output_sample_rate();
-    opus_decoder_ = std::make_unique<OpusDecoderWrapper>(opus_decode_sample_rate_, 1);
+    opus_decoder_ = std::make_unique<OpusDecoderWrapper>(codec->output_sample_rate(), 1, OPUS_FRAME_DURATION_MS);
    opus_encoder_ = std::make_unique<OpusEncoderWrapper>(16000, 1, OPUS_FRAME_DURATION_MS);
-    // For ML307 boards, we use complexity 5 to save bandwidth
-    // For other boards, we use complexity 3 to save CPU
-    if (board.GetBoardType() == "ml307") {
+    if (realtime_chat_enabled_) {
+        ESP_LOGI(TAG, "Realtime chat enabled, setting opus encoder complexity to 0");
+        opus_encoder_->SetComplexity(0);
+    } else if (board.GetBoardType() == "ml307") {
        ESP_LOGI(TAG, "ML307 board detected, setting opus encoder complexity to 5");
        opus_encoder_->SetComplexity(5);
    } else {
@@ -337,24 +333,20 @@ void Application::Start() {
        input_resampler_.Configure(codec->input_sample_rate(), 16000);
        reference_resampler_.Configure(codec->input_sample_rate(), 16000);
    }
-    codec->OnInputReady([this, codec]() {
-        BaseType_t higher_priority_task_woken = pdFALSE;
-        xEventGroupSetBitsFromISR(event_group_, AUDIO_INPUT_READY_EVENT, &higher_priority_task_woken);
-        return higher_priority_task_woken == pdTRUE;
-    });
-    codec->OnOutputReady([this]() {
-        BaseType_t higher_priority_task_woken = pdFALSE;
-        xEventGroupSetBitsFromISR(event_group_, AUDIO_OUTPUT_READY_EVENT, &higher_priority_task_woken);
-        return higher_priority_task_woken == pdTRUE;
-    });
    codec->Start();

+    xTaskCreatePinnedToCore([](void* arg) {
+        Application* app = (Application*)arg;
+        app->AudioLoop();
+        vTaskDelete(NULL);
+    }, "audio_loop", 4096 * 2, this, 8, &audio_loop_task_handle_, realtime_chat_enabled_ ? 1 : 0);
+
    /* Start the main loop */
-    xTaskCreate([](void* arg) {
+    xTaskCreatePinnedToCore([](void* arg) {
        Application* app = (Application*)arg;
        app->MainLoop();
        vTaskDelete(NULL);
-    }, "main_loop", 4096 * 2, this, 4, nullptr);
+    }, "main_loop", 4096, this, 4, &main_loop_task_handle_, 0);

    /* Wait for the network to be ready */
    board.StartNetwork();
@@ -372,9 +364,7 @@ void Application::Start() {
    });
    protocol_->OnIncomingAudio([this](std::vector<uint8_t>&& data) {
        std::lock_guard<std::mutex> lock(mutex_);
-        if (device_state_ == kDeviceStateSpeaking) {
-            audio_decode_queue_.emplace_back(std::move(data));
-        }
+        audio_decode_queue_.emplace_back(std::move(data));
    });
    protocol_->OnAudioChannelOpened([this, codec, &board]() {
        board.SetPowerSaveMode(false);
@@ -382,7 +372,7 @@ void Application::Start() {
            ESP_LOGW(TAG, "Server sample rate %d does not match device output sample rate %d, resampling may cause distortion",
                protocol_->server_sample_rate(), codec->output_sample_rate());
        }
-        SetDecodeSampleRate(protocol_->server_sample_rate());
+        SetDecodeSampleRate(protocol_->server_sample_rate(), protocol_->server_frame_duration());
        auto& thing_manager = iot::ThingManager::GetInstance();
        protocol_->SendIotDescriptors(thing_manager.GetDescriptorsJson());
        std::string states;
@@ -412,13 +402,12 @@ void Application::Start() {
                });
            } else if (strcmp(state->valuestring, "stop") == 0) {
                Schedule([this]() {
+                    background_task_->WaitForCompletion();
                    if (device_state_ == kDeviceStateSpeaking) {
-                        background_task_->WaitForCompletion();
-                        if (keep_listening_) {
-                            protocol_->SendStartListening(kListeningModeAutoStop);
-                            SetDeviceState(kDeviceStateListening);
-                        } else {
+                        if (listening_mode_ == kListeningModeManualStop) {
                            SetDeviceState(kDeviceStateIdle);
+                        } else {
+                            SetDeviceState(kDeviceStateListening);
                        }
                    }
                });
@@ -474,7 +463,7 @@ void Application::Start() {
    }, "check_new_version", 4096 * 2, this, 2, nullptr);

 #if CONFIG_USE_AUDIO_PROCESSOR
-    audio_processor_.Initialize(codec->input_channels(), codec->input_reference());
+    audio_processor_.Initialize(codec, realtime_chat_enabled_);
    audio_processor_.OnOutput([this](std::vector<int16_t>&& data) {
        background_task_->Schedule([this, data = std::move(data)]() mutable {
            opus_encoder_->Encode(std::move(data), [this](std::vector<uint8_t>&& opus) {
@@ -500,7 +489,7 @@ void Application::Start() {
 #endif

 #if CONFIG_USE_WAKE_WORD_DETECT
-    wake_word_detect_.Initialize(codec->input_channels(), codec->input_reference());
+    wake_word_detect_.Initialize(codec);
    wake_word_detect_.OnWakeWordDetected([this](const std::string& wake_word) {
        Schedule([this, &wake_word]() {
            if (device_state_ == kDeviceStateIdle) {
@@ -520,8 +509,7 @@ void Application::Start() {
                // Set the chat state to wake word detected
                protocol_->SendWakeWordDetected(wake_word);
                ESP_LOGI(TAG, "Wake word detected: %s", wake_word.c_str());
-                keep_listening_ = true;
-                SetDeviceState(kDeviceStateIdle);
+                SetListeningMode(realtime_chat_enabled_ ? kListeningModeRealtime : kListeningModeAutoStop);
            } else if (device_state_ == kDeviceStateSpeaking) {
                AbortSpeaking(kAbortReasonWakeWordDetected);
            } else if (device_state_ == kDeviceStateActivating) {
@@ -534,6 +522,13 @@ void Application::Start() {

    SetDeviceState(kDeviceStateIdle);
    esp_timer_start_periodic(clock_timer_handle_, 1000000);
+
+#if 0
+    while (true) {
+        SystemInfo::PrintRealTimeStats(pdMS_TO_TICKS(1000));
+        vTaskDelay(pdMS_TO_TICKS(10000));
+    }
+#endif
 }

 void Application::OnClockTimer() {
@@ -541,7 +536,6 @@ void Application::OnClockTimer() {

    // Print the debug info every 10 seconds
    if (clock_ticks_ % 10 == 0) {
-        // SystemInfo::PrintRealTimeStats(pdMS_TO_TICKS(1000));
        int free_sram = heap_caps_get_free_size(MALLOC_CAP_INTERNAL);
        int min_free_sram = heap_caps_get_minimum_free_size(MALLOC_CAP_INTERNAL);
        ESP_LOGI(TAG, "Free internal: %u minimal internal: %u", free_sram, min_free_sram);
@@ -561,6 +555,7 @@ void Application::OnClockTimer() {
    }
 }

+// Add a async task to MainLoop
 void Application::Schedule(std::function<void()> callback) {
    {
        std::lock_guard<std::mutex> lock(mutex_);
@@ -574,16 +569,8 @@ void Application::Schedule(std::function<void()> callback) {
 // they should use Schedule to call this function
 void Application::MainLoop() {
    while (true) {
-        auto bits = xEventGroupWaitBits(event_group_,
-            SCHEDULE_EVENT | AUDIO_INPUT_READY_EVENT | AUDIO_OUTPUT_READY_EVENT,
-            pdTRUE, pdFALSE, portMAX_DELAY);
+        auto bits = xEventGroupWaitBits(event_group_, SCHEDULE_EVENT, pdTRUE, pdFALSE, portMAX_DELAY);

-        if (bits & AUDIO_INPUT_READY_EVENT) {
-            InputAudio();
-        }
-        if (bits & AUDIO_OUTPUT_READY_EVENT) {
-            OutputAudio();
-        }
        if (bits & SCHEDULE_EVENT) {
            std::unique_lock<std::mutex> lock(mutex_);
            std::list<std::function<void()>> tasks = std::move(main_tasks_);
@@ -595,14 +582,18 @@ void Application::MainLoop() {
    }
 }

-void Application::ResetDecoder() {
-    std::lock_guard<std::mutex> lock(mutex_);
-    opus_decoder_->ResetState();
-    audio_decode_queue_.clear();
-    last_output_time_ = std::chrono::steady_clock::now();
+// The Audio Loop is used to input and output audio data
+void Application::AudioLoop() {
+    auto codec = Board::GetInstance().GetAudioCodec();
+    while (true) {
+        OnAudioInput();
+        if (codec->output_enabled()) {
+            OnAudioOutput();
+        }
+    }
 }

-void Application::OutputAudio() {
+void Application::OnAudioOutput() {
    auto now = std::chrono::steady_clock::now();
    auto codec = Board::GetInstance().GetAudioCodec();
    const int max_silence_seconds = 10;
@@ -624,7 +615,6 @@ void Application::OutputAudio() {
        return;
    }

-    last_output_time_ = now;
    auto opus = std::move(audio_decode_queue_.front());
    audio_decode_queue_.pop_front();
    lock.unlock();
@@ -638,27 +628,57 @@ void Application::OutputAudio() {
        if (!opus_decoder_->Decode(std::move(opus), pcm)) {
            return;
        }
-
        // Resample if the sample rate is different
-        if (opus_decode_sample_rate_ != codec->output_sample_rate()) {
+        if (opus_decoder_->sample_rate() != codec->output_sample_rate()) {
            int target_size = output_resampler_.GetOutputSamples(pcm.size());
            std::vector<int16_t> resampled(target_size);
            output_resampler_.Process(pcm.data(), pcm.size(), resampled.data());
            pcm = std::move(resampled);
        }
-        
        codec->OutputData(pcm);
+        last_output_time_ = std::chrono::steady_clock::now();
    });
 }

-void Application::InputAudio() {
-    auto codec = Board::GetInstance().GetAudioCodec();
+void Application::OnAudioInput() {
    std::vector<int16_t> data;
-    if (!codec->InputData(data)) {
+
+#if CONFIG_USE_WAKE_WORD_DETECT
+    if (wake_word_detect_.IsDetectionRunning()) {
+        ReadAudio(data, 16000, wake_word_detect_.GetFeedSize());
+        wake_word_detect_.Feed(data);
        return;
    }
+#endif
+#if CONFIG_USE_AUDIO_PROCESSOR
+    if (audio_processor_.IsRunning()) {
+        ReadAudio(data, 16000, audio_processor_.GetFeedSize());
+        audio_processor_.Feed(data);
+        return;
+    }
+#else
+    if (device_state_ == kDeviceStateListening) {
+        ReadAudio(data, 16000, 30 * 16000 / 1000);
+        background_task_->Schedule([this, data = std::move(data)]() mutable {
+            opus_encoder_->Encode(std::move(data), [this](std::vector<uint8_t>&& opus) {
+                Schedule([this, opus = std::move(opus)]() {
+                    protocol_->SendAudio(opus);
+                });
+            });
+        });
+        return;
+    }
+#endif
+    vTaskDelay(pdMS_TO_TICKS(30));
+}

-    if (codec->input_sample_rate() != 16000) {
+void Application::ReadAudio(std::vector<int16_t>& data, int sample_rate, int samples) {
+    auto codec = Board::GetInstance().GetAudioCodec();
+    if (codec->input_sample_rate() != sample_rate) {
+        data.resize(samples * codec->input_sample_rate() / sample_rate);
+        if (!codec->InputData(data)) {
+            return;
+        }
        if (codec->input_channels() == 2) {
            auto mic_channel = std::vector<int16_t>(data.size() / 2);
            auto reference_channel = std::vector<int16_t>(data.size() / 2);
@@ -680,28 +700,12 @@ void Application::InputAudio() {
            input_resampler_.Process(data.data(), data.size(), resampled.data());
            data = std::move(resampled);
        }
+    } else {
+        data.resize(samples);
+        if (!codec->InputData(data)) {
+            return;
+        }
    }
-
-#if CONFIG_USE_WAKE_WORD_DETECT
-    if (wake_word_detect_.IsDetectionRunning()) {
-        wake_word_detect_.Feed(data);
-    }
-#endif
-#if CONFIG_USE_AUDIO_PROCESSOR
-    if (audio_processor_.IsRunning()) {
-        audio_processor_.Input(data);
-    }
-#else
-    if (device_state_ == kDeviceStateListening) {
-        background_task_->Schedule([this, data = std::move(data)]() mutable {
-            opus_encoder_->Encode(std::move(data), [this](std::vector<uint8_t>&& opus) {
-                Schedule([this, opus = std::move(opus)]() {
-                    protocol_->SendAudio(opus);
-                });
-            });
-        });
-    }
-#endif
 }

 void Application::AbortSpeaking(AbortReason reason) {
@@ -710,6 +714,11 @@ void Application::AbortSpeaking(AbortReason reason) {
    protocol_->SendAbortSpeaking(reason);
 }

+void Application::SetListeningMode(ListeningMode mode) {
+    listening_mode_ = mode;
+    SetDeviceState(kDeviceStateListening);
+}
+
 void Application::SetDeviceState(DeviceState state) {
    if (device_state_ == state) {
        return;
@@ -723,7 +732,6 @@ void Application::SetDeviceState(DeviceState state) {
    background_task_->WaitForCompletion();

    auto& board = Board::GetInstance();
-    auto codec = board.GetAudioCodec();
    auto display = board.GetDisplay();
    auto led = board.GetLed();
    led->OnStateChanged();
@@ -747,30 +755,43 @@ void Application::SetDeviceState(DeviceState state) {
        case kDeviceStateListening:
            display->SetStatus(Lang::Strings::LISTENING);
            display->SetEmotion("neutral");
-            ResetDecoder();
-            opus_encoder_->ResetState();
-#if CONFIG_USE_AUDIO_PROCESSOR
-            audio_processor_.Start();
-#endif
-#if CONFIG_USE_WAKE_WORD_DETECT
-            wake_word_detect_.StopDetection();
-#endif
+
+            // Update the IoT states before sending the start listening command
            UpdateIotStates();
-            if (previous_state == kDeviceStateSpeaking) {
-                // FIXME: Wait for the speaker to empty the buffer
-                vTaskDelay(pdMS_TO_TICKS(120));
+
+            // Make sure the audio processor is running
+#if CONFIG_USE_AUDIO_PROCESSOR
+            if (!audio_processor_.IsRunning()) {
+#else
+            if (true) {
+#endif
+                // Send the start listening command
+                protocol_->SendStartListening(listening_mode_);
+                if (listening_mode_ == kListeningModeAutoStop && previous_state == kDeviceStateSpeaking) {
+                    // FIXME: Wait for the speaker to empty the buffer
+                    vTaskDelay(pdMS_TO_TICKS(120));
+                }
+                opus_encoder_->ResetState();
+#if CONFIG_USE_WAKE_WORD_DETECT
+                wake_word_detect_.StopDetection();
+#endif
+#if CONFIG_USE_AUDIO_PROCESSOR
+                audio_processor_.Start();
+#endif
            }
            break;
        case kDeviceStateSpeaking:
            display->SetStatus(Lang::Strings::SPEAKING);
-            ResetDecoder();
-            codec->EnableOutput(true);
+
+            if (listening_mode_ != kListeningModeRealtime) {
 #if CONFIG_USE_AUDIO_PROCESSOR
-            audio_processor_.Stop();
+                audio_processor_.Stop();
 #endif
 #if CONFIG_USE_WAKE_WORD_DETECT
-            wake_word_detect_.StartDetection();
+                wake_word_detect_.StartDetection();
 #endif
+            }
+            ResetDecoder();
            break;
        default:
            // Do nothing
@@ -778,19 +799,28 @@ void Application::SetDeviceState(DeviceState state) {
    }
 }

-void Application::SetDecodeSampleRate(int sample_rate) {
-    if (opus_decode_sample_rate_ == sample_rate) {
+void Application::ResetDecoder() {
+    std::lock_guard<std::mutex> lock(mutex_);
+    opus_decoder_->ResetState();
+    audio_decode_queue_.clear();
+    last_output_time_ = std::chrono::steady_clock::now();
+    
+    auto codec = Board::GetInstance().GetAudioCodec();
+    codec->EnableOutput(true);
+}
+
+void Application::SetDecodeSampleRate(int sample_rate, int frame_duration) {
+    if (opus_decoder_->sample_rate() == sample_rate && opus_decoder_->duration_ms() == frame_duration) {
        return;
    }

-    opus_decode_sample_rate_ = sample_rate;
    opus_decoder_.reset();
-    opus_decoder_ = std::make_unique<OpusDecoderWrapper>(opus_decode_sample_rate_, 1);
+    opus_decoder_ = std::make_unique<OpusDecoderWrapper>(sample_rate, 1, frame_duration);

    auto codec = Board::GetInstance().GetAudioCodec();
-    if (opus_decode_sample_rate_ != codec->output_sample_rate()) {
-        ESP_LOGI(TAG, "Resampling audio from %d to %d", opus_decode_sample_rate_, codec->output_sample_rate());
-        output_resampler_.Configure(opus_decode_sample_rate_, codec->output_sample_rate());
+    if (opus_decoder_->sample_rate() != codec->output_sample_rate()) {
+        ESP_LOGI(TAG, "Resampling audio from %d to %d", opus_decoder_->sample_rate(), codec->output_sample_rate());
+        output_resampler_.Configure(opus_decoder_->sample_rate(), codec->output_sample_rate());
    }
 }