v1.8.0: Audio 代码重构与低功耗优化 (#943)

* Refactor audio code

* Remove old IoT implementation

* Add MQTT-UDP documentation

* OTA升级失败时,可以继续使用
This commit is contained in:
Xiaoxia
2025-07-19 22:45:22 +08:00
committed by GitHub
parent 0621578f55
commit 3c71558a5f
173 changed files with 2099 additions and 3265 deletions

View File

@@ -40,7 +40,7 @@ bool MqttProtocol::StartMqttClient(bool report_error) {
auto client_id = settings.GetString("client_id");
auto username = settings.GetString("username");
auto password = settings.GetString("password");
int keepalive_interval = settings.GetInt("keepalive", 120);
int keepalive_interval = settings.GetInt("keepalive", 240);
publish_topic_ = settings.GetString("publish_topic");
if (endpoint.empty()) {
@@ -121,25 +121,25 @@ bool MqttProtocol::SendText(const std::string& text) {
return true;
}
bool MqttProtocol::SendAudio(const AudioStreamPacket& packet) {
bool MqttProtocol::SendAudio(std::unique_ptr<AudioStreamPacket> packet) {
std::lock_guard<std::mutex> lock(channel_mutex_);
if (udp_ == nullptr) {
return false;
}
std::string nonce(aes_nonce_);
*(uint16_t*)&nonce[2] = htons(packet.payload.size());
*(uint32_t*)&nonce[8] = htonl(packet.timestamp);
*(uint16_t*)&nonce[2] = htons(packet->payload.size());
*(uint32_t*)&nonce[8] = htonl(packet->timestamp);
*(uint32_t*)&nonce[12] = htonl(++local_sequence_);
std::string encrypted;
encrypted.resize(aes_nonce_.size() + packet.payload.size());
encrypted.resize(aes_nonce_.size() + packet->payload.size());
memcpy(encrypted.data(), nonce.data(), nonce.size());
size_t nc_off = 0;
uint8_t stream_block[16] = {0};
if (mbedtls_aes_crypt_ctr(&aes_ctx_, packet.payload.size(), &nc_off, (uint8_t*)nonce.c_str(), stream_block,
(uint8_t*)packet.payload.data(), (uint8_t*)&encrypted[nonce.size()]) != 0) {
if (mbedtls_aes_crypt_ctr(&aes_ctx_, packet->payload.size(), &nc_off, (uint8_t*)nonce.c_str(), stream_block,
(uint8_t*)packet->payload.data(), (uint8_t*)&encrypted[nonce.size()]) != 0) {
ESP_LOGE(TAG, "Failed to encrypt audio data");
return false;
}
@@ -228,12 +228,12 @@ bool MqttProtocol::OpenAudioChannel() {
uint8_t stream_block[16] = {0};
auto nonce = (uint8_t*)data.data();
auto encrypted = (uint8_t*)data.data() + aes_nonce_.size();
AudioStreamPacket packet;
packet.sample_rate = server_sample_rate_;
packet.frame_duration = server_frame_duration_;
packet.timestamp = timestamp;
packet.payload.resize(decrypted_size);
int ret = mbedtls_aes_crypt_ctr(&aes_ctx_, decrypted_size, &nc_off, nonce, stream_block, encrypted, (uint8_t*)packet.payload.data());
auto packet = std::make_unique<AudioStreamPacket>();
packet->sample_rate = server_sample_rate_;
packet->frame_duration = server_frame_duration_;
packet->timestamp = timestamp;
packet->payload.resize(decrypted_size);
int ret = mbedtls_aes_crypt_ctr(&aes_ctx_, decrypted_size, &nc_off, nonce, stream_block, encrypted, (uint8_t*)packet->payload.data());
if (ret != 0) {
ESP_LOGE(TAG, "Failed to decrypt audio data, ret: %d", ret);
return;
@@ -263,9 +263,7 @@ std::string MqttProtocol::GetHelloMessage() {
#if CONFIG_USE_SERVER_AEC
cJSON_AddBoolToObject(features, "aec", true);
#endif
#if CONFIG_IOT_PROTOCOL_MCP
cJSON_AddBoolToObject(features, "mcp", true);
#endif
cJSON_AddItemToObject(root, "features", features);
cJSON* audio_params = cJSON_CreateObject();
cJSON_AddStringToObject(audio_params, "format", "opus");

View File

@@ -26,7 +26,7 @@ public:
~MqttProtocol();
bool Start() override;
bool SendAudio(const AudioStreamPacket& packet) override;
bool SendAudio(std::unique_ptr<AudioStreamPacket> packet) override;
bool OpenAudioChannel() override;
void CloseAudioChannel() override;
bool IsAudioChannelOpened() const override;

View File

@@ -8,7 +8,7 @@ void Protocol::OnIncomingJson(std::function<void(const cJSON* root)> callback) {
on_incoming_json_ = callback;
}
void Protocol::OnIncomingAudio(std::function<void(AudioStreamPacket&& packet)> callback) {
void Protocol::OnIncomingAudio(std::function<void(std::unique_ptr<AudioStreamPacket> packet)> callback) {
on_incoming_audio_ = callback;
}
@@ -65,56 +65,6 @@ void Protocol::SendStopListening() {
SendText(message);
}
void Protocol::SendIotDescriptors(const std::string& descriptors) {
cJSON* root = cJSON_Parse(descriptors.c_str());
if (root == nullptr) {
ESP_LOGE(TAG, "Failed to parse IoT descriptors: %s", descriptors.c_str());
return;
}
if (!cJSON_IsArray(root)) {
ESP_LOGE(TAG, "IoT descriptors should be an array");
cJSON_Delete(root);
return;
}
int arraySize = cJSON_GetArraySize(root);
for (int i = 0; i < arraySize; ++i) {
cJSON* descriptor = cJSON_GetArrayItem(root, i);
if (descriptor == nullptr) {
ESP_LOGE(TAG, "Failed to get IoT descriptor at index %d", i);
continue;
}
cJSON* messageRoot = cJSON_CreateObject();
cJSON_AddStringToObject(messageRoot, "session_id", session_id_.c_str());
cJSON_AddStringToObject(messageRoot, "type", "iot");
cJSON_AddBoolToObject(messageRoot, "update", true);
cJSON* descriptorArray = cJSON_CreateArray();
cJSON_AddItemToArray(descriptorArray, cJSON_Duplicate(descriptor, 1));
cJSON_AddItemToObject(messageRoot, "descriptors", descriptorArray);
char* message = cJSON_PrintUnformatted(messageRoot);
if (message == nullptr) {
ESP_LOGE(TAG, "Failed to print JSON message for IoT descriptor at index %d", i);
cJSON_Delete(messageRoot);
continue;
}
SendText(std::string(message));
cJSON_free(message);
cJSON_Delete(messageRoot);
}
cJSON_Delete(root);
}
// Wraps `states` in an "iot" update message tagged with the current session id
// and sends it through SendText(). `states` (like session_id_) is spliced into
// the message verbatim with no escaping, so the result is only well-formed
// JSON if `states` already is.
void Protocol::SendIotStates(const std::string& states) {
    std::string message = "{\"session_id\":\"";
    message += session_id_;
    message += "\",\"type\":\"iot\",\"update\":true,\"states\":";
    message += states;
    message += "}";
    SendText(message);
}
void Protocol::SendMcpMessage(const std::string& payload) {
std::string message = "{\"session_id\":\"" + session_id_ + "\",\"type\":\"mcp\",\"payload\":" + payload + "}";
SendText(message);

View File

@@ -55,7 +55,7 @@ public:
return session_id_;
}
void OnIncomingAudio(std::function<void(AudioStreamPacket&& packet)> callback);
void OnIncomingAudio(std::function<void(std::unique_ptr<AudioStreamPacket> packet)> callback);
void OnIncomingJson(std::function<void(const cJSON* root)> callback);
void OnAudioChannelOpened(std::function<void()> callback);
void OnAudioChannelClosed(std::function<void()> callback);
@@ -65,18 +65,16 @@ public:
virtual bool OpenAudioChannel() = 0;
virtual void CloseAudioChannel() = 0;
virtual bool IsAudioChannelOpened() const = 0;
virtual bool SendAudio(const AudioStreamPacket& packet) = 0;
virtual bool SendAudio(std::unique_ptr<AudioStreamPacket> packet) = 0;
virtual void SendWakeWordDetected(const std::string& wake_word);
virtual void SendStartListening(ListeningMode mode);
virtual void SendStopListening();
virtual void SendAbortSpeaking(AbortReason reason);
virtual void SendIotDescriptors(const std::string& descriptors);
virtual void SendIotStates(const std::string& states);
virtual void SendMcpMessage(const std::string& message);
protected:
std::function<void(const cJSON* root)> on_incoming_json_;
std::function<void(AudioStreamPacket&& packet)> on_incoming_audio_;
std::function<void(std::unique_ptr<AudioStreamPacket> packet)> on_incoming_audio_;
std::function<void()> on_audio_channel_opened_;
std::function<void()> on_audio_channel_closed_;
std::function<void(const std::string& message)> on_network_error_;

View File

@@ -28,35 +28,35 @@ bool WebsocketProtocol::Start() {
return true;
}
bool WebsocketProtocol::SendAudio(const AudioStreamPacket& packet) {
bool WebsocketProtocol::SendAudio(std::unique_ptr<AudioStreamPacket> packet) {
if (websocket_ == nullptr || !websocket_->IsConnected()) {
return false;
}
if (version_ == 2) {
std::string serialized;
serialized.resize(sizeof(BinaryProtocol2) + packet.payload.size());
serialized.resize(sizeof(BinaryProtocol2) + packet->payload.size());
auto bp2 = (BinaryProtocol2*)serialized.data();
bp2->version = htons(version_);
bp2->type = 0;
bp2->reserved = 0;
bp2->timestamp = htonl(packet.timestamp);
bp2->payload_size = htonl(packet.payload.size());
memcpy(bp2->payload, packet.payload.data(), packet.payload.size());
bp2->timestamp = htonl(packet->timestamp);
bp2->payload_size = htonl(packet->payload.size());
memcpy(bp2->payload, packet->payload.data(), packet->payload.size());
return websocket_->Send(serialized.data(), serialized.size(), true);
} else if (version_ == 3) {
std::string serialized;
serialized.resize(sizeof(BinaryProtocol3) + packet.payload.size());
serialized.resize(sizeof(BinaryProtocol3) + packet->payload.size());
auto bp3 = (BinaryProtocol3*)serialized.data();
bp3->type = 0;
bp3->reserved = 0;
bp3->payload_size = htons(packet.payload.size());
memcpy(bp3->payload, packet.payload.data(), packet.payload.size());
bp3->payload_size = htons(packet->payload.size());
memcpy(bp3->payload, packet->payload.data(), packet->payload.size());
return websocket_->Send(serialized.data(), serialized.size(), true);
} else {
return websocket_->Send(packet.payload.data(), packet.payload.size(), true);
return websocket_->Send(packet->payload.data(), packet->payload.size(), true);
}
}
@@ -124,30 +124,30 @@ bool WebsocketProtocol::OpenAudioChannel() {
bp2->timestamp = ntohl(bp2->timestamp);
bp2->payload_size = ntohl(bp2->payload_size);
auto payload = (uint8_t*)bp2->payload;
on_incoming_audio_(AudioStreamPacket{
on_incoming_audio_(std::make_unique<AudioStreamPacket>(AudioStreamPacket{
.sample_rate = server_sample_rate_,
.frame_duration = server_frame_duration_,
.timestamp = bp2->timestamp,
.payload = std::vector<uint8_t>(payload, payload + bp2->payload_size)
});
}));
} else if (version_ == 3) {
BinaryProtocol3* bp3 = (BinaryProtocol3*)data;
bp3->type = bp3->type;
bp3->payload_size = ntohs(bp3->payload_size);
auto payload = (uint8_t*)bp3->payload;
on_incoming_audio_(AudioStreamPacket{
on_incoming_audio_(std::make_unique<AudioStreamPacket>(AudioStreamPacket{
.sample_rate = server_sample_rate_,
.frame_duration = server_frame_duration_,
.timestamp = 0,
.payload = std::vector<uint8_t>(payload, payload + bp3->payload_size)
});
}));
} else {
on_incoming_audio_(AudioStreamPacket{
on_incoming_audio_(std::make_unique<AudioStreamPacket>(AudioStreamPacket{
.sample_rate = server_sample_rate_,
.frame_duration = server_frame_duration_,
.timestamp = 0,
.payload = std::vector<uint8_t>((uint8_t*)data, (uint8_t*)data + len)
});
}));
}
}
} else {
@@ -214,9 +214,7 @@ std::string WebsocketProtocol::GetHelloMessage() {
#if CONFIG_USE_SERVER_AEC
cJSON_AddBoolToObject(features, "aec", true);
#endif
#if CONFIG_IOT_PROTOCOL_MCP
cJSON_AddBoolToObject(features, "mcp", true);
#endif
cJSON_AddItemToObject(root, "features", features);
cJSON_AddStringToObject(root, "transport", "websocket");
cJSON* audio_params = cJSON_CreateObject();

View File

@@ -16,7 +16,7 @@ public:
~WebsocketProtocol();
bool Start() override;
bool SendAudio(const AudioStreamPacket& packet) override;
bool SendAudio(std::unique_ptr<AudioStreamPacket> packet) override;
bool OpenAudioChannel() override;
void CloseAudioChannel() override;
bool IsAudioChannelOpened() const override;