forked from xiaozhi/xiaozhi-esp32
Audio stream packet with timestamp
This commit is contained in:
@@ -121,24 +121,24 @@ bool MqttProtocol::SendText(const std::string& text) {
|
||||
return true;
|
||||
}
|
||||
|
||||
void MqttProtocol::SendAudio(const std::vector<uint8_t>& data) {
|
||||
void MqttProtocol::SendAudio(const AudioStreamPacket& packet) {
|
||||
std::lock_guard<std::mutex> lock(channel_mutex_);
|
||||
if (udp_ == nullptr) {
|
||||
return;
|
||||
}
|
||||
|
||||
std::string nonce(aes_nonce_);
|
||||
*(uint16_t*)&nonce[2] = htons(data.size());
|
||||
*(uint16_t*)&nonce[2] = htons(packet.payload.size());
|
||||
*(uint32_t*)&nonce[12] = htonl(++local_sequence_);
|
||||
|
||||
std::string encrypted;
|
||||
encrypted.resize(aes_nonce_.size() + data.size());
|
||||
encrypted.resize(aes_nonce_.size() + packet.payload.size());
|
||||
memcpy(encrypted.data(), nonce.data(), nonce.size());
|
||||
|
||||
size_t nc_off = 0;
|
||||
uint8_t stream_block[16] = {0};
|
||||
if (mbedtls_aes_crypt_ctr(&aes_ctx_, data.size(), &nc_off, (uint8_t*)nonce.c_str(), stream_block,
|
||||
(uint8_t*)data.data(), (uint8_t*)&encrypted[nonce.size()]) != 0) {
|
||||
if (mbedtls_aes_crypt_ctr(&aes_ctx_, packet.payload.size(), &nc_off, (uint8_t*)nonce.c_str(), stream_block,
|
||||
(uint8_t*)packet.payload.data(), (uint8_t*)&encrypted[nonce.size()]) != 0) {
|
||||
ESP_LOGE(TAG, "Failed to encrypt audio data");
|
||||
return;
|
||||
}
|
||||
@@ -229,20 +229,20 @@ bool MqttProtocol::OpenAudioChannel() {
|
||||
ESP_LOGW(TAG, "Received audio packet with wrong sequence: %lu, expected: %lu", sequence, remote_sequence_ + 1);
|
||||
}
|
||||
|
||||
std::vector<uint8_t> decrypted;
|
||||
size_t decrypted_size = data.size() - aes_nonce_.size();
|
||||
size_t nc_off = 0;
|
||||
uint8_t stream_block[16] = {0};
|
||||
decrypted.resize(decrypted_size);
|
||||
auto nonce = (uint8_t*)data.data();
|
||||
auto encrypted = (uint8_t*)data.data() + aes_nonce_.size();
|
||||
int ret = mbedtls_aes_crypt_ctr(&aes_ctx_, decrypted_size, &nc_off, nonce, stream_block, encrypted, (uint8_t*)decrypted.data());
|
||||
AudioStreamPacket packet;
|
||||
packet.payload.resize(decrypted_size);
|
||||
int ret = mbedtls_aes_crypt_ctr(&aes_ctx_, decrypted_size, &nc_off, nonce, stream_block, encrypted, (uint8_t*)packet.payload.data());
|
||||
if (ret != 0) {
|
||||
ESP_LOGE(TAG, "Failed to decrypt audio data, ret: %d", ret);
|
||||
return;
|
||||
}
|
||||
if (on_incoming_audio_ != nullptr) {
|
||||
on_incoming_audio_(std::move(decrypted));
|
||||
on_incoming_audio_(std::move(packet));
|
||||
}
|
||||
remote_sequence_ = sequence;
|
||||
last_incoming_time_ = std::chrono::steady_clock::now();
|
||||
|
||||
@@ -26,7 +26,7 @@ public:
|
||||
~MqttProtocol();
|
||||
|
||||
bool Start() override;
|
||||
void SendAudio(const std::vector<uint8_t>& data) override;
|
||||
void SendAudio(const AudioStreamPacket& packet) override;
|
||||
bool OpenAudioChannel() override;
|
||||
void CloseAudioChannel() override;
|
||||
bool IsAudioChannelOpened() const override;
|
||||
|
||||
@@ -8,7 +8,7 @@ void Protocol::OnIncomingJson(std::function<void(const cJSON* root)> callback) {
|
||||
on_incoming_json_ = callback;
|
||||
}
|
||||
|
||||
void Protocol::OnIncomingAudio(std::function<void(std::vector<uint8_t>&& data)> callback) {
|
||||
void Protocol::OnIncomingAudio(std::function<void(AudioStreamPacket&& packet)> callback) {
|
||||
on_incoming_audio_ = callback;
|
||||
}
|
||||
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
#include <vector>
|
||||
|
||||
struct AudioStreamPacket {
|
||||
uint32_t timestamp;
|
||||
uint32_t timestamp = 0;
|
||||
std::vector<uint8_t> payload;
|
||||
};
|
||||
|
||||
@@ -53,7 +53,7 @@ public:
|
||||
return session_id_;
|
||||
}
|
||||
|
||||
void OnIncomingAudio(std::function<void(std::vector<uint8_t>&& data)> callback);
|
||||
void OnIncomingAudio(std::function<void(AudioStreamPacket&& packet)> callback);
|
||||
void OnIncomingJson(std::function<void(const cJSON* root)> callback);
|
||||
void OnAudioChannelOpened(std::function<void()> callback);
|
||||
void OnAudioChannelClosed(std::function<void()> callback);
|
||||
@@ -64,7 +64,7 @@ public:
|
||||
virtual void CloseAudioChannel() = 0;
|
||||
virtual bool IsAudioChannelOpened() const = 0;
|
||||
virtual bool IsAudioChannelBusy() const;
|
||||
virtual void SendAudio(const std::vector<uint8_t>& data) = 0;
|
||||
virtual void SendAudio(const AudioStreamPacket& packet) = 0;
|
||||
virtual void SendWakeWordDetected(const std::string& wake_word);
|
||||
virtual void SendStartListening(ListeningMode mode);
|
||||
virtual void SendStopListening();
|
||||
@@ -74,7 +74,7 @@ public:
|
||||
|
||||
protected:
|
||||
std::function<void(const cJSON* root)> on_incoming_json_;
|
||||
std::function<void(std::vector<uint8_t>&& data)> on_incoming_audio_;
|
||||
std::function<void(AudioStreamPacket&& packet)> on_incoming_audio_;
|
||||
std::function<void()> on_audio_channel_opened_;
|
||||
std::function<void()> on_audio_channel_closed_;
|
||||
std::function<void(const std::string& message)> on_network_error_;
|
||||
|
||||
@@ -28,40 +28,40 @@ bool WebsocketProtocol::Start() {
|
||||
return true;
|
||||
}
|
||||
|
||||
void WebsocketProtocol::SendAudio(const std::vector<uint8_t>& data) {
|
||||
void WebsocketProtocol::SendAudio(const AudioStreamPacket& packet) {
|
||||
if (websocket_ == nullptr) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (version_ == 2) {
|
||||
std::string packet;
|
||||
packet.resize(sizeof(BinaryProtocol2) + data.size());
|
||||
auto bp2 = (BinaryProtocol2*)packet.data();
|
||||
std::string serialized;
|
||||
serialized.resize(sizeof(BinaryProtocol2) + packet.payload.size());
|
||||
auto bp2 = (BinaryProtocol2*)serialized.data();
|
||||
bp2->version = htons(version_);
|
||||
bp2->type = 0;
|
||||
bp2->reserved = 0;
|
||||
bp2->timestamp = htonl(0);
|
||||
bp2->payload_size = htonl(data.size());
|
||||
memcpy(bp2->payload, data.data(), data.size());
|
||||
bp2->timestamp = htonl(packet.timestamp);
|
||||
bp2->payload_size = htonl(packet.payload.size());
|
||||
memcpy(bp2->payload, packet.payload.data(), packet.payload.size());
|
||||
|
||||
busy_sending_audio_ = true;
|
||||
websocket_->Send(packet.data(), packet.size(), true);
|
||||
websocket_->Send(serialized.data(), serialized.size(), true);
|
||||
busy_sending_audio_ = false;
|
||||
} else if (version_ == 3) {
|
||||
std::string packet;
|
||||
packet.resize(sizeof(BinaryProtocol3) + data.size());
|
||||
auto bp3 = (BinaryProtocol3*)packet.data();
|
||||
std::string serialized;
|
||||
serialized.resize(sizeof(BinaryProtocol3) + packet.payload.size());
|
||||
auto bp3 = (BinaryProtocol3*)serialized.data();
|
||||
bp3->type = 0;
|
||||
bp3->reserved = 0;
|
||||
bp3->payload_size = htons(data.size());
|
||||
memcpy(bp3->payload, data.data(), data.size());
|
||||
bp3->payload_size = htons(packet.payload.size());
|
||||
memcpy(bp3->payload, packet.payload.data(), packet.payload.size());
|
||||
|
||||
busy_sending_audio_ = true;
|
||||
websocket_->Send(packet.data(), packet.size(), true);
|
||||
websocket_->Send(serialized.data(), serialized.size(), true);
|
||||
busy_sending_audio_ = false;
|
||||
} else {
|
||||
busy_sending_audio_ = true;
|
||||
websocket_->Send(data.data(), data.size(), true);
|
||||
websocket_->Send(packet.payload.data(), packet.payload.size(), true);
|
||||
busy_sending_audio_ = false;
|
||||
}
|
||||
}
|
||||
@@ -130,15 +130,24 @@ bool WebsocketProtocol::OpenAudioChannel() {
|
||||
bp2->timestamp = ntohl(bp2->timestamp);
|
||||
bp2->payload_size = ntohl(bp2->payload_size);
|
||||
auto payload = (uint8_t*)bp2->payload;
|
||||
on_incoming_audio_(std::vector<uint8_t>(payload, payload + bp2->payload_size));
|
||||
on_incoming_audio_(AudioStreamPacket{
|
||||
.timestamp = bp2->timestamp,
|
||||
.payload = std::vector<uint8_t>(payload, payload + bp2->payload_size)
|
||||
});
|
||||
} else if (version_ == 3) {
|
||||
BinaryProtocol3* bp3 = (BinaryProtocol3*)data;
|
||||
bp3->type = bp3->type;
|
||||
bp3->payload_size = ntohs(bp3->payload_size);
|
||||
auto payload = (uint8_t*)bp3->payload;
|
||||
on_incoming_audio_(std::vector<uint8_t>(payload, payload + bp3->payload_size));
|
||||
on_incoming_audio_(AudioStreamPacket{
|
||||
.timestamp = 0,
|
||||
.payload = std::vector<uint8_t>(payload, payload + bp3->payload_size)
|
||||
});
|
||||
} else {
|
||||
on_incoming_audio_(std::vector<uint8_t>((uint8_t*)data, (uint8_t*)data + len));
|
||||
on_incoming_audio_(AudioStreamPacket{
|
||||
.timestamp = 0,
|
||||
.payload = std::vector<uint8_t>((uint8_t*)data, (uint8_t*)data + len)
|
||||
});
|
||||
}
|
||||
}
|
||||
} else {
|
||||
|
||||
@@ -16,7 +16,7 @@ public:
|
||||
~WebsocketProtocol();
|
||||
|
||||
bool Start() override;
|
||||
void SendAudio(const std::vector<uint8_t>& data) override;
|
||||
void SendAudio(const AudioStreamPacket& packet) override;
|
||||
bool OpenAudioChannel() override;
|
||||
void CloseAudioChannel() override;
|
||||
bool IsAudioChannelOpened() const override;
|
||||
|
||||
Reference in New Issue
Block a user