xiaozhi-esp32/main/boards/common/esp32_camera.cc
#include "esp32_camera.h"
#include <fcntl.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/param.h>
#include <unistd.h>
#include "board.h"
#include "display.h"
#include "esp_video_device.h"
#include "esp_video_init.h"
#include "jpg/image_to_jpeg.h"
#include "linux/videodev2.h"
#include "lvgl_display.h"
#include "mcp_server.h"
#include "system_info.h"
#ifdef CONFIG_XIAOZHI_ENABLE_CAMERA_DEBUG_MODE
#undef LOG_LOCAL_LEVEL
#define LOG_LOCAL_LEVEL MAX(CONFIG_LOG_DEFAULT_LEVEL, ESP_LOG_DEBUG)
#endif // CONFIG_XIAOZHI_ENABLE_CAMERA_DEBUG_MODE
#include <errno.h>
#include <esp_heap_caps.h>
#include <esp_log.h>
#include <cstdio>
#include <cstring>
#define TAG "Esp32Camera"
#if defined(CONFIG_CAMERA_SENSOR_SWAP_PIXEL_BYTE_ORDER) || defined(CONFIG_XIAOZHI_ENABLE_CAMERA_ENDIANNESS_SWAP)
#warning "CAMERA_SENSOR_SWAP_PIXEL_BYTE_ORDER or CONFIG_XIAOZHI_ENABLE_CAMERA_ENDIANNESS_SWAP is enabled, which may cause image corruption in YUV422 format!"
#endif
#if CONFIG_XIAOZHI_ENABLE_CAMERA_DEBUG_MODE
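// V4L2 pixel formats are FOURCC codes: four ASCII characters packed into a uint32_t
// with the first character in the least significant byte (e.g. V4L2_PIX_FMT_YUYV is
// 'Y','U','Y','V'). The macro below unpacks the four bytes for readable logging.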
#define CAM_PRINT_FOURCC(pixelformat) \
char fourcc[5]; \
fourcc[0] = pixelformat & 0xFF; \
fourcc[1] = (pixelformat >> 8) & 0xFF; \
fourcc[2] = (pixelformat >> 16) & 0xFF; \
fourcc[3] = (pixelformat >> 24) & 0xFF; \
fourcc[4] = '\0'; \
ESP_LOGD(TAG, "FOURCC: '%c%c%c%c'", fourcc[0], fourcc[1], fourcc[2], fourcc[3]);
static void log_available_video_devices() {
for (int i = 0; i < 50; i++) {
char path[16];
snprintf(path, sizeof(path), "/dev/video%d", i);
int fd = open(path, O_RDONLY);
if (fd >= 0) {
ESP_LOGD(TAG, "found video device: %s", path);
close(fd);
}
}
}
#else
#define CAM_PRINT_FOURCC(pixelformat) (void)0;
#endif // CONFIG_XIAOZHI_ENABLE_CAMERA_DEBUG_MODE
Esp32Camera::Esp32Camera(const esp_video_init_config_t& config) {
if (esp_video_init(&config) != ESP_OK) {
ESP_LOGE(TAG, "esp_video_init failed");
return;
}
#ifdef CONFIG_XIAOZHI_ENABLE_CAMERA_DEBUG_MODE
esp_log_level_set(TAG, ESP_LOG_DEBUG);
#endif // CONFIG_XIAOZHI_ENABLE_CAMERA_DEBUG_MODE
const char* video_device_name = nullptr;
if (false) { /* dummy branch so the conditionally compiled blocks below can chain with "else if" */
}
#if CONFIG_ESP_VIDEO_ENABLE_MIPI_CSI_VIDEO_DEVICE
else if (config.csi != nullptr) {
video_device_name = ESP_VIDEO_MIPI_CSI_DEVICE_NAME;
}
#endif
#if CONFIG_ESP_VIDEO_ENABLE_DVP_VIDEO_DEVICE
else if (config.dvp != nullptr) {
video_device_name = ESP_VIDEO_DVP_DEVICE_NAME;
}
#endif
#if CONFIG_ESP_VIDEO_ENABLE_HW_JPEG_VIDEO_DEVICE
else if (config.jpeg != nullptr) {
video_device_name = ESP_VIDEO_JPEG_DEVICE_NAME;
}
#endif
#if CONFIG_ESP_VIDEO_ENABLE_SPI_VIDEO_DEVICE
else if (config.spi != nullptr) {
video_device_name = ESP_VIDEO_SPI_DEVICE_NAME;
}
#endif
#if CONFIG_ESP_VIDEO_ENABLE_USB_UVC_VIDEO_DEVICE
else if (config.usb_uvc != nullptr) {
video_device_name = ESP_VIDEO_USB_UVC_DEVICE_NAME(config.usb_uvc->uvc.uvc_dev_num);
}
#endif
if (video_device_name == nullptr) {
ESP_LOGE(TAG, "no video device is enabled");
return;
}
video_fd_ = open(video_device_name, O_RDWR);
if (video_fd_ < 0) {
ESP_LOGE(TAG, "open %s failed, errno=%d(%s)", video_device_name, errno, strerror(errno));
#if CONFIG_XIAOZHI_ENABLE_CAMERA_DEBUG_MODE
log_available_video_devices();
#endif // CONFIG_XIAOZHI_ENABLE_CAMERA_DEBUG_MODE
return;
}
struct v4l2_capability cap = {};
if (ioctl(video_fd_, VIDIOC_QUERYCAP, &cap) != 0) {
ESP_LOGE(TAG, "VIDIOC_QUERYCAP failed, errno=%d(%s)", errno, strerror(errno));
close(video_fd_);
video_fd_ = -1;
return;
}
ESP_LOGD(
TAG,
"VIDIOC_QUERYCAP: driver=%s, card=%s, bus_info=%s, version=0x%08lx, capabilities=0x%08lx, device_caps=0x%08lx",
cap.driver, cap.card, cap.bus_info, cap.version, cap.capabilities, cap.device_caps);
struct v4l2_format format = {};
format.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
if (ioctl(video_fd_, VIDIOC_G_FMT, &format) != 0) {
ESP_LOGE(TAG, "VIDIOC_G_FMT failed, errno=%d(%s)", errno, strerror(errno));
close(video_fd_);
video_fd_ = -1;
return;
}
ESP_LOGD(TAG, "VIDIOC_G_FMT: pixelformat=0x%08lx, width=%ld, height=%ld", format.fmt.pix.pixelformat,
format.fmt.pix.width, format.fmt.pix.height);
CAM_PRINT_FOURCC(format.fmt.pix.pixelformat);
struct v4l2_format setformat = {};
setformat.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
setformat.fmt.pix.width = format.fmt.pix.width;
setformat.fmt.pix.height = format.fmt.pix.height;
struct v4l2_fmtdesc fmtdesc = {};
fmtdesc.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
fmtdesc.index = 0;
uint32_t best_fmt = 0;
int best_rank = 1 << 30; // large number
// Priority: YUV422P > RGB565 > RGB24 > GREY
// Note: in the current version YUV422P is actually output as YUYV; the YUYV format
// uses less memory in the downstream processing.
auto get_rank = [](uint32_t fmt) -> int {
switch (fmt) {
case V4L2_PIX_FMT_YUV422P:
return 0;
case V4L2_PIX_FMT_RGB565:
return 1;
case V4L2_PIX_FMT_RGB24:
return 2;
case V4L2_PIX_FMT_GREY:
return 3;
default:
return 1 << 29; // unsupported
}
};
while (ioctl(video_fd_, VIDIOC_ENUM_FMT, &fmtdesc) == 0) {
ESP_LOGD(TAG, "VIDIOC_ENUM_FMT: pixelformat=0x%08lx, description=%s", fmtdesc.pixelformat, fmtdesc.description);
CAM_PRINT_FOURCC(fmtdesc.pixelformat);
int rank = get_rank(fmtdesc.pixelformat);
if (rank < best_rank) {
best_rank = rank;
best_fmt = fmtdesc.pixelformat;
}
fmtdesc.index++;
}
if (best_rank < (1 << 29)) {
setformat.fmt.pix.pixelformat = best_fmt;
sensor_format_ = best_fmt;
}
if (!setformat.fmt.pix.pixelformat) {
ESP_LOGE(TAG, "no supported pixel format found");
close(video_fd_);
video_fd_ = -1;
sensor_format_ = 0;
return;
}
ESP_LOGD(TAG, "selected pixel format: 0x%08lx", setformat.fmt.pix.pixelformat);
if (ioctl(video_fd_, VIDIOC_S_FMT, &setformat) != 0) {
ESP_LOGE(TAG, "VIDIOC_S_FMT failed, errno=%d(%s)", errno, strerror(errno));
close(video_fd_);
video_fd_ = -1;
sensor_format_ = 0;
return;
}
frame_.width = setformat.fmt.pix.width;
frame_.height = setformat.fmt.pix.height;
// Request capture buffers and mmap them
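// Standard V4L2 memory-mapped streaming I/O: VIDIOC_REQBUFS asks the driver to
// allocate capture buffers, VIDIOC_QUERYBUF reports each buffer's offset and length
// so it can be mmap()ed into this process, VIDIOC_QBUF hands the buffer to the
// driver, and VIDIOC_STREAMON starts capture. Frames are later fetched with
// VIDIOC_DQBUF and returned with VIDIOC_QBUF (see Capture()).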
struct v4l2_requestbuffers req = {};
req.count = strcmp(video_device_name, ESP_VIDEO_MIPI_CSI_DEVICE_NAME) == 0 ? 2 : 1;
req.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
req.memory = V4L2_MEMORY_MMAP;
if (ioctl(video_fd_, VIDIOC_REQBUFS, &req) != 0) {
ESP_LOGE(TAG, "VIDIOC_REQBUFS failed");
close(video_fd_);
video_fd_ = -1;
sensor_format_ = 0;
return;
}
mmap_buffers_.resize(req.count);
for (uint32_t i = 0; i < req.count; i++) {
struct v4l2_buffer buf = {};
buf.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
buf.memory = V4L2_MEMORY_MMAP;
buf.index = i;
if (ioctl(video_fd_, VIDIOC_QUERYBUF, &buf) != 0) {
ESP_LOGE(TAG, "VIDIOC_QUERYBUF failed");
close(video_fd_);
video_fd_ = -1;
sensor_format_ = 0;
return;
}
void* start = mmap(NULL, buf.length, PROT_READ | PROT_WRITE, MAP_SHARED, video_fd_, buf.m.offset);
if (start == MAP_FAILED) {
ESP_LOGE(TAG, "mmap failed");
close(video_fd_);
video_fd_ = -1;
sensor_format_ = 0;
return;
}
mmap_buffers_[i].start = start;
mmap_buffers_[i].length = buf.length;
if (ioctl(video_fd_, VIDIOC_QBUF, &buf) != 0) {
ESP_LOGE(TAG, "VIDIOC_QBUF failed");
close(video_fd_);
video_fd_ = -1;
sensor_format_ = 0;
return;
}
}
int type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
if (ioctl(video_fd_, VIDIOC_STREAMON, &type) != 0) {
ESP_LOGE(TAG, "VIDIOC_STREAMON failed");
close(video_fd_);
video_fd_ = -1;
sensor_format_ = 0;
return;
}
#ifdef CONFIG_ESP_VIDEO_ENABLE_ISP_VIDEO_DEVICE
// When the ISP is enabled it needs a number of frames to initialize its parameters,
// so after stream-on a background task captures and discards frames for 5 seconds.
xTaskCreate(
[](void* arg) {
Esp32Camera* self = static_cast<Esp32Camera*>(arg);
uint16_t capture_count = 0;
TickType_t start = xTaskGetTickCount();
TickType_t duration = 5000 / portTICK_PERIOD_MS; // 5s
while ((xTaskGetTickCount() - start) < duration) {
struct v4l2_buffer buf = {};
buf.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
buf.memory = V4L2_MEMORY_MMAP;
if (ioctl(self->video_fd_, VIDIOC_DQBUF, &buf) != 0) {
ESP_LOGE(TAG, "VIDIOC_DQBUF failed during init");
vTaskDelay(10 / portTICK_PERIOD_MS);
continue;
}
if (ioctl(self->video_fd_, VIDIOC_QBUF, &buf) != 0) {
ESP_LOGE(TAG, "VIDIOC_QBUF failed during init");
}
capture_count++;
}
ESP_LOGI(TAG, "Camera init success, captured %d frames in %dms", capture_count,
(xTaskGetTickCount() - start) * portTICK_PERIOD_MS);
self->streaming_on_ = true;
vTaskDelete(NULL);
},
"CameraInitTask", 4096, this, 5, nullptr);
#else
ESP_LOGI(TAG, "Camera init success");
streaming_on_ = true;
#endif // CONFIG_ESP_VIDEO_ENABLE_ISP_VIDEO_DEVICE
}
Esp32Camera::~Esp32Camera() {
if (streaming_on_ && video_fd_ >= 0) {
int type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
ioctl(video_fd_, VIDIOC_STREAMOFF, &type);
}
for (auto& b : mmap_buffers_) {
if (b.start && b.length) {
munmap(b.start, b.length);
}
}
if (video_fd_ >= 0) {
close(video_fd_);
video_fd_ = -1;
}
sensor_format_ = 0;
esp_video_deinit();
}
void Esp32Camera::SetExplainUrl(const std::string& url, const std::string& token) {
explain_url_ = url;
explain_token_ = token;
}
bool Esp32Camera::Capture() {
if (encoder_thread_.joinable()) {
encoder_thread_.join();
}
if (!streaming_on_ || video_fd_ < 0) {
return false;
}
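// Dequeue three buffers and keep only the last one; presumably this drops frames
// that were already queued in the driver so that the saved frame is fresh.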
for (int i = 0; i < 3; i++) {
struct v4l2_buffer buf = {};
buf.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
buf.memory = V4L2_MEMORY_MMAP;
if (ioctl(video_fd_, VIDIOC_DQBUF, &buf) != 0) {
ESP_LOGE(TAG, "VIDIOC_DQBUF failed");
return false;
}
if (i == 2) {
// Save a copy of the frame into PSRAM
if (frame_.data) {
heap_caps_free(frame_.data);
frame_.data = nullptr;
frame_.format = 0;
}
frame_.len = buf.bytesused;
frame_.data = (uint8_t*)heap_caps_malloc(frame_.len, MALLOC_CAP_SPIRAM | MALLOC_CAP_8BIT);
if (!frame_.data) {
ESP_LOGE(TAG, "alloc frame copy failed");
return false;
}
ESP_LOGD(TAG, "frame.len = %d, frame.width = %d, frame.height = %d", frame_.len, frame_.width,
frame_.height);
ESP_LOG_BUFFER_HEXDUMP(TAG, frame_.data, MIN(frame_.len, 256), ESP_LOG_DEBUG);
switch (sensor_format_) {
case V4L2_PIX_FMT_RGB565:
case V4L2_PIX_FMT_RGB24:
case V4L2_PIX_FMT_YUYV:
#ifdef CONFIG_XIAOZHI_ENABLE_CAMERA_ENDIANNESS_SWAP
{
auto src16 = (uint16_t*)mmap_buffers_[buf.index].start;
auto dst16 = (uint16_t*)frame_.data;
size_t count = (size_t)mmap_buffers_[buf.index].length / 2;
for (size_t i = 0; i < count; i++) {
dst16[i] = __builtin_bswap16(src16[i]);
}
}
#else
memcpy(frame_.data, mmap_buffers_[buf.index].start, frame_.len);
#endif // CONFIG_XIAOZHI_ENABLE_CAMERA_ENDIANNESS_SWAP
frame_.format = sensor_format_;
break;
case V4L2_PIX_FMT_YUV422P: {
// This format is actually packed 4:2:2 YUYV, not planar
frame_.format = V4L2_PIX_FMT_YUYV;
#ifdef CONFIG_XIAOZHI_ENABLE_CAMERA_ENDIANNESS_SWAP
{
auto src16 = (uint16_t*)mmap_buffers_[buf.index].start;
auto dst16 = (uint16_t*)frame_.data;
size_t count = (size_t)mmap_buffers_[buf.index].length / 2;
for (size_t i = 0; i < count; i++) {
dst16[i] = __builtin_bswap16(src16[i]);
}
}
#else
memcpy(frame_.data, mmap_buffers_[buf.index].start, frame_.len);
#endif // CONFIG_XIAOZHI_ENABLE_CAMERA_ENDIANNESS_SWAP
break;
}
case V4L2_PIX_FMT_RGB565X: {
// Big-endian RGB565 has to be converted to little-endian.
// Currently esp_video reports RGB565 for either byte order and never reports RGB565X;
// this case is kept for compatibility with future versions.
auto src16 = (uint16_t*)mmap_buffers_[buf.index].start;
auto dst16 = (uint16_t*)frame_.data;
size_t pixel_count = (size_t)frame_.width * (size_t)frame_.height;
for (size_t i = 0; i < pixel_count; i++) {
dst16[i] = __builtin_bswap16(src16[i]);
}
frame_.format = V4L2_PIX_FMT_RGB565;
break;
}
default:
ESP_LOGE(TAG, "unsupported sensor format: 0x%08lx", sensor_format_);
return false;
}
}
if (ioctl(video_fd_, VIDIOC_QBUF, &buf) != 0) {
ESP_LOGE(TAG, "VIDIOC_QBUF failed");
}
}
// Show the preview image
auto display = dynamic_cast<LvglDisplay*>(Board::GetInstance().GetDisplay());
if (display != nullptr) {
if (!frame_.data) {
return false;
}
uint16_t w = frame_.width;
uint16_t h = frame_.height;
size_t lvgl_image_size = frame_.len;
size_t stride = ((w * 2) + 3) & ~3; // round the row size up to 4-byte alignment
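// e.g. w = 121 px -> 242 bytes per row -> rounded up to 244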
lv_color_format_t color_format = LV_COLOR_FORMAT_RGB565;
uint8_t* data = nullptr;
switch (frame_.format) {
case V4L2_PIX_FMT_YUYV:
// color_format = LV_COLOR_FORMAT_YUY2;
// [[fallthrough]];
// Displaying LV_COLOR_FORMAT_YUY2 seems to be broken, so convert to RGB565 for now
{
color_format = LV_COLOR_FORMAT_RGB565;
data = (uint8_t*)heap_caps_malloc(w * h * 2, MALLOC_CAP_SPIRAM | MALLOC_CAP_8BIT);
lvgl_image_size = w * h * 2;
if (data == nullptr) {
ESP_LOGE(TAG, "Failed to allocate memory for preview image");
return false;
}
const uint8_t* src = (const uint8_t*)frame_.data;
size_t src_len = frame_.len;
size_t dst_off = 0;
auto clamp = [](int v) -> uint8_t {
if (v < 0) return 0;
if (v > 255) return 255;
return (uint8_t)v;
};
// Every 4 bytes hold two pixels: Y0 U Y1 V
for (size_t i = 0; i + 3 < src_len; i += 4) {
int y0 = (int)src[i + 0];
int u = (int)src[i + 1];
int y1 = (int)src[i + 2];
int v = (int)src[i + 3];
int c0 = y0 - 16;
int c1 = y1 - 16;
int d = u - 128;
int e = v - 128;
// Commonly used integer approximation of the YCbCr-to-RGB conversion
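// These constants are the widely used fixed-point BT.601 coefficients (scaled by 256):
//   R = 1.164*(Y-16) + 1.596*(V-128)
//   G = 1.164*(Y-16) - 0.391*(U-128) - 0.813*(V-128)
//   B = 1.164*(Y-16) + 2.018*(U-128)
// i.e. 298 ~= 1.164*256, 409 ~= 1.596*256, 100 ~= 0.391*256, 208 ~= 0.813*256, 516 ~= 2.018*256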
int r0 = (298 * c0 + 409 * e + 128) >> 8;
int g0 = (298 * c0 - 100 * d - 208 * e + 128) >> 8;
int b0 = (298 * c0 + 516 * d + 128) >> 8;
int r1 = (298 * c1 + 409 * e + 128) >> 8;
int g1 = (298 * c1 - 100 * d - 208 * e + 128) >> 8;
int b1 = (298 * c1 + 516 * d + 128) >> 8;
uint8_t cr0 = clamp(r0);
uint8_t cg0 = clamp(g0);
uint8_t cb0 = clamp(b0);
uint8_t cr1 = clamp(r1);
uint8_t cg1 = clamp(g1);
uint8_t cb1 = clamp(b1);
// Pack into RGB565
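// (keep the top 5 bits of R, top 6 of G and top 5 of B: RRRRRGGGGGGBBBBB, MSB..LSB)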
uint16_t pix0 = (uint16_t)(((cr0 >> 3) << 11) | ((cg0 >> 2) << 5) | (cb0 >> 3));
uint16_t pix1 = (uint16_t)(((cr1 >> 3) << 11) | ((cg1 >> 2) << 5) | (cb1 >> 3));
// Little-endian: write the low byte first
data[dst_off++] = (uint8_t)(pix0 & 0xFF);
data[dst_off++] = (uint8_t)((pix0 >> 8) & 0xFF);
data[dst_off++] = (uint8_t)(pix1 & 0xFF);
data[dst_off++] = (uint8_t)((pix1 >> 8) & 0xFF);
}
break;
}
case V4L2_PIX_FMT_RGB565:
// The default color_format is already LV_COLOR_FORMAT_RGB565
data = (uint8_t*)heap_caps_malloc(w * h * 2, MALLOC_CAP_SPIRAM | MALLOC_CAP_8BIT);
if (data == nullptr) {
ESP_LOGE(TAG, "Failed to allocate memory for preview image");
return false;
}
memcpy(data, frame_.data, frame_.len);
lvgl_image_size = frame_.len; // covers both YUYV and RGB565 in case of fallthrough
break;
case V4L2_PIX_FMT_RGB24: {
// RGB888 has to be converted to RGB565
color_format = LV_COLOR_FORMAT_RGB565;
data = (uint8_t*)heap_caps_malloc(w * h * 2, MALLOC_CAP_SPIRAM | MALLOC_CAP_8BIT);
uint16_t* dst16 = (uint16_t*)data;
if (data == nullptr) {
ESP_LOGE(TAG, "Failed to allocate memory for preview image");
return false;
}
const uint8_t* src = frame_.data;
size_t pixel_count = (size_t)w * (size_t)h;
for (size_t i = 0; i < pixel_count; i++) {
uint8_t r = src[i * 3 + 0];
uint8_t g = src[i * 3 + 1];
uint8_t b = src[i * 3 + 2];
dst16[i] = (uint16_t)(((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3));
}
lvgl_image_size = w * h * 2;
break;
}
default:
ESP_LOGE(TAG, "unsupported frame format: 0x%08lx", frame_.format);
return false;
}
auto image = std::make_unique<LvglAllocatedImage>(data, lvgl_image_size, w, h, stride, color_format);
display->SetPreviewImage(std::move(image));
}
return true;
}
bool Esp32Camera::SetHMirror(bool enabled) {
if (video_fd_ < 0)
return false;
struct v4l2_ext_controls ctrls = {};
struct v4l2_ext_control ctrl = {};
ctrl.id = V4L2_CID_HFLIP;
ctrl.value = enabled ? 1 : 0;
ctrls.ctrl_class = V4L2_CTRL_CLASS_USER;
ctrls.count = 1;
ctrls.controls = &ctrl;
if (ioctl(video_fd_, VIDIOC_S_EXT_CTRLS, &ctrls) != 0) {
ESP_LOGE(TAG, "set HFLIP failed");
return false;
}
return true;
}
bool Esp32Camera::SetVFlip(bool enabled) {
if (video_fd_ < 0)
return false;
struct v4l2_ext_controls ctrls = {};
struct v4l2_ext_control ctrl = {};
ctrl.id = V4L2_CID_VFLIP;
ctrl.value = enabled ? 1 : 0;
ctrls.ctrl_class = V4L2_CTRL_CLASS_USER;
ctrls.count = 1;
ctrls.controls = &ctrl;
if (ioctl(video_fd_, VIDIOC_S_EXT_CTRLS, &ctrls) != 0) {
ESP_LOGE(TAG, "set VFLIP failed");
return false;
}
return true;
}
/**
* @brief Send the image captured by the camera to a remote server for AI analysis and explanation
*
* This function encodes the image currently held in the camera buffer as JPEG and
* sends it to the configured explain server via an HTTP POST request in
* multipart/form-data form. The server analyzes the image according to the given
* question and returns the result.
*
* Implementation notes:
* - JPEG encoding runs in a separate thread, decoupled from the main thread
* - Chunked transfer encoding is used to keep memory usage low
* - A queue synchronizes data between the encoder thread and the sending thread
* - Device ID, Client ID and an authentication token are sent as HTTP headers
*
* @param question The question to ask the AI about the image; sent as a form field
* @return std::string The JSON response string returned by the server.
*         On success it contains the AI analysis result, on failure an error message.
*         Example format: {"success": true, "result": "analysis result"}
*                         {"success": false, "message": "error message"}
*
* @note SetExplainUrl() must be called to set the server URL before calling this function
* @note The function waits for the previous encoder thread to finish before starting new work
* @warning If the camera buffer is empty or the network connection fails, an error message is returned
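*
* Illustrative usage (the config contents, URL, token and question below are
* placeholders, not values defined in this file):
* @code
*   esp_video_init_config_t video_config = {};   // filled with board-specific pins
*   Esp32Camera camera(video_config);
*   camera.SetExplainUrl("https://example.com/api/explain", "example-token");
*   if (camera.Capture()) {
*       std::string json = camera.Explain("What is in front of the camera?");
*   }
* @endcode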
*/
std::string Esp32Camera::Explain(const std::string& question) {
if (explain_url_.empty()) {
throw std::runtime_error("Image explain URL or token is not set");
}
// Create a local JPEG queue; 40 entries hold roughly 40 * 512 = 20480 bytes of JPEG data
QueueHandle_t jpeg_queue = xQueueCreate(40, sizeof(JpegChunk));
if (jpeg_queue == nullptr) {
ESP_LOGE(TAG, "Failed to create JPEG queue");
throw std::runtime_error("Failed to create JPEG queue");
}
// We spawn a thread to encode the image to JPEG using an optimized encoder (costs about 500 ms and 8 KB of SRAM)
encoder_thread_ = std::thread([this, jpeg_queue]() {
uint16_t w = frame_.width ? frame_.width : 320;
uint16_t h = frame_.height ? frame_.height : 240;
v4l2_pix_fmt_t enc_fmt = frame_.format;
image_to_jpeg_cb(
frame_.data, frame_.len, w, h, enc_fmt, 80,
[](void* arg, size_t index, const void* data, size_t len) -> size_t {
auto jpeg_queue = (QueueHandle_t)arg;
JpegChunk chunk = {.data = (uint8_t*)heap_caps_aligned_alloc(16, len, MALLOC_CAP_SPIRAM), .len = len};
memcpy(chunk.data, data, len);
xQueueSend(jpeg_queue, &chunk, portMAX_DELAY);
return len;
},
jpeg_queue);
});
auto network = Board::GetInstance().GetNetwork();
auto http = network->CreateHttp(3);
// Build the multipart/form-data request body
std::string boundary = "----ESP32_CAMERA_BOUNDARY";
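// The request body written below has this shape (CRLF line endings omitted here):
//   ------ESP32_CAMERA_BOUNDARY
//   Content-Disposition: form-data; name="question"
//
//   <question text>
//   ------ESP32_CAMERA_BOUNDARY
//   Content-Disposition: form-data; name="file"; filename="camera.jpg"
//   Content-Type: image/jpeg
//
//   <JPEG bytes>
//   ------ESP32_CAMERA_BOUNDARY--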
// Configure the HTTP client to use chunked transfer encoding
http->SetHeader("Device-Id", SystemInfo::GetMacAddress().c_str());
http->SetHeader("Client-Id", Board::GetInstance().GetUuid().c_str());
if (!explain_token_.empty()) {
http->SetHeader("Authorization", "Bearer " + explain_token_);
}
http->SetHeader("Content-Type", "multipart/form-data; boundary=" + boundary);
http->SetHeader("Transfer-Encoding", "chunked");
if (!http->Open("POST", explain_url_)) {
ESP_LOGE(TAG, "Failed to connect to explain URL");
// Clear the queue
encoder_thread_.join();
JpegChunk chunk;
while (xQueueReceive(jpeg_queue, &chunk, portMAX_DELAY) == pdPASS) {
if (chunk.data != nullptr) {
heap_caps_free(chunk.data);
} else {
break;
}
}
vQueueDelete(jpeg_queue);
throw std::runtime_error("Failed to connect to explain URL");
}
{
// Part 1: the question field
std::string question_field;
question_field += "--" + boundary + "\r\n";
question_field += "Content-Disposition: form-data; name=\"question\"\r\n";
question_field += "\r\n";
question_field += question + "\r\n";
http->Write(question_field.c_str(), question_field.size());
}
{
// Part 2: the file field header
std::string file_header;
file_header += "--" + boundary + "\r\n";
file_header += "Content-Disposition: form-data; name=\"file\"; filename=\"camera.jpg\"\r\n";
file_header += "Content-Type: image/jpeg\r\n";
file_header += "\r\n";
http->Write(file_header.c_str(), file_header.size());
}
// Part 3: the JPEG data
size_t total_sent = 0;
while (true) {
JpegChunk chunk;
if (xQueueReceive(jpeg_queue, &chunk, portMAX_DELAY) != pdPASS) {
ESP_LOGE(TAG, "Failed to receive JPEG chunk");
break;
}
if (chunk.data == nullptr) {
break; // The last chunk
}
http->Write((const char*)chunk.data, chunk.len);
total_sent += chunk.len;
heap_caps_free(chunk.data);
}
// Wait for the encoder thread to finish
encoder_thread_.join();
// Clean up the queue
vQueueDelete(jpeg_queue);
{
// Part 4: the multipart trailer
std::string multipart_footer;
multipart_footer += "\r\n--" + boundary + "--\r\n";
http->Write(multipart_footer.c_str(), multipart_footer.size());
}
// Terminating (empty) chunk ends the chunked transfer
http->Write("", 0);
if (http->GetStatusCode() != 200) {
ESP_LOGE(TAG, "Failed to upload photo, status code: %d", http->GetStatusCode());
throw std::runtime_error("Failed to upload photo");
}
std::string result = http->ReadAll();
http->Close();
// Get the remaining task stack size
size_t remain_stack_size = uxTaskGetStackHighWaterMark(nullptr);
ESP_LOGI(TAG, "Explain image size=%d bytes, compressed size=%d, remain stack size=%d, question=%s\n%s",
(int)frame_.len, (int)total_sent, (int)remain_stack_size, question.c_str(), result.c_str());
return result;
}