Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(video): reduce branching for faster video render task #89

Merged
merged 2 commits into from
Oct 15, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
90 changes: 51 additions & 39 deletions components/box-emu/src/box-emu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -487,12 +487,12 @@ void BoxEmu::palette(const uint16_t *palette, size_t size) {
palette_size_ = size;
}

void BoxEmu::push_frame(const void* frame) {
void IRAM_ATTR BoxEmu::push_frame(const void* frame) {
if (video_queue_ == nullptr) {
logger_.error("video queue is null, make sure to call initialize_video() first!");
return;
}
xQueueSend(video_queue_, &frame, 10 / portTICK_PERIOD_MS);
xQueueSend(video_queue_, &frame, 5 / portTICK_PERIOD_MS);
}

VideoSetting BoxEmu::video_setting() const {
Expand Down Expand Up @@ -717,28 +717,23 @@ const uint16_t* BoxEmu::palette() const {

bool BoxEmu::video_task_callback(std::mutex &m, std::condition_variable& cv) {
const void *_frame_ptr;
if (xQueuePeek(video_queue_, &_frame_ptr, 100 / portTICK_PERIOD_MS) != pdTRUE) {
// we couldn't get anything from the queue, return
return false;
}
if (_frame_ptr == nullptr) {
// make sure we clear the queue
xQueueReceive(video_queue_, &_frame_ptr, 10 / portTICK_PERIOD_MS);
// we got a nullptr, return
if (xQueueReceive(video_queue_, &_frame_ptr, portMAX_DELAY) != pdTRUE) {
return false;
}
static constexpr int num_lines_to_write = num_rows_in_framebuffer;
auto &box = espp::EspBox::get();
static int vram_index = 0; // has to be static so that it persists between calls
static uint16_t vram_index = 0; // has to be static so that it persists between calls
const int _x_offset = x_offset();
const int _y_offset = y_offset();
const uint16_t* _palette = palette();
uint16_t *vram0 = (uint16_t*)box.vram0();
uint16_t *vram1 = (uint16_t*)box.vram1();
if (is_native()) {
for (int y=0; y<display_height_; y+= num_lines_to_write) {
uint16_t* _buf = vram_index ? (uint16_t*)box.vram1() : (uint16_t*)box.vram0();
vram_index = vram_index ? 0 : 1;
int num_lines = std::min<int>(num_lines_to_write, display_height_-y);
if (has_palette()) {
if (has_palette()) {
for (int y=0; y<display_height_; y+= num_lines_to_write) {
uint16_t* _buf = (uint16_t*)((uint32_t)vram0 * (vram_index ^ 0x01) + (uint32_t)vram1 * vram_index);
vram_index = vram_index ^ 0x01;
int num_lines = std::min<int>(num_lines_to_write, display_height_-y);
const uint8_t* _frame = (const uint8_t*)_frame_ptr;
for (int i=0; i<num_lines; i++) {
// write two pixels (32 bits) at a time because it's faster
Expand All @@ -749,7 +744,14 @@ bool BoxEmu::video_task_callback(std::mutex &m, std::condition_variable& cv) {
_buf[dst_index + 1] = _palette[_frame[src_index + 1] % palette_size_];
}
}
} else {
box.write_lcd_frame(_x_offset, y + _y_offset, display_width_, num_lines, (uint8_t*)&_buf[0]);
}
} else {
// no palette
for (int y=0; y<display_height_; y+= num_lines_to_write) {
uint16_t* _buf = (uint16_t*)((uint32_t)vram0 * (vram_index ^ 0x01) + (uint32_t)vram1 * vram_index);
vram_index = vram_index ^ 0x01;
int num_lines = std::min<int>(num_lines_to_write, display_height_-y);
const uint16_t* _frame = (const uint16_t*)_frame_ptr;
for (int i=0; i<num_lines; i++) {
// write two pixels (32 bits) at a time because it's faster
Expand All @@ -761,8 +763,8 @@ bool BoxEmu::video_task_callback(std::mutex &m, std::condition_variable& cv) {
_buf[dst_index + 1] = _frame[src_index + 1];
}
}
box.write_lcd_frame(_x_offset, y + _y_offset, display_width_, num_lines, (uint8_t*)&_buf[0]);
}
box.write_lcd_frame(_x_offset, y + _y_offset, display_width_, num_lines, (uint8_t*)&_buf[0]);
}
} else {
// we are scaling the screen (and possibly using a custom palette)
Expand All @@ -773,21 +775,19 @@ bool BoxEmu::video_task_callback(std::mutex &m, std::condition_variable& cv) {
float inv_y_scale = (float)native_height_/display_height_;
int max_y = espp::EspBox::lcd_height();
int max_x = std::clamp<int>(x_scale * native_width_, 0, espp::EspBox::lcd_width());
for (int y=0; y<max_y; y+=num_lines_to_write) {
// each iteration of the loop, we swap the vram index so that we can
// write to the other buffer while the other one is being transmitted
int i = 0;
uint16_t* _buf = vram_index ? (uint16_t*)box.vram1() : (uint16_t*)box.vram0();
vram_index = vram_index ? 0 : 1;
for (; i<num_lines_to_write; i++) {
int _y = y+i;
if (_y >= max_y) {
break;
}
int source_y = (float)_y * inv_y_scale;
// should I put this around the outer loop or is this loop a good
// balance for performance of the check?
if (has_palette()) {
if (has_palette()) {
for (int y=0; y<max_y; y+=num_lines_to_write) {
// each iteration of the loop, we swap the vram index so that we can
// write to the other buffer while the other one is being transmitted
int i = 0;
uint16_t* _buf = (uint16_t*)((uint32_t)vram0 * (vram_index ^ 0x01) + (uint32_t)vram1 * vram_index);
vram_index = vram_index ^ 0x01;
for (; i<num_lines_to_write; i++) {
int _y = y+i;
if (_y >= max_y) {
break;
}
int source_y = (float)_y * inv_y_scale;
const uint8_t* _frame = (const uint8_t*)_frame_ptr;
// write two pixels (32 bits) at a time because it's faster
for (int x=0; x<max_x/2; x++) {
Expand All @@ -797,7 +797,23 @@ bool BoxEmu::video_task_callback(std::mutex &m, std::condition_variable& cv) {
_buf[dst_index] = _palette[_frame[src_index] % palette_size_];
_buf[dst_index + 1] = _palette[_frame[src_index + 1] % palette_size_];
}
} else {
}
box.write_lcd_frame(0 + _x_offset, y, max_x, i, (uint8_t*)&_buf[0]);
}
} else {
// no palette
for (int y=0; y<max_y; y+=num_lines_to_write) {
// each iteration of the loop, we swap the vram index so that we can
// write to the other buffer while the other one is being transmitted
int i = 0;
uint16_t* _buf = (uint16_t*)((uint32_t)vram0 * (vram_index ^ 0x01) + (uint32_t)vram1 * vram_index);
vram_index = vram_index ^ 0x01;
for (; i<num_lines_to_write; i++) {
int _y = y+i;
if (_y >= max_y) {
break;
}
int source_y = (float)_y * inv_y_scale;
const uint16_t* _frame = (const uint16_t*)_frame_ptr;
// write two pixels (32 bits) at a time because it's faster
for (int x=0; x<max_x/2; x++) {
Expand All @@ -808,13 +824,9 @@ bool BoxEmu::video_task_callback(std::mutex &m, std::condition_variable& cv) {
_buf[dst_index + 1] = _frame[src_index + 1];
}
}
box.write_lcd_frame(0 + _x_offset, y, max_x, i, (uint8_t*)&_buf[0]);
}
box.write_lcd_frame(0 + _x_offset, y, max_x, i, (uint8_t*)&_buf[0]);
}
}

// we don't have to worry here since we know there was an item in the queue
// since we peeked earlier.
xQueueReceive(video_queue_, &_frame_ptr, 10 / portTICK_PERIOD_MS);
return false;
}