From 4bc4bf65420ab8bf7ce7c47da70bd4b302048566 Mon Sep 17 00:00:00 2001 From: zoeyjodon Date: Thu, 9 May 2024 18:17:13 -0400 Subject: [PATCH] Pull in ds84182's shader code to make video processing WAAAYYY faster. Replace base renderer interface with an abstract class. Add common pixel writing logic in the base renderer class. --- src/video/n3ds/N3dsRenderer.hpp | 68 +++-- src/video/n3ds/N3dsRendererBase.cpp | 339 ++++++++++++++++++++++ src/video/n3ds/N3dsRendererBottom.cpp | 58 +--- src/video/n3ds/N3dsRendererDefault.cpp | 23 +- src/video/n3ds/N3dsRendererDualScreen.cpp | 3 +- src/video/n3ds_video.cpp | 19 +- src/video/n3ds_video_mvd.cpp | 97 +------ src/video/video.h | 3 + src/video/vshader.v.pica | 31 ++ 9 files changed, 453 insertions(+), 188 deletions(-) create mode 100644 src/video/n3ds/N3dsRendererBase.cpp create mode 100644 src/video/vshader.v.pica diff --git a/src/video/n3ds/N3dsRenderer.hpp b/src/video/n3ds/N3dsRenderer.hpp index 3f9825fd..8abd6625 100644 --- a/src/video/n3ds/N3dsRenderer.hpp +++ b/src/video/n3ds/N3dsRenderer.hpp @@ -20,30 +20,43 @@ #include <3ds.h> #include -class IN3dsRenderer { +#define MOON_CTR_VIDEO_TEX_W 1024 +#define MOON_CTR_VIDEO_TEX_H 512 +#define CMDLIST_SZ 0x800 + +class N3dsRendererBase { public: - IN3dsRenderer(int surface_width_in) { surface_width = surface_width_in; } + N3dsRendererBase(int surface_width_in, int surface_height_in, + int image_width_in, int image_height_in, int pixel_size); + virtual ~N3dsRendererBase(); /* virtual: renderers are destroyed through base-class pointers */ virtual void write_px_to_framebuffer(uint8_t *source, int px_size) = 0; + public: + u64 perf_frame_target_ticks = SYSCLOCK_ARM11 * ((double)(1.0 / 60.0)); + u64 perf_decode_ticks; + u64 perf_fbcopy_ticks; + + protected: + inline void draw_perf_counters(uint8_t *__restrict dest, int px_size); + void write_px_to_framebuffer_gpu(uint8_t *__restrict source, + uint8_t *__restrict dest, + uint8_t *__restrict dest_debug, + int px_size); + void ensure_3d_enabled(); + void ensure_3d_disabled(); + inline void write24(u8 *p, 
u32 val); + protected: int surface_width; - void ensure_3d_enabled() { - if (!gfxIs3D()) { - gfxSetWide(false); - gfxSet3D(true); - } - } - void ensure_3d_disabled() { - if (gfxIs3D()) { - gfxSet3D(false); - } - if (surface_width == GSP_SCREEN_HEIGHT_TOP_2X) { - gfxSetWide(true); - } - } + int surface_height; + int image_width; + int image_height; + u32 *cmdlist = NULL; + void *vramFb = NULL; + void *vramTex = NULL; }; -class N3dsRendererDefault : public IN3dsRenderer { +class N3dsRendererDefault : public N3dsRendererBase { public: N3dsRendererDefault(int dest_width, int dest_height, int src_width, int src_height, int px_size); @@ -68,8 +81,11 @@ class N3dsRendererDefault : public IN3dsRenderer { int src_width, int src_height, int px_size); - inline void write_px_to_framebuffer_2D(uint8_t *source, int px_size); - inline void write_px_to_framebuffer_3D(uint8_t *source, int px_size); + inline void write_px_to_framebuffer_2D(uint8_t *__restrict source, + uint8_t *__restrict scratch, + int px_size); + inline void write_px_to_framebuffer_3D(uint8_t *__restrict source, + int px_size); private: int offset_lut_size; @@ -82,24 +98,14 @@ class N3dsRendererDefault : public IN3dsRenderer { int *src_offset_lut_3d_r; }; -class N3dsRendererBottom : public IN3dsRenderer { +class N3dsRendererBottom : public N3dsRendererBase { public: N3dsRendererBottom(int src_width, int src_height, int px_size); ~N3dsRendererBottom(); void write_px_to_framebuffer(uint8_t *source, int px_size); - - private: - inline int get_dest_offset(int x, int y, int dest_height); - inline int get_source_offset(int x, int y, int src_width, int src_height, - int dest_width, int dest_height); - - private: - int offset_lut_size; - int *dest_offset_lut; - int *src_offset_lut; }; -class N3dsRendererDualScreen : public IN3dsRenderer { +class N3dsRendererDualScreen : public N3dsRendererBase { public: N3dsRendererDualScreen(int dest_width, int dest_height, int src_width, int src_height, int px_size); diff --git 
a/src/video/n3ds/N3dsRendererBase.cpp b/src/video/n3ds/N3dsRendererBase.cpp new file mode 100644 index 00000000..1f44fbf4 --- /dev/null +++ b/src/video/n3ds/N3dsRendererBase.cpp @@ -0,0 +1,339 @@ +/* + * This file is part of Moonlight Embedded. + * + * Copyright (C) 2015 Iwan Timmer + * + * Moonlight is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * (at your option) any later version. + * + * Moonlight is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Moonlight; if not, see . + */ + +#include "N3dsRenderer.hpp" +#include "vshader_shbin.h" + +#include <3ds.h> +#include +#include +#include +#include +#include + +N3dsRendererBase::N3dsRendererBase(int surface_width_in, int surface_height_in, + int image_width_in, int image_height_in, + int pixel_size) + : surface_width(surface_width_in), surface_height(surface_height_in), + image_width(image_width_in), image_height(image_height_in) { + cmdlist = (u32 *)linearAlloc(CMDLIST_SZ * 4); + vramFb = vramAlloc(surface_width * surface_height * pixel_size); + // Needs to be able to hold an 800x480 + vramTex = + vramAlloc(MOON_CTR_VIDEO_TEX_W * MOON_CTR_VIDEO_TEX_H * pixel_size); +} + +N3dsRendererBase::~N3dsRendererBase() { + linearFree(cmdlist); + vramFree(vramFb); + vramFree(vramTex); +} + +void N3dsRendererBase::ensure_3d_enabled() { + if (!gfxIs3D()) { + gfxSetWide(false); + gfxSet3D(true); + } +} + +void N3dsRendererBase::ensure_3d_disabled() { + if (gfxIs3D()) { + gfxSet3D(false); + } + if (surface_width == GSP_SCREEN_HEIGHT_TOP_2X) { + gfxSetWide(true); + } +} + +inline void N3dsRendererBase::write24(u8 
*p, u32 val) { + p[0] = val; + p[1] = val >> 8; + p[2] = val >> 16; +} + +inline void N3dsRendererBase::draw_perf_counters(uint8_t *__restrict dest, + int px_size) { + // Use a line going across the first scanline (left) for the perf counters. + // Clear to black + memset(dest, 0, GSP_SCREEN_WIDTH * 3); + + // Display frame target in the middle of the screen. + double perf_tick_divisor = + ((double)GSP_SCREEN_WIDTH) / ((double)(perf_frame_target_ticks * 2)); + u32 perf_px = 0; + u32 perf_tmp_height = 0; + +#define PERF_DRAW(ticks, r, g, b) \ + perf_tmp_height = perf_tick_divisor * ((double)(ticks)); \ + do { \ + if (perf_px >= GSP_SCREEN_WIDTH) /* only GSP_SCREEN_WIDTH pixels were cleared above */ \ + break; \ + const u32 color = (r << 16) | (g << 8) | b; \ + memcpy(dest + (perf_px * 3), &color, 3); \ + perf_px++; \ + } while (perf_tmp_height-- > 0); + + PERF_DRAW(perf_decode_ticks, 255, 0, 0); + PERF_DRAW(perf_fbcopy_ticks, 0, 0, 255); + + // Draw two green pixels at the center + perf_px = (GSP_SCREEN_WIDTH / 2) - 1; + PERF_DRAW(0, 0, 255, 0); + PERF_DRAW(0, 0, 255, 0); +} + +void N3dsRendererBase::write_px_to_framebuffer_gpu( + uint8_t *__restrict source, uint8_t *__restrict dest, + uint8_t *__restrict dest_debug, int px_size) { + // TODO: Do nothing when GPU right is lost. Currently hangs when going to + // the home menu. + + u64 start_ticks = svcGetSystemTick(); + + // NOTE: At 800x480, we can display the _width_ natively, but the height + // needs to be downsampled. MVD is incapable of downsampling, so we have to + // do it on the GPU. + + // TODO: If we can use rotation from the decoder, we can do a 2x downscale + // using display transfer and skip P3D. Not necessary because PICA is + // significantly faster than the decoder. + + // Tile the source image into the scratch buffer. 
+ GX_DisplayTransfer( + (u32 *)source, + GX_BUFFER_DIM(MOON_CTR_VIDEO_TEX_W, MOON_CTR_VIDEO_TEX_H), + (u32 *)vramTex, + GX_BUFFER_DIM(MOON_CTR_VIDEO_TEX_W, MOON_CTR_VIDEO_TEX_H), + GX_TRANSFER_FLIP_VERT(1) | GX_TRANSFER_OUT_TILED(1) | + GX_TRANSFER_IN_FORMAT(GX_TRANSFER_FMT_RGB565) | + GX_TRANSFER_OUT_FORMAT(GX_TRANSFER_FMT_RGB565)); + + // While the transfer is running, create a temporary command list to rotate + // the framebuffer into source + GPUCMD_SetBuffer(cmdlist, CMDLIST_SZ, 0); + + // TODO: Verify this mitigates rounding errors due to f24 precision issues. + +#define C GPUCMD_AddWrite + + C(GPUREG_FRAMEBUFFER_INVALIDATE, 1); + C(GPUREG_COLORBUFFER_LOC, osConvertVirtToPhys(vramFb) >> 3); + C(GPUREG_DEPTHBUFFER_LOC, 0); + C(GPUREG_RENDERBUF_DIM, + (1 << 24) | ((surface_width - 1) << 12) | surface_height); + C(GPUREG_FRAMEBUFFER_DIM, + (1 << 24) | ((surface_width - 1) << 12) | surface_height); + C(GPUREG_FRAMEBUFFER_BLOCK32, 0); + + C(GPUREG_DEPTH_COLOR_MASK, 0xF << 8); // Write RGBA, no depth + C(GPUREG_EARLYDEPTH_TEST1, 0); + C(GPUREG_EARLYDEPTH_TEST2, 0); + C(GPUREG_COLORBUFFER_FORMAT, GPU_RGB565 << 16); + C(GPUREG_COLORBUFFER_READ, + 0x0); // Buffer is uninitialized and should not be read. 
+ C(GPUREG_COLORBUFFER_WRITE, 0xF); + C(GPUREG_DEPTHBUFFER_READ, 0); // No depth buffer + C(GPUREG_DEPTHBUFFER_WRITE, 0); + + C(GPUREG_VIEWPORT_XY, 0); + + C(GPUREG_VIEWPORT_WIDTH, f32tof24(surface_height / 2)); + C(GPUREG_VIEWPORT_INVW, f32tof31(2.0 / ((double)surface_height)) << 1); + C(GPUREG_VIEWPORT_HEIGHT, f32tof24(surface_width / 2)); + C(GPUREG_VIEWPORT_INVH, f32tof31(2.0 / ((double)surface_width)) << 1); + + C(GPUREG_SCISSORTEST_MODE, 0); + C(GPUREG_SCISSORTEST_POS, 0); + C(GPUREG_SCISSORTEST_DIM, 0); + + C(GPUREG_DEPTHMAP_ENABLE, 1); + C(GPUREG_DEPTHMAP_SCALE, f32tof24(-1.0)); + C(GPUREG_DEPTHMAP_OFFSET, 0); + C(GPUREG_STENCIL_TEST, 0); + C(GPUREG_FRAGOP_ALPHA_TEST, 0); + C(GPUREG_LOGIC_OP, 3); + C(GPUREG_COLOR_OPERATION, 0x00E40000); + + // Texturing + C(GPUREG_TEXUNIT0_TYPE, GPU_RGB565); + C(GPUREG_TEXUNIT0_DIM, MOON_CTR_VIDEO_TEX_H | (MOON_CTR_VIDEO_TEX_W << 16)); + C(GPUREG_TEXUNIT0_ADDR1, osConvertVirtToPhys(vramTex) >> 3); + C(GPUREG_TEXUNIT0_PARAM, + GPU_NEAREST | (GPU_LINEAR << 1)); // Linear min and mag filter + + // Shading + // GPUCMD_AddMaskedWrite(GPUREG_SH_OUTATTR_CLOCK, 0x2, 1 << 8); // No Z, Yes + // texcoord0 + C(GPUREG_TEXUNIT_CONFIG, + 1 | (1 << 12) | (1 << 16)); // Activate texture 0, clear texture cache + + C(GPUREG_TEXENV0_SOURCE, 0x003003); // Texture 0 + C(GPUREG_TEXENV0_OPERAND, 0); // Source Color + C(GPUREG_TEXENV0_COMBINER, 0); // Replace + C(GPUREG_TEXENV0_SCALE, 0); // No Scale + + C(GPUREG_TEXENV1_SOURCE, 0x003003); // Texture 0 + C(GPUREG_TEXENV1_OPERAND, 0); // Source Color + C(GPUREG_TEXENV1_COMBINER, 0); // Replace + C(GPUREG_TEXENV1_SCALE, 0); // No Scale + + C(GPUREG_TEXENV2_SOURCE, 0x003003); // Texture 0 + C(GPUREG_TEXENV2_OPERAND, 0); // Source Color + C(GPUREG_TEXENV2_COMBINER, 0); // Replace + C(GPUREG_TEXENV2_SCALE, 0); // No Scale + + C(GPUREG_TEXENV3_SOURCE, 0x003003); // Texture 0 + C(GPUREG_TEXENV3_OPERAND, 0); // Source Color + C(GPUREG_TEXENV3_COMBINER, 0); // Replace + C(GPUREG_TEXENV3_SCALE, 0); // No 
Scale + + C(GPUREG_TEXENV4_SOURCE, 0x003003); // Texture 0 + C(GPUREG_TEXENV4_OPERAND, 0); // Source Color + C(GPUREG_TEXENV4_COMBINER, 0); // Replace + C(GPUREG_TEXENV4_SCALE, 0); // No Scale + + C(GPUREG_TEXENV5_SOURCE, 0x003003); // Texture 0 + C(GPUREG_TEXENV5_OPERAND, 0); // Source Color + C(GPUREG_TEXENV5_COMBINER, 0); // Replace + C(GPUREG_TEXENV5_SCALE, 0); // No Scale + + // Attribute buffers + C(GPUREG_ATTRIBBUFFERS_LOC, 0); + C(GPUREG_ATTRIBBUFFERS_FORMAT_LOW, 0); + C(GPUREG_ATTRIBBUFFERS_FORMAT_HIGH, + (0xFFF << 16) | (1 << 28)); // Two fixed vertex attributes + + // Vertex Shader + static DVLB_s *vshader_dvlb = NULL; + static shaderProgram_s program; + + if (!vshader_dvlb) { + vshader_dvlb = DVLB_ParseFile((u32 *)vshader_shbin, vshader_shbin_size); + shaderProgramInit(&program); + shaderProgramSetVsh(&program, &vshader_dvlb->DVLE[0]); + } + + shaderProgramUse(&program); + + C(GPUREG_VSH_NUM_ATTR, 1); // 2 attributes + GPUCMD_AddMaskedWrite(GPUREG_VSH_INPUTBUFFER_CONFIG, 0xB, + 1 | (0xA0 << 24)); // 2 attributes, no geometry shader + C(GPUREG_VSH_ATTRIBUTES_PERMUTATION_LOW, 0x00000010); + C(GPUREG_VSH_ATTRIBUTES_PERMUTATION_HIGH, 0); + + // Geometry Pipeline + C(GPUREG_FACECULLING_CONFIG, 0); + C(GPUREG_GEOSTAGE_CONFIG, 0); + GPUCMD_AddMaskedWrite(GPUREG_PRIMITIVE_CONFIG, 2, + (1 << 8) | + 1); // 2 outmap registers, drawing triangle strip + C(GPUREG_INDEXBUFFER_CONFIG, 0x80000000); + C(GPUREG_RESTART_PRIMITIVE, 1); + + // Vertex Data + GPUCMD_AddMaskedWrite(GPUREG_GEOSTAGE_CONFIG2, 1, 1); + GPUCMD_AddMaskedWrite(GPUREG_START_DRAW_FUNC0, 1, 0); + C(GPUREG_FIXEDATTRIB_INDEX, 0xF); + + union { + u32 packed[3]; + struct { + u8 x[3], y[3], z[3], w[3]; + }; + } param; + +#define ATTR(X, Y, Z, W) \ + { \ + write24(param.x, f32tof24(X)); \ + write24(param.y, f32tof24(Y)); \ + write24(param.z, f32tof24(Z)); \ + write24(param.w, f32tof24(W)); \ + \ + u32 p = param.packed[0]; \ + param.packed[0] = param.packed[2]; \ + param.packed[2] = p; \ + 
GPUCMD_AddIncrementalWrites(GPUREG_FIXEDATTRIB_DATA0, param.packed, \ + 3); \ + } + + float sw = image_width / 1024.0f; + float sh = image_height / 512.0f; + + // float hw = 2.0f / surface_height; + float hh = 2.0f / surface_width; + + ATTR(1.0, -1.0, 0.0, 0.0); // TR + ATTR(sw, -hh, 0.0, 0.0); + + ATTR(-1.0, -1.0, 0.0, 0.0); // TL + ATTR(sw, sh, 0.0, 0.0); + + ATTR(1.0, 1.0, 0.0, 0.0); // BR + ATTR(0.0, -hh, 0.0, 0.0); + + ATTR(-1.0, 1.0, 0.0, 0.0); // BL + ATTR(0.0, sh, 0.0, 0.0); + + // End Geometry Pipeline + GPUCMD_AddMaskedWrite(GPUREG_START_DRAW_FUNC0, 1, 1); + GPUCMD_AddMaskedWrite(GPUREG_GEOSTAGE_CONFIG2, 1, 0); + C(GPUREG_VTX_FUNC, 1); + + // Stop Command List + GPUCMD_AddMaskedWrite(GPUREG_PRIMITIVE_CONFIG, 0x8, 0x00000000); + C(GPUREG_FRAMEBUFFER_FLUSH, 1); + C(GPUREG_FRAMEBUFFER_INVALIDATE, 1); + +#undef C + + gspWaitForEvent(GSPGPU_EVENT_PPF, 0); + + u32 *unused; + u32 cmdlist_len; + GPUCMD_Split(&unused, &cmdlist_len); + GSPGPU_FlushDataCache(cmdlist, cmdlist_len * 4); /* byte count; cmdlist_len is in 32-bit words, as in the calls below */ + + extern u32 __ctru_linear_heap; + extern u32 __ctru_linear_heap_size; + GX_FlushCacheRegions(cmdlist, cmdlist_len * 4, (u32 *)__ctru_linear_heap, + __ctru_linear_heap_size, NULL, 0); + + GX_ProcessCommandList(cmdlist, cmdlist_len * 4, 2); + + gspWaitForEvent(GSPGPU_EVENT_P3D, 0); + + // Copy into framebuffer, untiled + + GX_DisplayTransfer( + (u32 *)vramFb, GX_BUFFER_DIM(surface_height, surface_width), + (u32 *)dest, GX_BUFFER_DIM(surface_height, surface_width), + GX_TRANSFER_OUT_TILED(0) | + GX_TRANSFER_IN_FORMAT(GX_TRANSFER_FMT_RGB565) | + GX_TRANSFER_OUT_FORMAT(GX_TRANSFER_FMT_RGB565) | + GX_TRANSFER_SCALING(GX_TRANSFER_SCALE_NO)); + + gspWaitForEvent(GSPGPU_EVENT_PPF, 0); + + perf_fbcopy_ticks = svcGetSystemTick() - start_ticks; + + // TODO: Add config option to enable/disable this + draw_perf_counters(dest_debug, px_size); + + gfxSwapBuffers(); +} diff --git a/src/video/n3ds/N3dsRendererBottom.cpp b/src/video/n3ds/N3dsRendererBottom.cpp index 103c91b0..be950116 100644 --- 
a/src/video/n3ds/N3dsRendererBottom.cpp +++ b/src/video/n3ds/N3dsRendererBottom.cpp @@ -18,6 +18,7 @@ */ #include "N3dsRenderer.hpp" +#include "vshader_shbin.h" #include #include @@ -25,57 +26,16 @@ #include #include -inline int N3dsRendererBottom::get_dest_offset(int x, int y, int dest_height) { - return dest_height - y - 1 + dest_height * x; -} - -inline int N3dsRendererBottom::get_source_offset(int x, int y, int src_width, - int src_height, - int dest_width, - int dest_height) { - return (x * src_width / dest_width) + - (y * src_height / dest_height) * src_width; -} - N3dsRendererBottom::N3dsRendererBottom(int src_width, int src_height, - int px_size) - : IN3dsRenderer(GSP_SCREEN_HEIGHT_BOTTOM) { - // Generate LUTs so we don't have to calculate pixel rotation while - // streaming. - offset_lut_size = GSP_SCREEN_HEIGHT_BOTTOM * GSP_SCREEN_WIDTH; - src_offset_lut = (int *)malloc(sizeof(int) * offset_lut_size); - if (!src_offset_lut) { - throw std::runtime_error("Out of memory!\n"); - } - dest_offset_lut = (int *)malloc(sizeof(int) * offset_lut_size); - if (!dest_offset_lut) { - throw std::runtime_error("Out of memory!\n"); - } + int px_size) + : N3dsRendererBase(GSP_SCREEN_HEIGHT_BOTTOM, GSP_SCREEN_WIDTH, src_width, + src_height, px_size) {} - int i = 0; - for (int y = 0; y < GSP_SCREEN_WIDTH; ++y) { - for (int x = 0; x < GSP_SCREEN_HEIGHT_BOTTOM; ++x) { - src_offset_lut[i] = - px_size * get_source_offset(x, y, src_width, src_height, - GSP_SCREEN_HEIGHT_BOTTOM, GSP_SCREEN_WIDTH); - dest_offset_lut[i] = px_size * get_dest_offset(x, y, GSP_SCREEN_WIDTH); - i++; - } - } -} - -N3dsRendererBottom::~N3dsRendererBottom() { - if (src_offset_lut) - free(src_offset_lut); - if (dest_offset_lut) - free(dest_offset_lut); -} +N3dsRendererBottom::~N3dsRendererBottom() {} -void N3dsRendererBottom::write_px_to_framebuffer(uint8_t *source, - int px_size) { +inline void N3dsRendererBottom::write_px_to_framebuffer(uint8_t *source, + int px_size) { u8 *dest = 
gfxGetFramebuffer(GFX_BOTTOM, GFX_LEFT, NULL, NULL); - for (int i = 0; i < offset_lut_size; i++) { - memcpy(dest + dest_offset_lut[i], source + src_offset_lut[i], px_size); - } - gfxScreenSwapBuffers(GFX_BOTTOM, false); + u8 *dest_debug = gfxGetFramebuffer(GFX_TOP, GFX_LEFT, NULL, NULL); + write_px_to_framebuffer_gpu(source, dest, dest_debug, px_size); } diff --git a/src/video/n3ds/N3dsRendererDefault.cpp b/src/video/n3ds/N3dsRendererDefault.cpp index 3e70260b..d2cc6bd8 100644 --- a/src/video/n3ds/N3dsRendererDefault.cpp +++ b/src/video/n3ds/N3dsRendererDefault.cpp @@ -18,7 +18,9 @@ */ #include "N3dsRenderer.hpp" +#include "vshader_shbin.h" +#include <3ds.h> #include #include #include @@ -124,7 +126,8 @@ inline void N3dsRendererDefault::init_px_to_framebuffer_3d(int dest_width, N3dsRendererDefault::N3dsRendererDefault(int dest_width, int dest_height, int src_width, int src_height, int px_size) - : IN3dsRenderer(dest_width) { + : N3dsRendererBase(dest_width, dest_height, src_width, src_height, + px_size) { init_px_to_framebuffer_2d(dest_width, dest_height, src_width, src_height, px_size); init_px_to_framebuffer_3d(GSP_SCREEN_HEIGHT_TOP, dest_height, src_width, @@ -144,17 +147,9 @@ N3dsRendererDefault::~N3dsRendererDefault() { free(dest_offset_lut_3d); } -inline void N3dsRendererDefault::write_px_to_framebuffer_2D(uint8_t *source, - int px_size) { - u8 *dest = gfxGetFramebuffer(GFX_TOP, GFX_LEFT, NULL, NULL); - for (int i = 0; i < offset_lut_size; i++) { - memcpy(dest + dest_offset_lut[i], source + src_offset_lut[i], px_size); - } - gfxScreenSwapBuffers(GFX_TOP, false); -} - -inline void N3dsRendererDefault::write_px_to_framebuffer_3D(uint8_t *source, - int px_size) { +inline void +N3dsRendererDefault::write_px_to_framebuffer_3D(uint8_t *__restrict source, + int px_size) { u8 *dest = gfxGetFramebuffer(GFX_TOP, GFX_LEFT, NULL, NULL); for (int i = 0; i < offset_lut_size_3d; i++) { memcpy(dest + dest_offset_lut_3d[i], source + src_offset_lut_3d_l[i], @@ -176,6 +171,8 @@ 
void N3dsRendererDefault::write_px_to_framebuffer(uint8_t *source, write_px_to_framebuffer_3D(source, px_size); } else { ensure_3d_disabled(); - write_px_to_framebuffer_2D(source, px_size); + u8 *dest = gfxGetFramebuffer(GFX_TOP, GFX_LEFT, NULL, NULL); + u8 *dest_debug = gfxGetFramebuffer(GFX_BOTTOM, GFX_LEFT, NULL, NULL); + write_px_to_framebuffer_gpu(source, dest, dest_debug, px_size); } } diff --git a/src/video/n3ds/N3dsRendererDualScreen.cpp b/src/video/n3ds/N3dsRendererDualScreen.cpp index d77e1ce0..c9353d72 100644 --- a/src/video/n3ds/N3dsRendererDualScreen.cpp +++ b/src/video/n3ds/N3dsRendererDualScreen.cpp @@ -103,7 +103,8 @@ inline void N3dsRendererDualScreen::init_px_to_framebuffer_ds(int dest_width, N3dsRendererDualScreen::N3dsRendererDualScreen(int dest_width, int dest_height, int src_width, int src_height, int px_size) - : IN3dsRenderer(dest_width) { + : N3dsRendererBase(dest_width, dest_height, src_width, src_height, + px_size) { ensure_3d_disabled(); init_px_to_framebuffer_ds(dest_width, dest_height, src_width, src_height, px_size); diff --git a/src/video/n3ds_video.cpp b/src/video/n3ds_video.cpp index 33f4ba0c..80ba7435 100644 --- a/src/video/n3ds_video.cpp +++ b/src/video/n3ds_video.cpp @@ -34,9 +34,9 @@ static void *ffmpeg_buffer; static size_t ffmpeg_buffer_size; static int image_width, image_height, surface_width, surface_height, pixel_size; -static u8 *img_buffer; +static u8 *rgb_img_buffer; -static std::unique_ptr renderer = nullptr; +static std::unique_ptr renderer = nullptr; enum n3ds_render_type N3DS_RENDER_TYPE = RENDER_DEFAULT; static int n3ds_setup(int videoFormat, int width, int height, int redrawRate, @@ -80,9 +80,9 @@ static int n3ds_setup(int videoFormat, int width, int height, int redrawRate, image_width = width; image_height = height; pixel_size = gspGetBytesPerPixel(px_fmt); - - img_buffer = (u8 *)linearAlloc(width * height * pixel_size); - if (!img_buffer) { + rgb_img_buffer = (u8 *)linearAlloc(MOON_CTR_VIDEO_TEX_W * + 
MOON_CTR_VIDEO_TEX_H * pixel_size); + if (!rgb_img_buffer) { fprintf(stderr, "Out of memory!\n"); return -1; } @@ -109,7 +109,7 @@ static int n3ds_setup(int videoFormat, int width, int height, int redrawRate, static void n3ds_cleanup() { ffmpeg_destroy(); y2rExit(); - linearFree(img_buffer); + linearFree(rgb_img_buffer); renderer = nullptr; } @@ -136,7 +136,7 @@ static inline int write_yuv_to_framebuffer(const u8 **source, int width, goto y2ru_failed; } - status = Y2RU_SetReceiving(img_buffer, width * height * px_size, 8, 0); + status = Y2RU_SetReceiving(rgb_img_buffer, width * height * px_size, 8, 0); if (status) { fprintf(stderr, "Y2RU_SetReceiving failed\n"); goto y2ru_failed; @@ -157,7 +157,7 @@ static inline int write_yuv_to_framebuffer(const u8 **source, int width, svcWaitSynchronization(conversion_finish_event_handle, 10000000); // Wait up to 10ms. svcCloseHandle(conversion_finish_event_handle); - renderer->write_px_to_framebuffer(img_buffer, px_size); + renderer->write_px_to_framebuffer(rgb_img_buffer, px_size); return DR_OK; y2ru_failed: @@ -179,9 +179,6 @@ static int n3ds_submit_decode_unit(PDECODE_UNIT decodeUnit) { ffmpeg_decode((unsigned char *)ffmpeg_buffer, length); AVFrame *frame = ffmpeg_get_frame(false); - // This is where we're erroring out? - // I was running the SW decoder too hard. Still, we should upgrade to C++ - // for exception handling. 
int status = write_yuv_to_framebuffer((const u8 **)frame->data, image_width, image_height, pixel_size); diff --git a/src/video/n3ds_video_mvd.cpp b/src/video/n3ds_video_mvd.cpp index 6aaf3ff5..36a72323 100644 --- a/src/video/n3ds_video_mvd.cpp +++ b/src/video/n3ds_video_mvd.cpp @@ -33,24 +33,17 @@ #include #define N3DS_DEC_BUFF_SIZE 23 -// Best performing transfer size (optimized through experimentation) -#define N3DS_YUYV_XFER_UNIT 800 -// Wait up to 20ms for YUYV conversion to complete (optimized through -// experimentation) -#define N3DS_YUYV_CONV_WAIT_NS 20000000 // General decoder and renderer state static void *nal_unit_buffer; static size_t nal_unit_buffer_size; static MVDSTD_Config mvdstd_config; -Handle conversion_finish_event_handle = NULL; static int image_width, image_height, surface_width, surface_height, pixel_size; -static u8 *yuv_img_buffer; static u8 *rgb_img_buffer; static bool first_frame = true; -static std::unique_ptr renderer = nullptr; +static std::unique_ptr renderer = nullptr; static int n3ds_init(int videoFormat, int width, int height, int redrawRate, void *context, int drFlags) { @@ -62,7 +55,7 @@ static int n3ds_init(int videoFormat, int width, int height, int redrawRate, } int status = - mvdstdInit(MVDMODE_VIDEOPROCESSING, MVD_INPUT_H264, MVD_OUTPUT_YUYV422, + mvdstdInit(MVDMODE_VIDEOPROCESSING, MVD_INPUT_H264, MVD_OUTPUT_BGR565, width * height * N3DS_DEC_BUFF_SIZE, NULL); if (status) { fprintf(stderr, "mvdstdInit failed: %d\n", status); @@ -70,30 +63,6 @@ static int n3ds_init(int videoFormat, int width, int height, int redrawRate, return -1; } - if (y2rInit()) { - fprintf(stderr, "Failed to initialize Y2R\n"); - return -1; - } - Y2RU_ConversionParams y2r_parameters; - y2r_parameters.input_format = INPUT_YUV422_BATCH; - y2r_parameters.output_format = OUTPUT_RGB_16_565; - y2r_parameters.rotation = ROTATION_NONE; - y2r_parameters.block_alignment = BLOCK_LINE; - y2r_parameters.input_line_width = width; - y2r_parameters.input_lines = 
height; - y2r_parameters.standard_coefficient = COEFFICIENT_ITU_R_BT_709_SCALING; - y2r_parameters.alpha = 0xFF; - status = Y2RU_SetConversionParams(&y2r_parameters); - if (status) { - fprintf(stderr, "Failed to set Y2RU params\n"); - return -1; - } - status = Y2RU_SetTransferEndInterrupt(true); - if (status) { - fprintf(stderr, "Failed to enable Y2RU interrupt\n"); - return -1; - } - surface_height = GSP_SCREEN_WIDTH; if (width > GSP_SCREEN_HEIGHT_TOP) { surface_width = GSP_SCREEN_HEIGHT_TOP_2X; @@ -105,12 +74,8 @@ static int n3ds_init(int videoFormat, int width, int height, int redrawRate, image_width = width; image_height = height; pixel_size = gspGetBytesPerPixel(px_fmt); - yuv_img_buffer = (u8 *)linearAlloc(width * height * pixel_size); - if (!yuv_img_buffer) { - fprintf(stderr, "Out of memory!\n"); - return -1; - } - rgb_img_buffer = (u8 *)linearAlloc(width * height * pixel_size); + rgb_img_buffer = (u8 *)linearAlloc(MOON_CTR_VIDEO_TEX_W * + MOON_CTR_VIDEO_TEX_H * pixel_size); if (!rgb_img_buffer) { fprintf(stderr, "Out of memory!\n"); return -1; @@ -121,7 +86,12 @@ static int n3ds_init(int videoFormat, int width, int height, int redrawRate, AV_INPUT_BUFFER_PADDING_SIZE); mvdstdGenerateDefaultConfig(&mvdstd_config, image_width, image_height, image_width, image_height, NULL, - (u32 *)yuv_img_buffer, NULL); + (u32 *)rgb_img_buffer, NULL); + + // Place within the 1024x512 buffer + mvdstd_config.flag_x104 = 1; + mvdstd_config.output_width_override = MOON_CTR_VIDEO_TEX_W; + mvdstd_config.output_height_override = MOON_CTR_VIDEO_TEX_H; MVDSTD_SetConfig(&mvdstd_config); switch (N3DS_RENDER_TYPE) { @@ -149,43 +119,10 @@ static void n3ds_destroy(void) { y2rExit(); mvdstdExit(); linearFree(nal_unit_buffer); - linearFree(yuv_img_buffer); linearFree(rgb_img_buffer); renderer = nullptr; } -static inline int yuv_to_rgb(u8 *dest, const u8 *source, int width, int height, - int px_size) { - int status = - Y2RU_SetSendingYUYV(source, width * height * 2, N3DS_YUYV_XFER_UNIT, 0); 
- if (status) { - fprintf(stderr, "Y2RU_SetSendingYUYV failed\n"); - goto y2ru_failed; - } - - status = Y2RU_SetReceiving(dest, width * height * px_size, 8, 0); - if (status) { - fprintf(stderr, "Y2RU_SetReceiving failed\n"); - goto y2ru_failed; - } - - status = Y2RU_StartConversion(); - if (status) { - fprintf(stderr, "Y2RU_StartConversion failed\n"); - goto y2ru_failed; - } - - status = Y2RU_GetTransferEndEvent(&conversion_finish_event_handle); - if (status) { - fprintf(stderr, "Y2RU_GetTransferEndEvent failed\n"); - goto y2ru_failed; - } - return DR_OK; - -y2ru_failed: - return -1; -} - // packets must be decoded in order // indata must be inlen + AV_INPUT_BUFFER_PADDING_SIZE in length static inline int n3ds_decode(unsigned char *indata, int inlen) { @@ -197,6 +134,7 @@ static inline int n3ds_decode(unsigned char *indata, int inlen) { } static int n3ds_submit_decode_unit(PDECODE_UNIT decodeUnit) { + u64 start_ticks = svcGetSystemTick(); PLENTRY entry = decodeUnit->bufferList; int length = 0; @@ -211,17 +149,10 @@ static int n3ds_submit_decode_unit(PDECODE_UNIT decodeUnit) { } GSPGPU_FlushDataCache(nal_unit_buffer, length); - if (conversion_finish_event_handle != NULL) { - svcWaitSynchronization(conversion_finish_event_handle, - N3DS_YUYV_CONV_WAIT_NS); - svcCloseHandle(conversion_finish_event_handle); - - renderer->write_px_to_framebuffer(rgb_img_buffer, pixel_size); - } - n3ds_decode((unsigned char *)nal_unit_buffer, length); - yuv_to_rgb(rgb_img_buffer, yuv_img_buffer, image_width, image_height, - pixel_size); + renderer->perf_decode_ticks = svcGetSystemTick() - start_ticks; + + renderer->write_px_to_framebuffer(rgb_img_buffer, pixel_size); // If MVD never gets an IDR frame, everything shows up gray if (first_frame) { diff --git a/src/video/video.h b/src/video/video.h index 52779ff0..2db87a4c 100644 --- a/src/video/video.h +++ b/src/video/video.h @@ -52,8 +52,11 @@ extern DECODER_RENDERER_CALLBACKS decoder_callbacks_x11_vdpau; #endif #endif #ifdef __3DS__ 
+#include <3ds/types.h> + enum n3ds_render_type { RENDER_DEFAULT, RENDER_BOTTOM, RENDER_DUAL_SCREEN }; extern enum n3ds_render_type N3DS_RENDER_TYPE; + extern DECODER_RENDERER_CALLBACKS decoder_callbacks_n3ds; extern DECODER_RENDERER_CALLBACKS decoder_callbacks_n3ds_mvd; #endif diff --git a/src/video/vshader.v.pica b/src/video/vshader.v.pica new file mode 100644 index 00000000..7704f02d --- /dev/null +++ b/src/video/vshader.v.pica @@ -0,0 +1,31 @@ +; Example PICA200 vertex shader + +; Uniforms +.fvec projection[4] + +; Constants +.constf myconst(0.0, 1.0, -1.0, 0.5) +.alias zeros myconst.xxxx ; Vector full of zeros +.alias ones myconst.yyyy ; Vector full of ones + +; Outputs +.out outpos position +.out outtc0 texcoord0 + +; Inputs (defined as aliases for convenience) +.alias inpos v0 +.alias intex v1 + +.proc main + ; Force the w component of inpos to be 1.0 + mov r0.xyz, inpos + mov r0.w, ones + + mov outpos, r0 + + ; outtc0 = intex + mov outtc0, intex + + ; We're finished + end +.end