From 1fcd64b65c10de5a11564a7f0a89904aa9abe556 Mon Sep 17 00:00:00 2001 From: Ivan Avdeev Date: Tue, 5 Dec 2023 13:00:55 -0500 Subject: [PATCH 1/3] vk: rt: add very experimental shader clock/timing output Doesn't work that well yet. --- ref/vk/shaders/denoiser.comp | 6 ++++++ ref/vk/shaders/ray_interop.h | 2 ++ ref/vk/shaders/ray_light_direct.glsl | 21 +++++++++++++++++++-- ref/vk/shaders/ray_light_direct_point.comp | 2 ++ ref/vk/shaders/ray_light_direct_poly.comp | 2 ++ ref/vk/vk_core.c | 15 +++++++++++++-- ref/vk/vk_rtx.c | 2 ++ 7 files changed, 46 insertions(+), 4 deletions(-) diff --git a/ref/vk/shaders/denoiser.comp b/ref/vk/shaders/denoiser.comp index 7d2b7e76b..da75f820a 100644 --- a/ref/vk/shaders/denoiser.comp +++ b/ref/vk/shaders/denoiser.comp @@ -220,6 +220,12 @@ void main() { readNormals(pix, geometry_normal, shading_normal); imageStore(out_dest, pix, vec4(.5 + geometry_normal * .5, 0.)); return; + } else if (ubo.ubo.debug_display_only == DEBUG_DISPLAY_TIME_DIRECT_POLY) { + imageStore(out_dest, pix, vec4(imageLoad(light_poly_diffuse, pix).a)); return; + return; + } else if (ubo.ubo.debug_display_only == DEBUG_DISPLAY_TIME_DIRECT_POINT) { + imageStore(out_dest, pix, vec4(imageLoad(light_point_diffuse, pix).a)); return; + return; } #ifdef DEBUG_NOISE diff --git a/ref/vk/shaders/ray_interop.h b/ref/vk/shaders/ray_interop.h index 728cbc2e4..c28aea75b 100644 --- a/ref/vk/shaders/ray_interop.h +++ b/ref/vk/shaders/ray_interop.h @@ -192,6 +192,8 @@ struct PushConstants { #define DEBUG_DISPLAY_INDIRECT_SPEC 10 #define DEBUG_DISPLAY_INDIRECT_DIFF 11 #define DEBUG_DISPLAY_TRIHASH 12 +#define DEBUG_DISPLAY_TIME_DIRECT_POLY 13 +#define DEBUG_DISPLAY_TIME_DIRECT_POINT 14 // add more when needed struct UniformBuffer { diff --git a/ref/vk/shaders/ray_light_direct.glsl b/ref/vk/shaders/ray_light_direct.glsl index e56bf42d1..3ba2cedc0 100644 --- a/ref/vk/shaders/ray_light_direct.glsl +++ b/ref/vk/shaders/ray_light_direct.glsl @@ -1,8 +1,13 @@ +#extension GL_ARB_shader_clock: enable +#extension GL_EXT_shader_realtime_clock: enable +#extension GL_ARB_gpu_shader_int64: enable #include "utils.glsl" #include "noise.glsl" #include "ray_kusochki.glsl" +vec4 profile = vec4(0.); + #include "light.glsl" void readNormals(ivec2 uv, out vec3 geometry_normal, out vec3 shading_normal) { @@ -11,7 +16,12 @@ void readNormals(ivec2 uv, out vec3 geometry_normal, out vec3 shading_normal) { shading_normal = normalDecode(n.zw); } +//#define timeNow clockRealtimeEXT +#define timeNow clockARB + void main() { + const uint64_t time_begin = timeNow(); + #ifdef RAY_TRACE const vec2 uv = (gl_LaunchIDEXT.xy + .5) / gl_LaunchSizeEXT.xy * 2. - 1.; const ivec2 pix = ivec2(gl_LaunchIDEXT.xy); @@ -49,13 +59,20 @@ void main() { vec3 diffuse = vec3(0.), specular = vec3(0.); computeLighting(pos + geometry_normal * .001, shading_normal, throughput, -direction, material, diffuse, specular); + const uint64_t time_end = timeNow(); + const uint64_t time_diff = time_end - time_begin; + + const float time_diff_f = float(time_diff) / 1e6;////float(time_diff >> 60);// / 1e6; + #if LIGHT_POINT - imageStore(out_light_point_diffuse, pix, vec4(diffuse, 0.f)); + imageStore(out_light_point_diffuse, pix, vec4(diffuse, time_diff_f)); imageStore(out_light_point_specular, pix, vec4(specular, 0.f)); + //imageStore(out_light_point_profile, pix, profile); #endif #if LIGHT_POLYGON - imageStore(out_light_poly_diffuse, pix, vec4(diffuse, 0.f)); + imageStore(out_light_poly_diffuse, pix, vec4(diffuse, time_diff_f)); imageStore(out_light_poly_specular, pix, vec4(specular, 0.f)); + //imageStore(out_light_poly_profile, pix, profile); #endif } diff --git a/ref/vk/shaders/ray_light_direct_point.comp b/ref/vk/shaders/ray_light_direct_point.comp index 3869987bd..d1130b180 100644 --- a/ref/vk/shaders/ray_light_direct_point.comp +++ b/ref/vk/shaders/ray_light_direct_point.comp @@ -31,6 +31,8 @@ layout(set = 0, binding = 31, std430) readonly buffer Kusochki { Kusok a[]; } ku layout(set = 0, binding = 32, std430) readonly buffer Indices { uint16_t a[]; } indices; layout(set = 0, binding = 33, std430) readonly buffer Vertices { Vertex a[]; } vertices; +//layout(set = 0, binding = 34, rgba16f) uniform writeonly image2D out_light_point_profile; + #define RAY_QUERY #define LIGHT_POINT 1 #include "ray_light_direct.glsl" diff --git a/ref/vk/shaders/ray_light_direct_poly.comp b/ref/vk/shaders/ray_light_direct_poly.comp index f5c595ad7..325241037 100644 --- a/ref/vk/shaders/ray_light_direct_poly.comp +++ b/ref/vk/shaders/ray_light_direct_poly.comp @@ -31,6 +31,8 @@ layout(set = 0, binding = 31, std430) readonly buffer Kusochki { Kusok a[]; } ku layout(set = 0, binding = 32, std430) readonly buffer Indices { uint16_t a[]; } indices; layout(set = 0, binding = 33, std430) readonly buffer Vertices { Vertex a[]; } vertices; +layout(set = 0, binding = 34, rgba16f) uniform writeonly image2D out_light_poly_profile; + #define RAY_QUERY #define LIGHT_POLYGON 1 #include "ray_light_direct.glsl" diff --git a/ref/vk/vk_core.c b/ref/vk/vk_core.c index ee9023a22..130364b93 100644 --- a/ref/vk/vk_core.c +++ b/ref/vk/vk_core.c @@ -98,6 +98,9 @@ static const char* device_extensions_rt[] = { VK_KHR_RAY_TRACING_PIPELINE_EXTENSION_NAME, VK_KHR_DEFERRED_HOST_OPERATIONS_EXTENSION_NAME, VK_KHR_RAY_QUERY_EXTENSION_NAME, + + // TODO optional + VK_KHR_SHADER_CLOCK_EXTENSION_NAME, }; static const char* device_extensions_nv_checkpoint[] = { @@ -526,9 +529,16 @@ static qboolean createDevice( void ) { .pNext = head, .rayQuery = VK_TRUE, }; + head = &ray_query_pipeline_feature; + VkPhysicalDeviceShaderClockFeaturesKHR shader_clock_feature = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_CLOCK_FEATURES_KHR, + .pNext = head, + .shaderDeviceClock = VK_TRUE, + .shaderSubgroupClock = VK_TRUE, + }; if (vk_core.rtx) { - head = &ray_query_pipeline_feature; + head = &shader_clock_feature; } else { head = NULL; } @@ -537,7 +547,8 @@ static qboolean createDevice( void ) { .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2, .pNext = head, .features.samplerAnisotropy = candidate_device->features.features.samplerAnisotropy, - .features.shaderInt16 = true, + .features.shaderInt16 = VK_TRUE, + .features.shaderInt64 = VK_TRUE, }; head = &features; diff --git a/ref/vk/vk_rtx.c b/ref/vk/vk_rtx.c index b1d230a9a..2eb707fd7 100644 --- a/ref/vk/vk_rtx.c +++ b/ref/vk/vk_rtx.c @@ -167,6 +167,8 @@ static void parseDebugDisplayValue( void ) { X(INDIRECT) \ X(INDIRECT_SPEC) \ X(INDIRECT_DIFF) \ + X(TIME_DIRECT_POLY) \ + X(TIME_DIRECT_POINT) \ #define X(suffix) \ if (0 == Q_stricmp(cvalue, #suffix)) { \ From cbdbda3549046063fda4a9ce2fce68fed27ee645 Mon Sep 17 00:00:00 2001 From: Ivan Avdeev Date: Thu, 7 Dec 2023 12:38:19 -0500 Subject: [PATCH 2/3] vk: rt: extract raw shader clock values also add observations to the shader, what values do we see --- ref/vk/NOTES.md | 13 ++++ ref/vk/TODO.md | 6 ++ ref/vk/shaders/ray_interop.h | 12 +++- ref/vk/shaders/ray_light_direct.glsl | 48 +++++++++++--- ref/vk/shaders/ray_light_direct_point.comp | 2 + ref/vk/shaders/ray_light_direct_poly.comp | 3 +- ref/vk/vk_rtx.c | 76 +++++++++++++++++++--- 7 files changed, 139 insertions(+), 21 deletions(-) diff --git a/ref/vk/NOTES.md b/ref/vk/NOTES.md index 376d3fd8e..bd732883a 100644 --- a/ref/vk/NOTES.md +++ b/ref/vk/NOTES.md @@ -1011,3 +1011,16 @@ This would need the same as above, plus: - A: probably should still do it on GPU lol This would also allow passing arbitrary per-pixel data from shaders, which would make shader debugging much much easier. + +# 2023-12-07 E343 +## What do we really need for shader profiling +### Optimizing polygon light sampling +- Per-pixel numbers: + - Total shader time + - Sampling selection time (Σ) + - Selecting lights to sample (+count) + - Selecting light point to sample + - Vertices count + - Ray tracing time (Σ, +count) +- Aggregate numbers: + - TODO: what does VK_KHR_performance_query give us? Regs usage, etc. diff --git a/ref/vk/TODO.md b/ref/vk/TODO.md index 3328313b2..54b9afb6e 100644 --- a/ref/vk/TODO.md +++ b/ref/vk/TODO.md @@ -1,3 +1,9 @@ +# 2023-12-07 E343 +- [x] extract raw shader clock +- [ ] display times as scopes somewhere +- [ ] extract sampling times +- [ ] extract ray times + # 2023-12-05 E342 - [x] tone down the specular indirect blur - [-] try func_wall static light opt, #687 diff --git a/ref/vk/shaders/ray_interop.h b/ref/vk/shaders/ray_interop.h index c28aea75b..4c4db0fa0 100644 --- a/ref/vk/shaders/ray_interop.h +++ b/ref/vk/shaders/ray_interop.h @@ -21,13 +21,15 @@ #define vec3 vec3_t #define vec4 vec4_t #define mat4 matrix4x4 -typedef int ivec3[3]; -typedef int ivec2[2]; +typedef int32_t ivec3[3]; +typedef int32_t ivec2[2]; +typedef uint32_t uvec2[2]; +typedef uint32_t uvec3[3]; +typedef uint32_t uvec4[4]; #define TOKENPASTE(x, y) x ## y #define TOKENPASTE2(x, y) TOKENPASTE(x, y) #define PAD(x) float TOKENPASTE2(pad_, __LINE__)[x]; #define STRUCT struct - enum { #define DECLARE_SPECIALIZATION_CONSTANT(index, type, name, default_value) \ SPEC_##name##_INDEX = index, @@ -207,6 +209,10 @@ struct UniformBuffer { uint debug_display_only; }; +struct ProfilingStruct { + uvec4 data[4]; +}; + #undef PAD #undef STRUCT diff --git a/ref/vk/shaders/ray_light_direct.glsl b/ref/vk/shaders/ray_light_direct.glsl index 3ba2cedc0..1c423f745 100644 --- a/ref/vk/shaders/ray_light_direct.glsl +++ b/ref/vk/shaders/ray_light_direct.glsl @@ -1,6 +1,12 @@ -#extension GL_ARB_shader_clock: enable -#extension GL_EXT_shader_realtime_clock: enable +#define PROF_USE_REALTIME +#ifdef PROF_USE_REALTIME #extension GL_ARB_gpu_shader_int64: enable +#extension GL_EXT_shader_realtime_clock: enable +#else +#extension GL_ARB_shader_clock: enable +#endif + + #include "utils.glsl" #include "noise.glsl" @@ -16,11 +22,25 @@ void readNormals(ivec2 uv, out vec3 geometry_normal, out vec3 shading_normal) { shading_normal = normalDecode(n.zw); } -//#define timeNow clockRealtimeEXT -#define timeNow clockARB +#ifdef PROF_USE_REALTIME +// On mesa+amdgpu there's a clear gradient: pixels on top of screen take 2-3x longer to compute than bottom ones. Also, +// it does flicker a lot. +// Deltas are about 30000-100000 parrots +#define timeNow clockRealtime2x32EXT +#else +// clockARB doesn't give directly usable time values on mesa+amdgpu +// even deltas between them are not meaningful enough. +// On mesa+amdgpu clockARB() values are limited to lower 20bits, and they wrap around a lot. +// Absolute difference value are often 30-50% of the available range, so it's not that far off from wrapping around +// multiple times, rendering the value completely useless. +// Deltas are around 300000-500000 parrots. +// Other than that, the values seem uniform across the screen (as compared to realtime clock, which has a clearly +// visible gradient: top differences are larger than bottom ones. +#define timeNow clock2x32ARB +#endif void main() { - const uint64_t time_begin = timeNow(); + const uvec2 time_begin = timeNow(); #ifdef RAY_TRACE const vec2 uv = (gl_LaunchIDEXT.xy + .5) / gl_LaunchSizeEXT.xy * 2. - 1.; @@ -59,20 +79,32 @@ void main() { vec3 diffuse = vec3(0.), specular = vec3(0.); computeLighting(pos + geometry_normal * .001, shading_normal, throughput, -direction, material, diffuse, specular); - const uint64_t time_end = timeNow(); - const uint64_t time_diff = time_end - time_begin; + const uvec2 time_end = timeNow(); + //const uint64_t time_diff = time_end - time_begin; + //const uint time_diff = time_begin.x - time_end.x; - const float time_diff_f = float(time_diff) / 1e6;////float(time_diff >> 60);// / 1e6; +#ifdef PROF_USE_REALTIME + const uint64_t begin64 = time_begin.x | (uint64_t(time_begin.y) << 32); + const uint64_t end64 = time_end.x | (uint64_t(time_end.y) << 32); + const uint64_t time_diff = end64 - begin64; + const float time_diff_f = float(time_diff) / 1e5; +#else + const uint time_diff = time_begin.x - time_end.x; + const float time_diff_f = float(time_diff & 0xfffffu) / 1e6; +#endif + const uint prof_index = pix.x + pix.y * ubo.ubo.res.x; #if LIGHT_POINT imageStore(out_light_point_diffuse, pix, vec4(diffuse, time_diff_f)); imageStore(out_light_point_specular, pix, vec4(specular, 0.f)); //imageStore(out_light_point_profile, pix, profile); + //prof_direct_point[prof_index].data[0] = vec4(time_begin, time_end); #endif #if LIGHT_POLYGON imageStore(out_light_poly_diffuse, pix, vec4(diffuse, time_diff_f)); imageStore(out_light_poly_specular, pix, vec4(specular, 0.f)); //imageStore(out_light_poly_profile, pix, profile); + prof_direct_poly.a[prof_index].data[0] = uvec4(time_begin, time_end); #endif } diff --git a/ref/vk/shaders/ray_light_direct_point.comp b/ref/vk/shaders/ray_light_direct_point.comp index d1130b180..7081712fc 100644 --- a/ref/vk/shaders/ray_light_direct_point.comp +++ b/ref/vk/shaders/ray_light_direct_point.comp @@ -32,6 +32,8 @@ layout(set = 0, binding = 32, std430) readonly buffer Indices { uint16_t a[]; } layout(set = 0, binding = 33, std430) readonly buffer Vertices { Vertex a[]; } vertices; //layout(set = 0, binding = 34, rgba16f) uniform writeonly image2D out_light_point_profile; +//layout(set = 0, binding = 34, std430) writeonly buffer Profiling { ProfilingStruct prof_direct_point[]; }; + #define RAY_QUERY #define LIGHT_POINT 1 diff --git a/ref/vk/shaders/ray_light_direct_poly.comp b/ref/vk/shaders/ray_light_direct_poly.comp index 325241037..f0568c867 100644 --- a/ref/vk/shaders/ray_light_direct_poly.comp +++ b/ref/vk/shaders/ray_light_direct_poly.comp @@ -31,7 +31,8 @@ layout(set = 0, binding = 31, std430) readonly buffer Kusochki { Kusok a[]; } ku layout(set = 0, binding = 32, std430) readonly buffer Indices { uint16_t a[]; } indices; layout(set = 0, binding = 33, std430) readonly buffer Vertices { Vertex a[]; } vertices; -layout(set = 0, binding = 34, rgba16f) uniform writeonly image2D out_light_poly_profile; +//layout(set = 0, binding = 34, rgba16f) uniform writeonly image2D out_light_poly_profile; +layout(set = 0, binding = 34, std430) writeonly restrict buffer Profiling { ProfilingStruct a[]; } prof_direct_poly; #define RAY_QUERY #define LIGHT_POLYGON 1 diff --git a/ref/vk/vk_rtx.c b/ref/vk/vk_rtx.c index 2eb707fd7..c22e718b5 100644 --- a/ref/vk/vk_rtx.c +++ b/ref/vk/vk_rtx.c @@ -45,7 +45,8 @@ X(Buffer, light_grid) \ X(Texture, textures) \ X(Texture, skybox) \ - X(Texture, blue_noise_texture) + X(Texture, blue_noise_texture) \ + X(Buffer, prof_direct_poly) \ enum { #define RES_ENUM(type, name) ExternalResource_##name, @@ -80,10 +81,13 @@ static struct { matrix4x4 prev_inv_proj, prev_inv_view; qboolean reload_pipeline; + qboolean dump_profiling_data; qboolean discontinuity; int max_frame_width, max_frame_height; + vk_buffer_t prof_direct_poly; + struct { cvar_t *rt_debug_display_only; uint32_t rt_debug_display_only_value; @@ -267,6 +271,9 @@ static void performTracing( vk_combuf_t *combuf, const perform_tracing_args_t* a RES_SET_SBUFFER_FULL(indices, args->render_args->geometry_data); RES_SET_SBUFFER_FULL(vertices, args->render_args->geometry_data); + // TODO register this one properly + RES_SET_SBUFFER_FULL(prof_direct_poly, g_rtx.prof_direct_poly); + // TODO move this to lights RES_SET_BUFFER(lights, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, args->light_bindings->buffer, args->light_bindings->metadata.offset, args->light_bindings->metadata.size); RES_SET_BUFFER(light_grid, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, args->light_bindings->buffer, args->light_bindings->grid.offset, args->light_bindings->grid.size); @@ -593,6 +600,35 @@ static void reloadMainpipe(void) { R_VkMeatpipeDestroy(newpipe); } +static void dumpProfilingData(int w, int h) { + FILE *f = fopen("gpuprof.dump", "w"); + for (int y = 0; y < h; ++y) { + const struct ProfilingStruct *s = ((const struct ProfilingStruct*)(g_rtx.prof_direct_poly.mapped)) + y * g_rtx.max_frame_width; + for (int x = 0; x < w; ++x, ++s) { + const unsigned long long begin = s->data[0][0] | ((uint64_t)(s->data[0][1]) << 32); + const unsigned long long end = s->data[0][2] | ((uint64_t)(s->data[0][3]) << 32); + fprintf(f, "x=%d y=%d %016llx %016llx %llu %llu d=%llu\n", x, y, begin, end, begin, end, end - begin); + } + } + fclose(f); +} + +static void resizeResources(void) { + // TODO resize textures/images + + if (g_rtx.prof_direct_poly.buffer != VK_NULL_HANDLE) { + VK_BufferDestroy(&g_rtx.prof_direct_poly); + } + + const uint32_t size = sizeof(struct ProfilingStruct) * g_rtx.max_frame_width * g_rtx.max_frame_height; + const VkBufferUsageFlags usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT; + const VkMemoryPropertyFlags memf = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT; + + if (!VK_BufferCreate("prof_direct_poly", &g_rtx.prof_direct_poly, size, usage, memf)) { + gEngine.Host_Error("Failed to recreate prof_direct_poly"); + } +} + void VK_RayFrameEnd(const vk_ray_frame_render_args_t* args) { APROF_SCOPE_DECLARE_BEGIN(ray_frame_end, __FUNCTION__); @@ -612,26 +648,40 @@ void VK_RayFrameEnd(const vk_ray_frame_render_args_t* args) // if (vk_core.debug) // XVK_RayModel_Validate(); - qboolean need_reload = g_rtx.reload_pipeline; + qboolean resize = false; if (g_rtx.max_frame_width < args->dst.width) { g_rtx.max_frame_width = ALIGN_UP(args->dst.width, 16); WARN("Increasing max_frame_width to %d", g_rtx.max_frame_width); - // TODO only reload resources, no need to reload the entire pipeline - need_reload = true; + resize = true; } if (g_rtx.max_frame_height < args->dst.height) { g_rtx.max_frame_height = ALIGN_UP(args->dst.height, 16); WARN("Increasing max_frame_height to %d", g_rtx.max_frame_height); - // TODO only reload resources, no need to reload the entire pipeline - need_reload = true; + resize = true; } - if (need_reload) { + // TODO only reload resources, no need to reload the entire pipeline + const qboolean need_reload = g_rtx.reload_pipeline || resize; + + if (need_reload || resize || g_rtx.dump_profiling_data) { WARN("Reloading RTX shaders/pipelines"); XVK_CHECK(vkDeviceWaitIdle(vk_core.device)); + } + + if (g_rtx.dump_profiling_data) { + // TODO using next frame w/h is not correct, need previous w/h + dumpProfilingData(args->dst.width, args->dst.height); + g_rtx.dump_profiling_data = false; + } + if (resize) { + // Resize after wait idle + resizeResources(); + } + + if (need_reload) { reloadMainpipe(); g_rtx.reload_pipeline = false; @@ -692,10 +742,14 @@ void VK_RayFrameEnd(const vk_ray_frame_render_args_t* args) g_rtx.discontinuity = false; } -static void reloadPipeline( void ) { +static void cmdReloadPipeline( void ) { g_rtx.reload_pipeline = true; } +static void cmdDumpProfilingData( void ) { + g_rtx.dump_profiling_data = true; +} + qboolean VK_RayInit( void ) { ASSERT(vk_core.rtx); @@ -754,7 +808,10 @@ qboolean VK_RayInit( void ) RT_RayModel_Clear(); - gEngine.Cmd_AddCommand("rt_debug_reload_pipelines", reloadPipeline, "Reload RT pipelines"); + resizeResources(); + + gEngine.Cmd_AddCommand("rt_debug_reload_pipelines", cmdReloadPipeline, "Reload RT pipelines"); + gEngine.Cmd_AddCommand("rt_debug_prof_dump", cmdDumpProfilingData, "Dump profiling data"); #define X(name) #name ", " g_rtx.debug.rt_debug_display_only = gEngine.Cvar_Get("rt_debug_display_only", "", FCVAR_GLCONFIG, @@ -775,6 +832,7 @@ void VK_RayShutdown( void ) { VK_BufferDestroy(&g_ray_model_state.model_headers_buffer); VK_BufferDestroy(&g_ray_model_state.kusochki_buffer); VK_BufferDestroy(&g_rtx.uniform_buffer); + VK_BufferDestroy(&g_rtx.prof_direct_poly); RT_VkAccelShutdown(); RT_DynamicModelShutdown(); From 6b51fde5b2ff17817f8d0a1e6884a26b842b82ac Mon Sep 17 00:00:00 2001 From: Ivan Avdeev Date: Fri, 8 Dec 2023 13:20:12 -0500 Subject: [PATCH 3/3] vk: rt: add more shader clock timing experiments and results See notes and stream E344 for data and comments --- ref/vk/NOTES.md | 76 +++++++++++++++++++++ ref/vk/TODO.md | 15 ++++- ref/vk/shaders/light_polygon.glsl | 17 ++++- ref/vk/shaders/ray_light_direct.glsl | 46 +++---------- ref/vk/shaders/time.glsl | 53 +++++++++++++++ ref/vk/vk_rtx.c | 98 +++++++++++++++++++++++++++- 6 files changed, 261 insertions(+), 44 deletions(-) create mode 100644 ref/vk/shaders/time.glsl diff --git a/ref/vk/NOTES.md b/ref/vk/NOTES.md index bd732883a..68d46e0b2 100644 --- a/ref/vk/NOTES.md +++ b/ref/vk/NOTES.md @@ -1024,3 +1024,79 @@ This would also allow passing arbitrary per-pixel data from shaders, which would - Ray tracing time (Σ, +count) - Aggregate numbers: - TODO: what does VK_KHR_performance_query give us? Regs usage, etc. + +# 2023-12-08 E344 +## Experiment one +some save in test_brush2, default PROJECTED sampling + 0-5%: s=0(0, 0.00%) r=133495(133495, 13.18%) + 5-10%: s=0(0, 0.00%) r=204348(337843, 33.35%) + 10-15%: s=0(0, 0.00%) r=92673(430516, 42.49%) + 15-20%: s=0(0, 0.00%) r=25196(455712, 44.98%) + 20-25%: s=67(67, 0.01%) r=3529(459241, 45.33%) + 25-30%: s=253(320, 0.03%) r=12018(471259, 46.52%) + 30-35%: s=319(639, 0.06%) r=39805(511064, 50.45%) + 35-40%: s=2096(2735, 0.27%) r=178843(689907, 68.10%) + 40-45%: s=8753(11488, 1.13%) r=270099(960006, 94.76%) + 45-50%: s=53958(65446, 6.46%) r=44000(1004006, 99.10%) + 50-55%: s=275150(340596, 33.62%) r=8256(1012262, 99.92%) + 55-60%: s=183343(523939, 51.72%) r=838(1013100, 100.00%) + 60-65%: s=38922(562861, 55.56%) r=0(1013100, 100.00%) + 65-70%: s=6603(569464, 56.21%) r=0(1013100, 100.00%) + 70-75%: s=14101(583565, 57.60%) r=0(1013100, 100.00%) + 75-80%: s=85481(669046, 66.04%) r=0(1013100, 100.00%) + 80-85%: s=150723(819769, 80.92%) r=0(1013100, 100.00%) + 85-90%: s=107863(927632, 91.56%) r=0(1013100, 100.00%) + 90-95%: s=85468(1013100, 100.00%) r=0(1013100, 100.00%) + 95-100%: s=0(1013100, 100.00%) r=0(1013100, 100.00%) + +(shader clock) percentiles: +99%: ray=393223 sampling=496965 +95%: ray=321478 sampling=365558 +90%: ray=276235 sampling=356515 +75%: ray=250132 sampling=349746 +50%: ray=184418 sampling=337416 + +PROJECTED + REALTIME clock: +percentiles: +99%: ray=45580 sampling=62860 +95%: ray=30324 sampling=60136 +90%: ray=19736 sampling=58868 +75%: ray=13688 sampling=48068 +50%: ray=10784 sampling=22500 + +## Two +SOLID sampling + 0-5%: s=6(6, 0.00%) r=0(0, 0.00%) + 5-10%: s=3361(3367, 0.33%) r=0(0, 0.00%) + 10-15%: s=89901(93268, 9.21%) r=0(0, 0.00%) + 15-20%: s=681106(774374, 76.44%) r=0(0, 0.00%) + 20-25%: s=125185(899559, 88.79%) r=0(0, 0.00%) + 25-30%: s=111237(1010796, 99.77%) r=222(222, 0.02%) + 30-35%: s=2080(1012876, 99.98%) r=667(889, 0.09%) + 35-40%: s=224(1013100, 100.00%) r=621(1510, 0.15%) + 40-45%: s=0(1013100, 100.00%) r=1049(2559, 0.25%) + 45-50%: s=0(1013100, 100.00%) r=2071(4630, 0.46%) + 50-55%: s=0(1013100, 100.00%) r=1494(6124, 0.60%) + 55-60%: s=0(1013100, 100.00%) r=7099(13223, 1.31%) + 60-65%: s=0(1013100, 100.00%) r=80026(93249, 9.20%) + 65-70%: s=0(1013100, 100.00%) r=152210(245459, 24.23%) + 70-75%: s=0(1013100, 100.00%) r=466794(712253, 70.30%) + 75-80%: s=0(1013100, 100.00%) r=276878(989131, 97.63%) + 80-85%: s=0(1013100, 100.00%) r=22822(1011953, 99.89%) + 85-90%: s=0(1013100, 100.00%) r=1147(1013100, 100.00%) + 90-95%: s=0(1013100, 100.00%) r=0(1013100, 100.00%) + 95-100%: s=0(1013100, 100.00%) r=0(1013100, 100.00%) + +(shader clock) percentiles: +99%: ray=565588 sampling=105982 +95%: ray=422982 sampling=102555 +90%: ray=391055 sampling=100877 +75%: ray=350036 sampling=97007 +50%: ray=290830 sampling=76574 + +SOLID + REALTIME clock +99%: ray=97056 sampling=20328 +95%: ray=78988 sampling=19532 +90%: ray=74236 sampling=19076 +75%: ray=51580 sampling=17512 +50%: ray=29268 sampling=6648 diff --git a/ref/vk/TODO.md b/ref/vk/TODO.md index 54b9afb6e..f2d8d8229 100644 --- a/ref/vk/TODO.md +++ b/ref/vk/TODO.md @@ -1,8 +1,17 @@ +# Next +- [ ] performance query + +# 2023-12-08 E344 +- [x] measure percentage of direct light shader phases: + - [x] how long does sampling take -- ~~supposedly ALU-bound, need clockARB()?~~ + - [x] how long does ray tracing take -- ~~supposedly mem-bound, clockRealtimeEXT()?~~ +- [ ] try mapping shader realtime clock values to calibrated vk compute dispatch timestamps + Similar to what RGP draws, but not quite the same. Not sure if this is valuable, but it would be neat + # 2023-12-07 E343 - [x] extract raw shader clock -- [ ] display times as scopes somewhere -- [ ] extract sampling times -- [ ] extract ray times +- [-] display times as scopes somewhere + → tried chrome trace in ff profiler, it broke (200k scopes in 1k threads is too much) # 2023-12-05 E342 - [x] tone down the specular indirect blur diff --git a/ref/vk/shaders/light_polygon.glsl b/ref/vk/shaders/light_polygon.glsl index 728df6fe9..d4030f6eb 100644 --- a/ref/vk/shaders/light_polygon.glsl +++ b/ref/vk/shaders/light_polygon.glsl @@ -5,6 +5,10 @@ #include "noise.glsl" #include "utils.glsl" +#include "time.glsl" + +uint prof_sampling = 0, prof_ray = 0; +uint prof_lights = 0, prof_samples = 0, prof_rays = 0; #define DO_ALL_IN_CLUSTER 1 @@ -232,9 +236,14 @@ void sampleEmissiveSurfaces(vec3 P, vec3 N, vec3 throughput, vec3 view_dir, Mate const float plane_dist = dot(poly.plane, vec4(P, 1.f)); + ++prof_lights; + if (plane_dist < 0.) continue; + ++prof_samples; + + const time_t prof_sampling_begin = timeNow(); #ifdef PROJECTED const vec4 light_sample_dir = getPolygonLightSampleProjected(view_dir, ctx, poly); #elif defined(SOLID) @@ -244,6 +253,7 @@ void sampleEmissiveSurfaces(vec3 P, vec3 N, vec3 throughput, vec3 view_dir, Mate #else const vec4 light_sample_dir = getPolygonLightSampleSimple(P, view_dir, poly); #endif + prof_sampling += timeDelta(prof_sampling_begin, timeNow()); if (light_sample_dir.w <= 0.) continue; @@ -251,7 +261,12 @@ void sampleEmissiveSurfaces(vec3 P, vec3 N, vec3 throughput, vec3 view_dir, Mate const float dist = - plane_dist / dot(light_sample_dir.xyz, poly.plane.xyz); const vec3 emissive = poly.emissive; - if (!shadowed(P, light_sample_dir.xyz, dist)) { + const time_t prof_ray_begin = timeNow(); + const bool shadow = shadowed(P, light_sample_dir.xyz, dist); + prof_ray += timeDelta(prof_ray_begin, timeNow()); + ++prof_rays; + + if (!shadow) { //const float estimate = total_contrib; const float estimate = light_sample_dir.w; vec3 poly_diffuse = vec3(0.), poly_specular = vec3(0.); diff --git a/ref/vk/shaders/ray_light_direct.glsl b/ref/vk/shaders/ray_light_direct.glsl index 1c423f745..597b2655a 100644 --- a/ref/vk/shaders/ray_light_direct.glsl +++ b/ref/vk/shaders/ray_light_direct.glsl @@ -1,14 +1,6 @@ -#define PROF_USE_REALTIME -#ifdef PROF_USE_REALTIME -#extension GL_ARB_gpu_shader_int64: enable -#extension GL_EXT_shader_realtime_clock: enable -#else -#extension GL_ARB_shader_clock: enable -#endif - - #include "utils.glsl" #include "noise.glsl" +#include "time.glsl" #include "ray_kusochki.glsl" @@ -22,25 +14,8 @@ void readNormals(ivec2 uv, out vec3 geometry_normal, out vec3 shading_normal) { shading_normal = normalDecode(n.zw); } -#ifdef PROF_USE_REALTIME -// On mesa+amdgpu there's a clear gradient: pixels on top of screen take 2-3x longer to compute than bottom ones. Also, -// it does flicker a lot. -// Deltas are about 30000-100000 parrots -#define timeNow clockRealtime2x32EXT -#else -// clockARB doesn't give directly usable time values on mesa+amdgpu -// even deltas between them are not meaningful enough. -// On mesa+amdgpu clockARB() values are limited to lower 20bits, and they wrap around a lot. -// Absolute difference value are often 30-50% of the available range, so it's not that far off from wrapping around -// multiple times, rendering the value completely useless. -// Deltas are around 300000-500000 parrots. -// Other than that, the values seem uniform across the screen (as compared to realtime clock, which has a clearly -// visible gradient: top differences are larger than bottom ones. -#define timeNow clock2x32ARB -#endif - void main() { - const uvec2 time_begin = timeNow(); + const time_t time_begin = timeNow(); #ifdef RAY_TRACE const vec2 uv = (gl_LaunchIDEXT.xy + .5) / gl_LaunchSizeEXT.xy * 2. - 1.; @@ -79,19 +54,12 @@ void main() { vec3 diffuse = vec3(0.), specular = vec3(0.); computeLighting(pos + geometry_normal * .001, shading_normal, throughput, -direction, material, diffuse, specular); - const uvec2 time_end = timeNow(); + const time_t time_end = timeNow(); //const uint64_t time_diff = time_end - time_begin; //const uint time_diff = time_begin.x - time_end.x; -#ifdef PROF_USE_REALTIME - const uint64_t begin64 = time_begin.x | (uint64_t(time_begin.y) << 32); - const uint64_t end64 = time_end.x | (uint64_t(time_end.y) << 32); - const uint64_t time_diff = end64 - begin64; - const float time_diff_f = float(time_diff) / 1e5; -#else - const uint time_diff = time_begin.x - time_end.x; - const float time_diff_f = float(time_diff & 0xfffffu) / 1e6; -#endif + const uint time_diff = timeDelta(time_begin, time_end); + const float time_diff_f = float(time_diff) / 1e6; const uint prof_index = pix.x + pix.y * ubo.ubo.res.x; #if LIGHT_POINT @@ -106,5 +74,9 @@ void main() { imageStore(out_light_poly_specular, pix, vec4(specular, 0.f)); //imageStore(out_light_poly_profile, pix, profile); prof_direct_poly.a[prof_index].data[0] = uvec4(time_begin, time_end); + prof_direct_poly.a[prof_index].data[1] = uvec4(timeDelta(time_begin, time_end), prof_sampling, prof_ray, 0); + prof_direct_poly.a[prof_index].data[2] = uvec4(prof_lights, prof_samples, prof_rays, 0); + //prof_direct_poly.a[prof_index].data[2] = uvec4(prof_ray_begin, prof_ray_end); + //prof_direct_poly.a[prof_index].data[3] = uvec4(prof_light_count, 0, 0, 0); #endif } diff --git a/ref/vk/shaders/time.glsl b/ref/vk/shaders/time.glsl new file mode 100644 index 000000000..1430d5fbb --- /dev/null +++ b/ref/vk/shaders/time.glsl @@ -0,0 +1,53 @@ +#ifndef TIME_GLSL_INCLUDED +#define TIME_GLSL_INCLUDED + +//#define PROF_USE_REALTIME +#ifdef PROF_USE_REALTIME +#extension GL_ARB_gpu_shader_int64: enable +#extension GL_EXT_shader_realtime_clock: enable +#else +#extension GL_ARB_shader_clock: enable +#endif + +#define time_t uvec2 +#define T0 uvec2(0) +#define clockRealtime clockRealtime2x32EXT +#define clockShader clock2x32ARB + +#define clockRealtime64 clockRealtimeEXT +#define clockShader64 clockARB + +#ifdef PROF_USE_REALTIME +uint clockRealtimeDelta(time_t begin, time_t end) { + const uint64_t begin64 = begin.x | (uint64_t(begin.y) << 32); + const uint64_t end64 = end.x | (uint64_t(end.y) << 32); + const uint64_t time_diff = end64 - begin64; + return uint(time_diff); +} +#endif + +uint clockShaderDelta(time_t begin, time_t end) { + // AMD RNDA2 SHADER_CYCLES reg is limited to 20 bits + return (end.x - begin.x) & 0xfffffu; +} + +#ifdef PROF_USE_REALTIME +// On mesa+amdgpu there's a clear gradient: pixels on top of screen take 2-3x longer to compute than bottom ones. Also, +// it does flicker a lot. +// Deltas are about 30000-100000 parrots +#define timeNow clockRealtime +#define timeDelta clockRealtimeDelta +#else +// clockARB doesn't give directly usable time values on mesa+amdgpu +// even deltas between them are not meaningful enough. +// On AMD clockARB() values are limited to lower 20 bits (see RDNA 2 ISA SHADER_CYCLES reg), and they wrap around a lot. +// Absolute difference value are often 30-50% of the available range, so it's not that far off from wrapping around +// multiple times, rendering the value completely useless. +// Deltas are around 300000-500000 parrots. +// Other than that, the values seem uniform across the screen (as compared to realtime clock, which has a clearly +// visible gradient: top differences are larger than bottom ones. +#define timeNow clockShader +#define timeDelta clockShaderDelta +#endif + +#endif //ifndef TIME_GLSL_INCLUDED diff --git a/ref/vk/vk_rtx.c b/ref/vk/vk_rtx.c index c22e718b5..4895efbec 100644 --- a/ref/vk/vk_rtx.c +++ b/ref/vk/vk_rtx.c @@ -25,6 +25,7 @@ #include "xash3d_mathlib.h" #include +#include #define LOG_MODULE rt @@ -600,16 +601,107 @@ static void reloadMainpipe(void) { R_VkMeatpipeDestroy(newpipe); } +static int u32cmp(const void *l, const void *r) { + return *(const uint32_t*)l - *(const uint32_t*)r; +} + static void dumpProfilingData(int w, int h) { + uint64_t min = UINT64_MAX; + const struct ProfilingStruct *const prof = (const struct ProfilingStruct*)g_rtx.prof_direct_poly.mapped; + for (int i = 0; i < w * h; ++i) { + const struct ProfilingStruct *s = prof + i; + // TODO: time can wrap + const unsigned long long begin = s->data[0][0] | ((uint64_t)(s->data[0][1]) << 32); + min = Q_min(begin, min); + } + + int sampling_hist[20] = {0}; + int ray_hist[20] = {0}; + int total = 0; + + uint32_t *all_samples = Mem_Calloc(vk_core.pool, w * h * sizeof(uint32_t)); + uint32_t *all_rays = Mem_Calloc(vk_core.pool, w * h * sizeof(uint32_t)); + FILE *f = fopen("gpuprof.dump", "w"); for (int y = 0; y < h; ++y) { const struct ProfilingStruct *s = ((const struct ProfilingStruct*)(g_rtx.prof_direct_poly.mapped)) + y * g_rtx.max_frame_width; for (int x = 0; x < w; ++x, ++s) { - const unsigned long long begin = s->data[0][0] | ((uint64_t)(s->data[0][1]) << 32); - const unsigned long long end = s->data[0][2] | ((uint64_t)(s->data[0][3]) << 32); - fprintf(f, "x=%d y=%d %016llx %016llx %llu %llu d=%llu\n", x, y, begin, end, begin, end, end - begin); +#define TIME_READ(c, i) ((uint64_t)s->data[c][i*2+0] | ((uint64_t)(s->data[c][i*2+1]) << 32)) + const unsigned long long begin = TIME_READ(0, 0); + const unsigned long long end = TIME_READ(0, 1); + const uint32_t delta = end - begin; + + uint32_t shader_delta = s->data[1][0]; + const uint32_t sampling_delta = s->data[1][1]; + const uint32_t ray_delta = s->data[1][2]; + + const uint32_t lights = s->data[2][0]; + const uint32_t samples = s->data[2][1]; + const uint32_t rays = s->data[2][2]; + + while (shader_delta < sampling_delta || shader_delta < ray_delta) { + ERR("shader_delta wraparound detected, fixing"); + shader_delta += 0x100000u; + } + + /* if (shader_delta != delta) */ + /* ERR("Shader and local deltas don't match: d=%d", shader_delta - delta); */ + + if (!shader_delta) + continue; + + const float sampling_pct = sampling_delta * 100.f / shader_delta; + const float ray_pct = ray_delta * 100.f / shader_delta; + + fprintf(f, "total=%d lights=%d sampling=%d(%.02f%%; %d) ray=%d(%.02f%%; %d)\n", + shader_delta, lights, + sampling_delta, sampling_pct, samples, + ray_delta, ray_pct, rays); + + assert(sampling_pct < 100.); + sampling_hist[(int)(sampling_pct / 5.)]++; + + assert(ray_pct < 100.); + ray_hist[(int)(ray_pct / 5.)]++; + + all_rays[total] = ray_delta; + all_samples[total] = sampling_delta; + ++total; + + //fwrite(&begin, sizeof(begin), 1, f); + //fwrite(&end, sizeof(end), 1, f); + //fprintf(f, "x=%d y=%d %016llx %016llx %llu %llu d=%llu\n", x, y, begin, end, begin, end, end - begin); + //fprintf(f, "%016llx %016llx\n", begin, end); + //fprintf(f, "{\"name\":\"dpoly\",\"cat\":\"shader\",\"ph\":\"B\",\"ts\":%llu,\"tid\":%d}\n", (begin - min), tid); + //fprintf(f, "{\"name\":\"dpoly\",\"cat\":\"shader\",\"ph\":\"E\",\"ts\":%llu,\"tid\":%d}\n", (end - min), tid); } } + + for (int i = 0, s = 0, r = 0; i < 20; ++i) { + s += sampling_hist[i]; + r += ray_hist[i]; + fprintf(f, "%d-%d%%:\ts=%d(%d, %.02f%%)\tr=%d(%d, %.02f%%)\n", + i*5, i*5+5, + sampling_hist[i], s, s * 100.f / total, + ray_hist[i], r, r * 100.f / total); + } + + qsort(all_rays, total, sizeof(all_rays[0]), u32cmp); + qsort(all_samples, total, sizeof(all_samples[0]), u32cmp); + + fprintf(f, "percentiles:\n"); + static const int pcts[] = {99, 95, 90, 75, 50}; + for (int i = 0; i < COUNTOF(pcts); ++i) { + const int pct = pcts[i]; + const int index = total * pct / 100; + fprintf(f, "%d%%: ray=%d sampling=%d\n", pct, + all_rays[index], all_samples[index] + ); + } + + Mem_Free(all_rays); + Mem_Free(all_samples); + fclose(f); }