From 1937105ec37952047bb6f0baa4ae8f251c5abf6f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?John=20K=C3=A5re=20Alsaker?= <john.kare.alsaker@gmail.com>
Date: Wed, 11 Oct 2023 16:39:47 +0200
Subject: [PATCH] Get the `fine` CPU shader running

---
 shader/writeback.wgsl  |  25 +++
 src/cpu_dispatch.rs    |  25 ++-
 src/cpu_shader/fine.rs |  29 +++-
 src/cpu_shader/mod.rs  |   1 +
 src/engine.rs          |  22 ++-
 src/lib.rs             |  14 +-
 src/render.rs          |  25 ++-
 src/shaders.rs         |  33 ++--
 src/wgpu_engine.rs     | 386 +++++++++++++++++++++++++++++------------
 9 files changed, 415 insertions(+), 145 deletions(-)
 create mode 100644 shader/writeback.wgsl
diff --git a/shader/writeback.wgsl b/shader/writeback.wgsl
new file mode 100644
index 00000000..8a61bcf2
--- /dev/null
+++ b/shader/writeback.wgsl
@@ -0,0 +1,25 @@
+// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
+
+// Writes an array of packed RGBA8 pixels to a texture, one invocation per pixel.
+
+#import config
+
+@group(0) @binding(0)
+var<uniform> config: Config;
+
+@group(0) @binding(1)
+var<storage> source: array<u32>;
+
+@group(0) @binding(2)
+var output: texture_storage_2d<rgba8unorm, write>;
+
+@compute @workgroup_size(1)
+fn main(
+    @builtin(global_invocation_id) global_id: vec3<u32>,
+    @builtin(local_invocation_id) local_id: vec3<u32>,
+    @builtin(workgroup_id) wg_id: vec3<u32>,
+) {
+    let row = global_id.y * config.target_width;
+    let pixel = source[row + global_id.x];
+    textureStore(output, vec2<i32>(global_id.xy), unpack4x8unorm(pixel));
+}
diff --git a/src/cpu_dispatch.rs b/src/cpu_dispatch.rs
index 01c28c31..44b776ff 100644
--- a/src/cpu_dispatch.rs
+++ b/src/cpu_dispatch.rs
@@ -10,12 +10,14 @@ use std::{
 
 use bytemuck::Pod;
 
+use crate::ImageProxy;
+
 #[derive(Clone, Copy)]
 pub enum CpuBinding<'a> {
     Buffer(&'a [u8]),
     BufferRW(&'a RefCell<Vec<u8>>),
-    #[allow(unused)]
-    Texture(&'a CpuTexture),
+    Texture(&'a [u8]),
+    TextureRW(&'a RefCell<CpuTexture>),
 }
 
 pub enum TypedBufGuard<'a, T: ?Sized> {
@@ -109,7 +111,14 @@ impl<'a> CpuBinding<'a> {
     #[allow(unused)]
     pub fn as_tex(&self) -> &CpuTexture {
         match self {
-            CpuBinding::Texture(t) => t,
+            CpuBinding::Texture(t) => todo!(),
+            _ => panic!("resource type mismatch"),
+        }
+    }
+
+    pub fn as_tex_mut(&self) -> RefMut<CpuTexture> {
+        match self {
+            CpuBinding::TextureRW(t) => t.borrow_mut(),
             _ => panic!("resource type mismatch"),
         }
     }
@@ -122,3 +131,13 @@ pub struct CpuTexture {
     // In RGBA format. May expand in the future.
     pub pixels: Vec<u32>,
 }
+
+impl CpuTexture {
+    pub fn new(img: &ImageProxy) -> Self {
+        CpuTexture {
+            width: img.width as usize,
+            height: img.height as usize,
+            pixels: vec![0; img.width as usize * img.height as usize],
+        }
+    }
+}
diff --git a/src/cpu_shader/fine.rs b/src/cpu_shader/fine.rs
index c64c8762..8267e5ea 100644
--- a/src/cpu_shader/fine.rs
+++ b/src/cpu_shader/fine.rs
@@ -1,9 +1,9 @@
 // Copyright 2023 The Vello authors
 // SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
 
-use vello_encoding::{ConfigUniform, PathSegment, Tile};
+use vello_encoding::{ConfigUniform, PathSegment};
 
-use crate::cpu_dispatch::CpuTexture;
+use crate::cpu_dispatch::{CpuBinding, CpuTexture};
 
 use super::{CMD_COLOR, CMD_END, CMD_FILL, CMD_JUMP, CMD_SOLID, PTCL_INITIAL_ALLOC};
 
@@ -111,7 +111,6 @@ fn fill_path(area: &mut [f32], segments: &[PathSegment], fill: &CmdFill, x_tile:
 #[allow(unused)]
 fn fine_main(
     config: &ConfigUniform,
-    tiles: &[Tile],
     segments: &[PathSegment],
     output: &mut CpuTexture,
     ptcl: &[u32],
@@ -154,7 +153,7 @@ fn fine_main(
                     for a in &mut area {
                         *a = 1.0;
                     }
-                    cmd_ix += 2;
+                    cmd_ix += 1;
                 }
                 CMD_COLOR => {
                     let color = read_color(ptcl, cmd_ix);
@@ -177,12 +176,28 @@ fn fine_main(
         }
         // Write tile (in rgba)
         for y in 0..TILE_HEIGHT {
-            let base =
-                output.width * (tile_y as usize * TILE_HEIGHT + y) + tile_x as usize * TILE_WIDTH;
+            let base = config.target_width as usize * (tile_y as usize * TILE_HEIGHT + y)
+                + tile_x as usize * TILE_WIDTH;
             for x in 0..TILE_WIDTH {
                 let rgba32 = pack4x8unorm(rgba[y * TILE_WIDTH + x]);
-                output.pixels[base + x] = rgba32;
+                // TODO: Fix out of bounds; for now, writes past the end of `pixels` are dropped.
+                //output.pixels[base + x] = rgba32;
+                if let Some(p) = output.pixels.get_mut(base + x) {
+                    *p = rgba32;
+                }
             }
         }
     }
 }
+
+pub fn fine(_n_wg: u32, resources: &[CpuBinding]) {
+    let config = resources[0].as_typed();
+    let segments = resources[1].as_slice();
+    let ptcl = resources[2].as_slice();
+    let info = resources[3].as_slice();
+    let mut output = resources[4].as_tex_mut();
+    //let gradients = resources[4].as_tex(); // NOTE(review): slot 4 is `output` above; stale index?
+    //let image_atlas = resources[5].as_tex();
+
+    fine_main(&config, &segments, &mut output, &ptcl, &info);
+}
diff --git a/src/cpu_shader/mod.rs b/src/cpu_shader/mod.rs
index 16d261f6..9bb19946 100644
--- a/src/cpu_shader/mod.rs
+++ b/src/cpu_shader/mod.rs
@@ -34,6 +34,7 @@ pub use clip_reduce::clip_reduce;
 pub use coarse::coarse;
 pub use draw_leaf::draw_leaf;
 pub use draw_reduce::draw_reduce;
+pub use fine::fine;
 pub use flatten::flatten;
 pub use path_count::path_count;
 pub use path_count_setup::path_count_setup;
diff --git a/src/engine.rs b/src/engine.rs
index a122e353..d108c8dd 100644
--- a/src/engine.rs
+++ b/src/engine.rs
@@ -48,12 +48,20 @@ pub enum ImageFormat {
     Bgra8,
 }
 
+#[derive(Clone, Copy, PartialEq)]
+pub enum ImageAccess {
+    Read,
+    Full,
+    WriteOnce,
+}
+
 #[derive(Clone, Copy)]
 pub struct ImageProxy {
     pub width: u32,
     pub height: u32,
     pub format: ImageFormat,
     pub id: Id,
+    pub access: ImageAccess,
 }
 
 #[derive(Clone, Copy)]
@@ -73,6 +81,11 @@ pub enum Command {
     Dispatch(ShaderId, (u32, u32, u32), Vec<ResourceProxy>),
     DispatchIndirect(ShaderId, BufProxy, u64, Vec<ResourceProxy>),
     Download(BufProxy),
+    Writeback {
+        image: ImageProxy,
+        shader: ShaderId,
+        config: ResourceProxy,
+    },
     Clear(BufProxy, u64, Option<NonZeroU64>),
     FreeBuf(BufProxy),
     FreeImage(ImageProxy),
@@ -121,7 +134,7 @@ impl Recording {
         data: impl Into<Vec<u8>>,
     ) -> ImageProxy {
         let data = data.into();
-        let image_proxy = ImageProxy::new(width, height, format);
+        let image_proxy = ImageProxy::new(width, height, format, ImageAccess::Read);
         self.push(Command::UploadImage(image_proxy, data));
         image_proxy
     }
@@ -219,13 +232,14 @@ impl ImageFormat {
 }
 
 impl ImageProxy {
-    pub fn new(width: u32, height: u32, format: ImageFormat) -> Self {
+    pub fn new(width: u32, height: u32, format: ImageFormat, access: ImageAccess) -> Self {
         let id = Id::next();
         ImageProxy {
             width,
             height,
             format,
             id,
+            access,
         }
     }
 }
@@ -235,8 +249,8 @@ impl ResourceProxy {
         Self::Buf(BufProxy::new(size, name))
     }
 
-    pub fn new_image(width: u32, height: u32, format: ImageFormat) -> Self {
-        Self::Image(ImageProxy::new(width, height, format))
+    pub fn new_image(width: u32, height: u32, format: ImageFormat, access: ImageAccess) -> Self {
+        Self::Image(ImageProxy::new(width, height, format, access))
     }
 
     pub fn as_buf(&self) -> Option<&BufProxy> {
diff --git a/src/lib.rs b/src/lib.rs
index 053f30af..2f14124b 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -47,7 +47,7 @@ pub use engine::{
 };
 pub use shaders::FullShaders;
 #[cfg(feature = "wgpu")]
-use wgpu_engine::{ExternalResource, WgpuEngine};
+use wgpu_engine::{ExternalResource, TransientBindMap, WgpuEngine};
 
 /// Temporary export, used in with_winit for stats
 pub use vello_encoding::BumpAllocators;
@@ -73,7 +73,7 @@ enum AaConfig {
 
 /// Configuration of antialiasing. Currently this is static, but could be switched to
 /// a launch option or even finer-grained.
-const ANTIALIASING: AaConfig = AaConfig::Msaa16;
+const ANTIALIASING: AaConfig = AaConfig::Area;
 
 /// Renders a scene into a texture or surface.
 #[cfg(feature = "wgpu")]
@@ -154,12 +154,13 @@ impl Renderer {
             *target.as_image().unwrap(),
             texture,
         )];
+        let mut transient_map = TransientBindMap::new(&external_resources);
         self.engine.run_recording(
             device,
             queue,
             &recording,
-            &external_resources,
             "render_to_texture",
+            &mut transient_map,
             #[cfg(feature = "wgpu-profiler")]
             &mut self.profiler,
         )?;
@@ -274,12 +275,14 @@ impl Renderer {
         let recording = render.render_encoding_coarse(encoding, &self.shaders, params, robust);
         let target = render.out_image();
         let bump_buf = render.bump_buf();
+        let external_resources = [ExternalResource::Image(target, texture)];
+        let mut transient_map = TransientBindMap::new(&external_resources);
         self.engine.run_recording(
             device,
             queue,
             &recording,
-            &[],
             "t_async_coarse",
+            &mut transient_map,
             #[cfg(feature = "wgpu-profiler")]
             &mut self.profiler,
         )?;
@@ -303,13 +306,12 @@ impl Renderer {
         // Maybe clear to reuse allocation?
         let mut recording = Recording::default();
         render.record_fine(&self.shaders, &mut recording);
-        let external_resources = [ExternalResource::Image(target, texture)];
         self.engine.run_recording(
             device,
             queue,
             &recording,
-            &external_resources,
             "t_async_fine",
+            &mut transient_map,
             #[cfg(feature = "wgpu-profiler")]
             &mut self.profiler,
         )?;
diff --git a/src/render.rs b/src/render.rs
index 0bb65795..f299a227 100644
--- a/src/render.rs
+++ b/src/render.rs
@@ -1,7 +1,7 @@
 //! Take an encoded scene and create a graph to render it
 
 use crate::{
-    engine::{BufProxy, ImageFormat, ImageProxy, Recording, ResourceProxy},
+    engine::{BufProxy, ImageAccess, ImageFormat, ImageProxy, Recording, ResourceProxy},
     shaders::FullShaders,
     AaConfig, RenderParams, Scene, ANTIALIASING,
 };
@@ -85,7 +85,7 @@ impl Render {
         let mut packed = vec![];
         let (layout, ramps, images) = resolver.resolve(encoding, &mut packed);
         let gradient_image = if ramps.height == 0 {
-            ResourceProxy::new_image(1, 1, ImageFormat::Rgba8)
+            ResourceProxy::new_image(1, 1, ImageFormat::Rgba8, ImageAccess::Full)
         } else {
             let data: &[u8] = bytemuck::cast_slice(ramps.data);
             ResourceProxy::Image(recording.upload_image(
@@ -96,9 +96,14 @@ impl Render {
             ))
         };
         let image_atlas = if images.images.is_empty() {
-            ImageProxy::new(1, 1, ImageFormat::Rgba8)
+            ImageProxy::new(1, 1, ImageFormat::Rgba8, ImageAccess::Full)
         } else {
-            ImageProxy::new(images.width, images.height, ImageFormat::Rgba8)
+            ImageProxy::new(
+                images.width,
+                images.height,
+                ImageFormat::Rgba8,
+                ImageAccess::Full,
+            )
         };
         for image in images.images {
             recording.write_image(
@@ -390,7 +395,12 @@ impl Render {
         recording.free_resource(draw_monoid_buf);
         recording.free_resource(bin_header_buf);
         recording.free_resource(path_buf);
-        let out_image = ImageProxy::new(params.width, params.height, ImageFormat::Rgba8);
+        let out_image = ImageProxy::new(
+            params.width,
+            params.height,
+            ImageFormat::Rgba8,
+            ImageAccess::WriteOnce,
+        );
         self.fine_wg_count = Some(wg_counts.fine);
         self.fine_resources = Some(FineResources {
             config_buf,
@@ -456,6 +466,11 @@ impl Render {
                 );
             }
         }
+        recording.push(crate::Command::Writeback {
+            image: fine.out_image,
+            config: fine.config_buf,
+            shader: shaders.writeback,
+        });
         recording.free_resource(fine.config_buf);
         recording.free_resource(fine.tile_buf);
         recording.free_resource(fine.segments_buf);
diff --git a/src/shaders.rs b/src/shaders.rs
index de23e077..9d1777d5 100644
--- a/src/shaders.rs
+++ b/src/shaders.rs
@@ -18,19 +18,18 @@
 
 mod preprocess;
 
-use std::collections::HashSet;
-
+use crate::engine::ShaderId;
 #[cfg(feature = "wgpu")]
-use wgpu::Device;
-
-use crate::{
-    cpu_shader,
-    engine::{BindType, Error, ImageFormat, ShaderId},
+use {
+    crate::wgpu_engine::WgpuEngine,
+    crate::{
+        cpu_shader,
+        engine::{BindType, Error, ImageFormat},
+    },
+    std::collections::HashSet,
+    wgpu::Device,
 };
 
-#[cfg(feature = "wgpu")]
-use crate::wgpu_engine::WgpuEngine;
-
 macro_rules! shader {
     ($name:expr) => {&{
         let shader = include_str!(concat!(
@@ -79,6 +78,7 @@ pub struct FullShaders {
     pub path_tiling_setup: ShaderId,
     pub path_tiling: ShaderId,
     pub fine: ShaderId,
+    pub writeback: ShaderId,
     // 2-level dispatch works for CPU pathtag scan even for large
     // inputs, 3-level is not yet implemented.
     pub pathtag_is_cpu: bool,
@@ -114,12 +114,16 @@ pub fn full_shaders(device: &Device, engine: &mut WgpuEngine) -> Result<FullShad
 
     let mut force_gpu = false;
 
+    #[allow(unused_variables)]
     let force_gpu_from: Option<&str> = None;
 
     // Uncomment this to force use of GPU shaders from the specified shader and later even
     // if `engine.use_cpu` is specified.
     //let force_gpu_from = Some("binning");
 
+    // Use the GPU for the fine shader for now as the CPU shader is incomplete.
+    let force_gpu_from = Some("fine");
+
     macro_rules! add_shader {
         ($name:ident, $bindings:expr, $defines:expr, $cpu:expr) => {{
             if force_gpu_from == Some(stringify!($name)) {
@@ -282,7 +286,7 @@ pub fn full_shaders(device: &Device, engine: &mut WgpuEngine) -> Result<FullShad
                 ImageRead(ImageFormat::Rgba8),
             ],
             &full_config,
-            CpuShaderType::Missing
+            CpuShaderType::Present(cpu_shader::fine)
         ),
         _ => add_shader!(
             fine,
@@ -300,6 +304,12 @@ pub fn full_shaders(device: &Device, engine: &mut WgpuEngine) -> Result<FullShad
             CpuShaderType::Missing
         ),
     };
+    let writeback = add_shader!(
+        writeback,
+        [Uniform, BufReadOnly, Image(ImageFormat::Rgba8)],
+        &empty,
+        CpuShaderType::Missing
+    );
     Ok(FullShaders {
         pathtag_reduce,
         pathtag_reduce2,
@@ -321,6 +331,7 @@ pub fn full_shaders(device: &Device, engine: &mut WgpuEngine) -> Result<FullShad
         path_tiling_setup,
         path_tiling,
         fine,
+        writeback,
         pathtag_is_cpu: engine.use_cpu,
     })
 }
diff --git a/src/wgpu_engine.rs b/src/wgpu_engine.rs
index 10153521..1f6fca7a 100644
--- a/src/wgpu_engine.rs
+++ b/src/wgpu_engine.rs
@@ -14,8 +14,8 @@ use wgpu::{
 };
 
 use crate::{
-    cpu_dispatch::CpuBinding,
-    engine::{BindType, Error},
+    cpu_dispatch::{CpuBinding, CpuTexture},
+    engine::{BindType, Error, ImageAccess},
     BufProxy, Command, Id, ImageProxy, Recording, ResourceProxy, ShaderId,
 };
 
@@ -85,10 +85,16 @@ struct BindMapBuffer {
     label: &'static str,
 }
 
+/// A texture can exist either on the GPU or on CPU.
+enum MaterializedImage {
+    Gpu((Texture, TextureView)),
+    Cpu(RefCell<CpuTexture>),
+}
+
 #[derive(Default)]
 struct BindMap {
     buf_map: HashMap<Id, BindMapBuffer>,
-    image_map: HashMap<Id, (Texture, TextureView)>,
+    image_map: HashMap<Id, MaterializedImage>,
     pending_clears: HashSet<Id>,
 }
 
@@ -111,10 +117,9 @@ struct ResourcePool {
 /// `run_recording()`, including external resources and also buffer
 /// uploads.
 #[derive(Default)]
-struct TransientBindMap<'a> {
+pub struct TransientBindMap<'a> {
     bufs: HashMap<Id, TransientBuf<'a>>,
-    // TODO: create transient image type
-    images: HashMap<Id, &'a TextureView>,
+    images: HashMap<Id, TransientImage<'a>>,
 }
 
 enum TransientBuf<'a> {
@@ -122,6 +127,15 @@ enum TransientBuf<'a> {
     Gpu(&'a Buffer),
 }
 
+enum TransientImage<'a> {
+    Cpu(&'a [u8]),
+    CpuOwned {
+        data: RefCell<CpuTexture>,
+        view: &'a TextureView,
+    },
+    Gpu(&'a TextureView),
+}
+
 impl WgpuEngine {
     pub fn new(use_cpu: bool) -> WgpuEngine {
         Self {
@@ -251,18 +265,17 @@ impl WgpuEngine {
         })
     }
 
-    pub fn run_recording(
+    pub fn run_recording<'a>(
         &mut self,
         device: &Device,
         queue: &Queue,
-        recording: &Recording,
-        external_resources: &[ExternalResource],
+        recording: &'a Recording,
         label: &'static str,
+        transient_map: &mut TransientBindMap<'a>,
         #[cfg(feature = "wgpu-profiler")] profiler: &mut wgpu_profiler::GpuProfiler,
     ) -> Result<(), Error> {
         let mut free_bufs: HashSet<Id> = Default::default();
         let mut free_images: HashSet<Id> = Default::default();
-        let mut transient_map = TransientBindMap::new(external_resources);
 
         let mut encoder =
             device.create_command_encoder(&CommandEncoderDescriptor { label: Some(label) });
@@ -297,6 +310,9 @@ impl WgpuEngine {
                     self.bind_map.insert_buf(buf_proxy, buf);
                 }
                 Command::UploadImage(image_proxy, bytes) => {
+                    transient_map
+                        .images
+                        .insert(image_proxy.id, TransientImage::Cpu(bytes));
                     let format = image_proxy.format.to_wgpu();
                     let block_size = format
                         .block_size(None)
@@ -347,31 +363,95 @@ impl WgpuEngine {
                     self.bind_map
                         .insert_image(image_proxy.id, texture, texture_view)
                 }
-                Command::WriteImage(proxy, [x, y, width, height], data) => {
-                    if let Ok((texture, _)) = self.bind_map.get_or_create_image(*proxy, device) {
-                        let format = proxy.format.to_wgpu();
-                        let block_size = format
-                            .block_size(None)
-                            .expect("ImageFormat must have a valid block size");
-                        queue.write_texture(
-                            wgpu::ImageCopyTexture {
-                                texture,
-                                mip_level: 0,
-                                origin: wgpu::Origin3d { x: *x, y: *y, z: 0 },
-                                aspect: TextureAspect::All,
-                            },
-                            &data[..],
-                            wgpu::ImageDataLayout {
-                                offset: 0,
-                                bytes_per_row: Some(*width * block_size),
-                                rows_per_image: None,
+                Command::Writeback {
+                    image,
+                    shader,
+                    config,
+                } => {
+                    transient_map.prepare_proxy(
+                        &mut self.bind_map,
+                        &mut self.pool,
+                        device,
+                        queue,
+                        &mut encoder,
+                        config,
+                    );
+
+                    if let Some(TransientImage::CpuOwned { data, view }) =
+                        transient_map.images.get(&image.id)
+                    {
+                        let data = &*data.borrow();
+
+                        let wgpu_shader =
+                            if let ShaderKind::Wgpu(shader) = self.shaders[shader.0].select() {
+                                shader
+                            } else {
+                                panic!("expected GPU shader")
+                            };
+                        let buf = device.create_buffer(&wgpu::BufferDescriptor {
+                            label: None,
+                            size: data.pixels.len() as u64 * 4,
+                            usage: BufferUsages::STORAGE | BufferUsages::COPY_DST,
+                            mapped_at_creation: false,
+                        });
+                        queue.write_buffer(&buf, 0, bytemuck::cast_slice(&data.pixels));
+
+                        let entries = [
+                            transient_map.create_bind_entry(&self.bind_map, 0, config),
+                            wgpu::BindGroupEntry {
+                                binding: 1,
+                                resource: buf.as_entire_binding(),
                             },
-                            wgpu::Extent3d {
-                                width: *width,
-                                height: *height,
-                                depth_or_array_layers: 1,
+                            wgpu::BindGroupEntry {
+                                binding: 2,
+                                resource: wgpu::BindingResource::TextureView(view),
                             },
-                        );
+                        ];
+                        let bind_group = device.create_bind_group(&wgpu::BindGroupDescriptor {
+                            label: None,
+                            layout: &wgpu_shader.bind_group_layout,
+                            entries: &entries,
+                        });
+                        let mut cpass = encoder.begin_compute_pass(&Default::default());
+                        #[cfg(feature = "wgpu-profiler")]
+                        profiler.begin_scope("writeback", &mut cpass, device);
+                        cpass.set_pipeline(&wgpu_shader.pipeline);
+                        cpass.set_bind_group(0, &bind_group, &[]);
+                        cpass.dispatch_workgroups(data.width as u32, data.height as u32, 1);
+                        #[cfg(feature = "wgpu-profiler")]
+                        profiler.end_scope(&mut cpass);
+                    }
+                }
+                Command::WriteImage(proxy, [x, y, width, height], data) => {
+                    if let Ok(image) = self.bind_map.get_or_create_image(*proxy, device) {
+                        match image {
+                            MaterializedImage::Gpu((texture, _)) => {
+                                let format = proxy.format.to_wgpu();
+                                let block_size = format
+                                    .block_size(None)
+                                    .expect("ImageFormat must have a valid block size");
+                                queue.write_texture(
+                                    wgpu::ImageCopyTexture {
+                                        texture,
+                                        mip_level: 0,
+                                        origin: wgpu::Origin3d { x: *x, y: *y, z: 0 },
+                                        aspect: TextureAspect::All,
+                                    },
+                                    &data[..],
+                                    wgpu::ImageDataLayout {
+                                        offset: 0,
+                                        bytes_per_row: Some(*width * block_size),
+                                        rows_per_image: None,
+                                    },
+                                    wgpu::Extent3d {
+                                        width: *width,
+                                        height: *height,
+                                        depth_or_array_layers: 1,
+                                    },
+                                );
+                            }
+                            MaterializedImage::Cpu(..) => todo!(),
+                        }
                     }
                 }
                 Command::Dispatch(shader_id, wg_size, bindings) => {
@@ -512,10 +592,9 @@ impl WgpuEngine {
             }
         }
         for id in free_images {
-            if let Some((texture, view)) = self.bind_map.image_map.remove(&id) {
+            if let Some(image) = self.bind_map.image_map.remove(&id) {
                 // TODO: have a pool to avoid needless re-allocation
-                drop(texture);
-                drop(view);
+                drop(image);
             }
         }
         Ok(())
@@ -549,6 +628,14 @@ impl BindMap {
         })
     }
 
+    /// Get an image, only if it's on GPU.
+    fn get_gpu_image(&self, id: Id) -> Option<&(Texture, TextureView)> {
+        self.image_map.get(&id).and_then(|b| match &b {
+            MaterializedImage::Gpu(b) => Some(b),
+            _ => None,
+        })
+    }
+
     /// Get a CPU buffer.
     ///
     /// Panics if buffer is not present or is on GPU.
@@ -559,6 +646,16 @@ impl BindMap {
         }
     }
 
+    /// Get a CPU image.
+    ///
+    /// Panics if image is not present or is on GPU.
+    fn get_cpu_image(&self, id: Id) -> CpuBinding {
+        match &self.image_map[&id] {
+            MaterializedImage::Cpu(b) => CpuBinding::TextureRW(b),
+            _ => panic!("getting cpu image, but it's on gpu"),
+        }
+    }
+
     fn materialize_cpu_buf(&mut self, buf: &BufProxy) {
         self.buf_map.entry(buf.id).or_insert_with(|| {
             let buffer = MaterializedBuffer::Cpu(RefCell::new(vec![0; buf.size as usize]));
@@ -570,8 +667,15 @@ impl BindMap {
         });
     }
 
+    fn materialize_cpu_image(&mut self, img: &ImageProxy) {
+        self.image_map
+            .entry(img.id)
+            .or_insert_with(|| MaterializedImage::Cpu(RefCell::new(CpuTexture::new(img))));
+    }
+
     fn insert_image(&mut self, id: Id, image: Texture, image_view: TextureView) {
-        self.image_map.insert(id, (image, image_view));
+        self.image_map
+            .insert(id, MaterializedImage::Gpu((image, image_view)));
     }
 
     fn get_buf(&mut self, proxy: BufProxy) -> Option<&BindMapBuffer> {
@@ -582,7 +686,7 @@ impl BindMap {
         &mut self,
         proxy: ImageProxy,
         device: &Device,
-    ) -> Result<&(Texture, TextureView), Error> {
+    ) -> Result<&MaterializedImage, Error> {
         match self.image_map.entry(proxy.id) {
             Entry::Occupied(occupied) => Ok(occupied.into_mut()),
             Entry::Vacant(vacant) => {
@@ -611,7 +715,7 @@ impl BindMap {
                     array_layer_count: None,
                     format: Some(proxy.format.to_wgpu()),
                 });
-                Ok(vacant.insert((texture, texture_view)))
+                Ok(vacant.insert(MaterializedImage::Gpu((texture, texture_view))))
             }
         }
     }
@@ -688,9 +792,28 @@ impl BindMapBuffer {
     }
 }
 
+impl MaterializedImage {
+    // Upload an image from CPU to GPU if needed.
+    //
+    // Note data flow is one way only, from CPU to GPU. Once this method is
+    // called, the image is no longer materialized on CPU, and cannot be
+    // accessed from a CPU shader.
+    fn upload_if_needed(
+        &mut self,
+        _proxy: &ImageProxy,
+        _device: &Device,
+        _queue: &Queue,
+        _pool: &mut ResourcePool,
+    ) {
+        if let MaterializedImage::Cpu(_cpu_buf) = &self {
+            todo!()
+        }
+    }
+}
+
 impl<'a> TransientBindMap<'a> {
     /// Create new transient bind map, seeded from external resources
-    fn new(external_resources: &'a [ExternalResource]) -> Self {
+    pub fn new(external_resources: &'a [ExternalResource]) -> Self {
         let mut bufs = HashMap::default();
         let mut images = HashMap::default();
         for resource in external_resources {
@@ -698,8 +821,8 @@ impl<'a> TransientBindMap<'a> {
                 ExternalResource::Buf(proxy, gpu_buf) => {
                     bufs.insert(proxy.id, TransientBuf::Gpu(gpu_buf));
                 }
-                ExternalResource::Image(proxy, gpu_image) => {
-                    images.insert(proxy.id, *gpu_image);
+                ExternalResource::Image(proxy, view) => {
+                    images.insert(proxy.id, TransientImage::Gpu(view));
                 }
             }
         }
@@ -721,49 +844,47 @@ impl<'a> TransientBindMap<'a> {
         }
     }
 
-    #[allow(clippy::too_many_arguments)]
-    fn create_bind_group(
+    fn prepare_proxy(
         &mut self,
         bind_map: &mut BindMap,
         pool: &mut ResourcePool,
         device: &Device,
         queue: &Queue,
         encoder: &mut CommandEncoder,
-        layout: &BindGroupLayout,
-        bindings: &[ResourceProxy],
-    ) -> Result<BindGroup, Error> {
-        for proxy in bindings {
-            match proxy {
-                ResourceProxy::Buf(proxy) => {
-                    if self.bufs.contains_key(&proxy.id) {
-                        continue;
-                    }
-                    match bind_map.buf_map.entry(proxy.id) {
-                        Entry::Vacant(v) => {
-                            // TODO: only some buffers will need indirect, but does it hurt?
-                            let usage = BufferUsages::COPY_SRC
-                                | BufferUsages::COPY_DST
-                                | BufferUsages::STORAGE
-                                | BufferUsages::INDIRECT;
-                            let buf = pool.get_buf(proxy.size, proxy.name, usage, device);
-                            if bind_map.pending_clears.remove(&proxy.id) {
-                                encoder.clear_buffer(&buf, 0, None);
-                            }
-                            v.insert(BindMapBuffer {
-                                buffer: MaterializedBuffer::Gpu(buf),
-                                label: proxy.name,
-                            });
-                        }
-                        Entry::Occupied(mut o) => {
-                            o.get_mut().upload_if_needed(proxy, device, queue, pool)
+        proxy: &ResourceProxy,
+    ) {
+        match proxy {
+            ResourceProxy::Buf(proxy) => {
+                if self.bufs.contains_key(&proxy.id) {
+                    return;
+                }
+                match bind_map.buf_map.entry(proxy.id) {
+                    Entry::Vacant(v) => {
+                        // TODO: only some buffers will need indirect, but does it hurt?
+                        let usage = BufferUsages::COPY_SRC
+                            | BufferUsages::COPY_DST
+                            | BufferUsages::STORAGE
+                            | BufferUsages::INDIRECT;
+                        let buf = pool.get_buf(proxy.size, proxy.name, usage, device);
+                        if bind_map.pending_clears.remove(&proxy.id) {
+                            encoder.clear_buffer(&buf, 0, None);
                         }
+                        v.insert(BindMapBuffer {
+                            buffer: MaterializedBuffer::Gpu(buf),
+                            label: proxy.name,
+                        });
                     }
-                }
-                ResourceProxy::Image(proxy) => {
-                    if self.images.contains_key(&proxy.id) {
-                        continue;
+                    Entry::Occupied(mut o) => {
+                        o.get_mut().upload_if_needed(proxy, device, queue, pool)
                     }
-                    if let Entry::Vacant(v) = bind_map.image_map.entry(proxy.id) {
+                }
+            }
+            ResourceProxy::Image(proxy) => {
+                if self.images.contains_key(&proxy.id) {
+                    return;
+                }
+                match bind_map.image_map.entry(proxy.id) {
+                    Entry::Vacant(v) => {
                         let format = proxy.format.to_wgpu();
                         let texture = device.create_texture(&wgpu::TextureDescriptor {
                             label: None,
@@ -789,39 +910,67 @@ impl<'a> TransientBindMap<'a> {
                             array_layer_count: None,
                             format: Some(proxy.format.to_wgpu()),
                         });
-                        v.insert((texture, texture_view));
+                        v.insert(MaterializedImage::Gpu((texture, texture_view)));
+                    }
+                    Entry::Occupied(mut o) => {
+                        o.get_mut().upload_if_needed(proxy, device, queue, pool)
                     }
                 }
             }
         }
-        let entries = bindings
-            .iter()
-            .enumerate()
-            .map(|(i, proxy)| match proxy {
-                ResourceProxy::Buf(proxy) => {
-                    let buf = match self.bufs.get(&proxy.id) {
-                        Some(TransientBuf::Gpu(b)) => b,
-                        _ => bind_map.get_gpu_buf(proxy.id).unwrap(),
-                    };
-                    Ok(wgpu::BindGroupEntry {
-                        binding: i as u32,
-                        resource: buf.as_entire_binding(),
-                    })
+    }
+
+    fn create_bind_entry<'b>( // Builds the bind group entry for `proxy` at slot `binding`.
+        &'b self,
+        bind_map: &'b BindMap,
+        binding: u32,
+        proxy: &ResourceProxy,
+    ) -> wgpu::BindGroupEntry<'b> {
+        match proxy {
+            ResourceProxy::Buf(proxy) => {
+                let buf = match self.bufs.get(&proxy.id) {
+                    Some(TransientBuf::Gpu(b)) => b, // transient GPU buffer is used directly
+                    _ => bind_map.get_gpu_buf(proxy.id).unwrap(), // panics if lookup fails
+                };
+                wgpu::BindGroupEntry {
+                    binding,
+                    resource: buf.as_entire_binding(),
                 }
-                ResourceProxy::Image(proxy) => {
-                    let view = self
-                        .images
-                        .get(&proxy.id)
-                        .copied()
-                        .or_else(|| bind_map.image_map.get(&proxy.id).map(|v| &v.1))
-                        .unwrap();
-                    Ok(wgpu::BindGroupEntry {
-                        binding: i as u32,
-                        resource: wgpu::BindingResource::TextureView(view),
-                    })
+            }
+            ResourceProxy::Image(proxy) => {
+                let view = match self.images.get(&proxy.id) {
+                    Some(TransientImage::Gpu(view)) => *view,
+                    _ => &bind_map.get_gpu_image(proxy.id).unwrap().1, // field 1 is the view
+                };
+                wgpu::BindGroupEntry {
+                    binding,
+                    resource: wgpu::BindingResource::TextureView(view),
                 }
-            })
-            .collect::<Result<Vec<_>, Error>>()?;
+            }
+        }
+    }
+
+    #[allow(clippy::too_many_arguments)] // most arguments are forwarded to prepare_proxy
+    fn create_bind_group(
+        &mut self,
+        bind_map: &mut BindMap,
+        pool: &mut ResourcePool,
+        device: &Device,
+        queue: &Queue,
+        encoder: &mut CommandEncoder,
+        layout: &BindGroupLayout,
+        bindings: &[ResourceProxy],
+    ) -> Result<BindGroup, Error> {
+        for proxy in bindings { // prepare every resource before building any entry
+            self.prepare_proxy(bind_map, pool, device, queue, encoder, proxy);
+        }
+
+        let bind_map = &mut *bind_map; // reborrow, used by the closure below
+        let entries: Vec<_> = bindings
+            .iter()
+            .enumerate()
+            .map(|(i, proxy)| self.create_bind_entry(bind_map, i as u32, proxy)) // slot = index
+            .collect();
         let bind_group = device.create_bind_group(&wgpu::BindGroupDescriptor {
             label: None,
             layout,
@@ -830,11 +979,11 @@ impl<'a> TransientBindMap<'a> {
         Ok(bind_group)
     }
 
-    fn create_cpu_resources(
-        &self,
-        bind_map: &'a mut BindMap,
+    fn create_cpu_resources<'b>( // Materializes CPU resources and returns their bindings.
+        &'b mut self,
+        bind_map: &'b mut BindMap,
         bindings: &[ResourceProxy],
-    ) -> Vec<CpuBinding> {
+    ) -> Vec<CpuBinding<'b>> {
         // First pass is mutable; create buffers as needed
         for resource in bindings {
             match resource {
@@ -843,7 +992,22 @@ impl<'a> TransientBindMap<'a> {
                     Some(TransientBuf::Gpu(_)) => panic!("buffer was already materialized on GPU"),
                     _ => bind_map.materialize_cpu_buf(buf),
                 },
-                ResourceProxy::Image(_) => todo!(),
+                ResourceProxy::Image(img) => match self.images.get(&img.id) {
+                    Some(TransientImage::Cpu(..) | TransientImage::CpuOwned { .. }) => (),
+                    Some(TransientImage::Gpu(view)) => {
+                        if img.access == ImageAccess::WriteOnce { // shadow it with a CPU texture
+                            let id = img.id;
+                            let image = TransientImage::CpuOwned {
+                                data: RefCell::new(CpuTexture::new(img)),
+                                view, // view taken from the Gpu entry
+                            };
+                            self.images.insert(id, image); // stored under the same id
+                        } else {
+                            panic!("image was already materialized on GPU")
+                        }
+                    }
+                    _ => bind_map.materialize_cpu_image(img), // otherwise defer to bind_map
+                },
             };
         }
         // Second pass takes immutable references
@@ -854,7 +1018,11 @@ impl<'a> TransientBindMap<'a> {
                     Some(TransientBuf::Cpu(b)) => CpuBinding::Buffer(b),
                     _ => bind_map.get_cpu_buf(buf.id),
                 },
-                ResourceProxy::Image(_) => todo!(),
+                ResourceProxy::Image(img) => match self.images.get(&img.id) {
+                    Some(TransientImage::Cpu(b)) => CpuBinding::Texture(b), // shared byte slice
+                    Some(TransientImage::CpuOwned { data, .. }) => CpuBinding::TextureRW(data),
+                    _ => bind_map.get_cpu_image(img.id),
+                },
             })
             .collect()
     }