diff --git a/examples/cfd/lid_driven_cavity_2d.py b/examples/cfd/lid_driven_cavity_2d.py
index dfb1092..d790c2c 100644
--- a/examples/cfd/lid_driven_cavity_2d.py
+++ b/examples/cfd/lid_driven_cavity_2d.py
@@ -80,7 +80,8 @@ def run(self, num_steps, post_process_interval=100):
     def post_process(self, i):
         # Write the results. We'll use JAX backend for the post-processing
         if not isinstance(self.f_0, jnp.ndarray):
-            f_0 = wp.to_jax(self.f_0)
+            # If the backend is warp, we need to drop the last dimension added by warp for 2D simulations
+            f_0 = wp.to_jax(self.f_0)[..., 0]
         else:
             f_0 = self.f_0
 
diff --git a/examples/cfd/windtunnel_3d.py b/examples/cfd/windtunnel_3d.py
index c83e2c9..96c79f7 100644
--- a/examples/cfd/windtunnel_3d.py
+++ b/examples/cfd/windtunnel_3d.py
@@ -74,8 +74,8 @@ def define_boundary_indices(self):
         walls = [box["bottom"][i] + box["top"][i] + box["front"][i] + box["back"][i] for i in range(self.velocity_set.d)]
         walls = np.unique(np.array(walls), axis=-1).tolist()
 
-        # Load the mesh
-        stl_filename = "examples/cfd/stl-files/DrivAer-Notchback.stl"
+        # Load the mesh (replace with your own mesh)
+        stl_filename = "../stl-files/DrivAer-Notchback.stl"
         mesh = trimesh.load_mesh(stl_filename, process=False)
         mesh_vertices = mesh.vertices
 
diff --git a/requirements.txt b/requirements.txt
index ee107af..4d0cd2c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,15 +1,10 @@
-jax==0.4.20
-jaxlib==0.4.20
-matplotlib==3.8.0
-numpy==1.26.1
-pyvista==0.43.4
-Rtree==1.0.1
-trimesh==4.4.1
-orbax-checkpoint==0.4.1
-termcolor==2.3.0
-PhantomGaze @ git+https://github.com/loliverhennigh/PhantomGaze.git
-tqdm==4.66.2
-warp-lang==1.0.2
-numpy-stl==3.1.1
-pydantic==2.7.0
-ruff==0.5.6
\ No newline at end of file
+jax[cuda]
+matplotlib
+numpy
+pyvista
+Rtree
+trimesh
+warp-lang
+numpy-stl
+pydantic
+ruff
\ No newline at end of file
diff --git a/tests/boundary_conditions/bc_equilibrium/test_bc_equilibrium_warp.py b/tests/boundary_conditions/bc_equilibrium/test_bc_equilibrium_warp.py
index 5eb0c10..6bd9311 100644
--- a/tests/boundary_conditions/bc_equilibrium/test_bc_equilibrium_warp.py
+++ b/tests/boundary_conditions/bc_equilibrium/test_bc_equilibrium_warp.py
@@ -72,7 +72,7 @@ def test_bc_equilibrium_warp(dim, velocity_set, grid_shape):
     f = f.numpy()
     f_post = f_post.numpy()
 
-    assert f.shape == (velocity_set.q,) + grid_shape
+    assert f.shape == (velocity_set.q,) + grid_shape if dim == 3 else (velocity_set.q, grid_shape[0], grid_shape[1], 1)
 
     # Assert that the values are correct in the indices of the sphere
     weights = velocity_set.w
diff --git a/tests/boundary_conditions/bc_fullway_bounce_back/test_bc_fullway_bounce_back_warp.py b/tests/boundary_conditions/bc_fullway_bounce_back/test_bc_fullway_bounce_back_warp.py
index 10b9244..59c6c9d 100644
--- a/tests/boundary_conditions/bc_fullway_bounce_back/test_bc_fullway_bounce_back_warp.py
+++ b/tests/boundary_conditions/bc_fullway_bounce_back/test_bc_fullway_bounce_back_warp.py
@@ -58,7 +58,10 @@ def test_fullway_bounce_back_warp(dim, velocity_set, grid_shape):
     bc_mask, missing_mask = indices_boundary_masker([fullway_bc], bc_mask, missing_mask, start_index=None)
 
     # Generate a random field with the same shape
-    random_field = np.random.rand(velocity_set.q, *grid_shape).astype(np.float32)
+    if dim == 2:
+        random_field = np.random.rand(velocity_set.q, grid_shape[0], grid_shape[1], 1).astype(np.float32)
+    else:
+        random_field = np.random.rand(velocity_set.q, grid_shape[0], grid_shape[1], grid_shape[2]).astype(np.float32)
     # Add the random field to f_pre
     f_pre = wp.array(random_field)
 
@@ -71,7 +74,7 @@ def test_fullway_bounce_back_warp(dim, velocity_set, grid_shape):
     f = f_pre.numpy()
     f_post = f_post.numpy()
 
-    assert f.shape == (velocity_set.q,) + grid_shape
+    assert f.shape == (velocity_set.q,) + grid_shape if dim == 3 else (velocity_set.q, grid_shape[0], grid_shape[1], 1)
 
     for i in range(velocity_set.q):
         np.allclose(
diff --git a/tests/grids/test_grid_warp.py b/tests/grids/test_grid_warp.py
index 782434d..61b27d4 100644
--- a/tests/grids/test_grid_warp.py
+++ b/tests/grids/test_grid_warp.py
@@ -22,8 +22,10 @@ def test_warp_grid_create_field(grid_size):
         init_xlb_env(xlb.velocity_set.D3Q19)
         my_grid = grid_factory(grid_shape)
         f = my_grid.create_field(cardinality=9, dtype=Precision.FP32)
-
-        assert f.shape == (9,) + grid_shape, "Field shape is incorrect"
+        if len(grid_shape) == 2:
+            assert f.shape == (9,) + grid_shape + (1,), "Field shape is incorrect got {}".format(f.shape)
+        else:
+            assert f.shape == (9,) + grid_shape, "Field shape is incorrect got {}".format(f.shape)
         assert isinstance(f, wp.array), "Field should be a Warp ndarray"
 
 
@@ -37,7 +39,6 @@ def test_warp_grid_create_field_fill_value():
     assert isinstance(f, wp.array), "Field should be a Warp ndarray"
 
     f = f.numpy()
-    assert f.shape == (9,) + grid_shape, "Field shape is incorrect"
     assert np.allclose(f, fill_value), "Field not properly initialized with fill_value"
 
 
diff --git a/tests/kernels/stream/test_stream_warp.py b/tests/kernels/stream/test_stream_warp.py
index 0d100cf..95fcc05 100644
--- a/tests/kernels/stream/test_stream_warp.py
+++ b/tests/kernels/stream/test_stream_warp.py
@@ -61,7 +61,7 @@ def test_stream_operator_warp(dim, velocity_set, grid_shape):
     expected = jnp.stack(expected, axis=0)
 
     if dim == 2:
-        f_initial_warp = wp.array(f_initial)
+        f_initial_warp = wp.array(f_initial[..., np.newaxis])
 
     elif dim == 3:
         f_initial_warp = wp.array(f_initial)
@@ -71,7 +71,10 @@ def test_stream_operator_warp(dim, velocity_set, grid_shape):
     f_streamed = my_grid_warp.create_field(cardinality=velocity_set.q)
     f_streamed = stream_op(f_initial_warp, f_streamed)
 
-    assert jnp.allclose(f_streamed.numpy(), np.array(expected)), "Streaming did not occur as expected"
+    if len(grid_shape) == 2:
+        assert jnp.allclose(f_streamed.numpy()[..., 0], np.array(expected)), "Streaming did not occur as expected"
+    else:
+        assert jnp.allclose(f_streamed.numpy(), np.array(expected)), "Streaming did not occur as expected"
 
 
 if __name__ == "__main__":
diff --git a/xlb/grid/warp_grid.py b/xlb/grid/warp_grid.py
index 5018962..c74fc2f 100644
--- a/xlb/grid/warp_grid.py
+++ b/xlb/grid/warp_grid.py
@@ -21,7 +21,9 @@ def create_field(
         fill_value=None,
     ):
         dtype = dtype.wp_dtype if dtype else DefaultConfig.default_precision_policy.store_precision.wp_dtype
-        shape = (cardinality,) + (self.shape)
+
+        # Check if shape is 2D, and if so, append a singleton dimension to the shape
+        shape = (cardinality,) + (self.shape if len(self.shape) != 2 else self.shape + (1,))
 
         if fill_value is None:
             f = wp.zeros(shape, dtype=dtype)
diff --git a/xlb/operator/boundary_condition/bc_do_nothing.py b/xlb/operator/boundary_condition/bc_do_nothing.py
index 4b7ac90..56a332f 100644
--- a/xlb/operator/boundary_condition/bc_do_nothing.py
+++ b/xlb/operator/boundary_condition/bc_do_nothing.py
@@ -64,58 +64,7 @@ def functional(
         ):
             return f_pre
 
-        @wp.kernel
-        def kernel2d(
-            f_pre: wp.array3d(dtype=Any),
-            f_post: wp.array3d(dtype=Any),
-            bc_mask: wp.array3d(dtype=wp.uint8),
-            missing_mask: wp.array3d(dtype=wp.uint8),
-        ):
-            # Get the global index
-            i, j = wp.tid()
-            index = wp.vec2i(i, j)
-
-            # read tid data
-            _f_pre, _f_post, _boundary_id, _missing_mask = self._get_thread_data_2d(f_pre, f_post, bc_mask, missing_mask, index)
-
-            # Apply the boundary condition
-            if _boundary_id == wp.uint8(DoNothingBC.id):
-                timestep = 0
-                _f = functional(index, timestep, _missing_mask, f_pre, f_post, _f_pre, _f_post)
-            else:
-                _f = _f_post
-
-            # Write the result
-            for l in range(self.velocity_set.q):
-                f_post[l, index[0], index[1]] = self.store_dtype(_f[l])
-
-        # Construct the warp kernel
-        @wp.kernel
-        def kernel3d(
-            f_pre: wp.array4d(dtype=Any),
-            f_post: wp.array4d(dtype=Any),
-            bc_mask: wp.array4d(dtype=wp.uint8),
-            missing_mask: wp.array4d(dtype=wp.bool),
-        ):
-            # Get the global index
-            i, j, k = wp.tid()
-            index = wp.vec3i(i, j, k)
-
-            # read tid data
-            _f_pre, _f_post, _boundary_id, _missing_mask = self._get_thread_data_3d(f_pre, f_post, bc_mask, missing_mask, index)
-
-            # Apply the boundary condition
-            if _boundary_id == wp.uint8(DoNothingBC.id):
-                timestep = 0
-                _f = functional(index, timestep, _missing_mask, f_pre, f_post, _f_pre, _f_post)
-            else:
-                _f = _f_post
-
-            # Write the result
-            for l in range(self.velocity_set.q):
-                f_post[l, index[0], index[1], index[2]] = self.store_dtype(_f[l])
-
-        kernel = kernel3d if self.velocity_set.d == 3 else kernel2d
+        kernel = self._construct_kernel(functional)
 
         return functional, kernel
 
@@ -127,4 +76,4 @@ def warp_implementation(self, f_pre, f_post, bc_mask, missing_mask):
             inputs=[f_pre, f_post, bc_mask, missing_mask],
             dim=f_pre.shape[1:],
         )
-        return f_post
+        return f_post
\ No newline at end of file
diff --git a/xlb/operator/boundary_condition/bc_equilibrium.py b/xlb/operator/boundary_condition/bc_equilibrium.py
index 4dd4b9e..77f408f 100644
--- a/xlb/operator/boundary_condition/bc_equilibrium.py
+++ b/xlb/operator/boundary_condition/bc_equilibrium.py
@@ -88,59 +88,8 @@ def functional(
             _f = self.equilibrium_operator.warp_functional(_rho, _u)
             return _f
 
-        # Construct the warp kernel
-        @wp.kernel
-        def kernel2d(
-            f_pre: wp.array3d(dtype=Any),
-            f_post: wp.array3d(dtype=Any),
-            bc_mask: wp.array3d(dtype=wp.uint8),
-            missing_mask: wp.array3d(dtype=wp.bool),
-        ):
-            # Get the global index
-            i, j = wp.tid()
-            index = wp.vec2i(i, j)
-
-            # read tid data
-            _f_pre, _f_post, _boundary_id, _missing_mask = self._get_thread_data_2d(f_pre, f_post, bc_mask, missing_mask, index)
-
-            # Apply the boundary condition
-            if _boundary_id == wp.uint8(EquilibriumBC.id):
-                timestep = 0
-                _f = functional(index, timestep, _missing_mask, f_pre, f_post, _f_pre, _f_post)
-            else:
-                _f = _f_post
-
-            # Write the result
-            for l in range(self.velocity_set.q):
-                f_post[l, index[0], index[1]] = self.store_dtype(_f[l])
-
-        # Construct the warp kernel
-        @wp.kernel
-        def kernel3d(
-            f_pre: wp.array4d(dtype=Any),
-            f_post: wp.array4d(dtype=Any),
-            bc_mask: wp.array4d(dtype=wp.uint8),
-            missing_mask: wp.array4d(dtype=wp.bool),
-        ):
-            # Get the global index
-            i, j, k = wp.tid()
-            index = wp.vec3i(i, j, k)
-
-            # read tid data
-            _f_pre, _f_post, _boundary_id, _missing_mask = self._get_thread_data_3d(f_pre, f_post, bc_mask, missing_mask, index)
-
-            # Apply the boundary condition
-            if _boundary_id == wp.uint8(EquilibriumBC.id):
-                timestep = 0
-                _f = functional(index, timestep, _missing_mask, f_pre, f_post, _f_pre, _f_post)
-            else:
-                _f = _f_post
-
-            # Write the result
-            for l in range(self.velocity_set.q):
-                f_post[l, index[0], index[1], index[2]] = self.store_dtype(_f[l])
-
-        kernel = kernel3d if self.velocity_set.d == 3 else kernel2d
+        # Use the parent class's kernel and pass the functional
+        kernel = self._construct_kernel(functional)
 
         return functional, kernel
 
@@ -152,4 +101,4 @@ def warp_implementation(self, f_pre, f_post, bc_mask, missing_mask):
             inputs=[f_pre, f_post, bc_mask, missing_mask],
             dim=f_pre.shape[1:],
         )
-        return f_post
+        return f_post
\ No newline at end of file
diff --git a/xlb/operator/boundary_condition/bc_extrapolation_outflow.py b/xlb/operator/boundary_condition/bc_extrapolation_outflow.py
index 53645c6..38657e5 100644
--- a/xlb/operator/boundary_condition/bc_extrapolation_outflow.py
+++ b/xlb/operator/boundary_condition/bc_extrapolation_outflow.py
@@ -140,15 +140,7 @@ def _construct_warp(self):
         _opp_indices = self.velocity_set.opp_indices
 
         @wp.func
-        def get_normal_vectors_2d(
-            missing_mask: Any,
-        ):
-            for l in range(_q):
-                if missing_mask[l] == wp.uint8(1) and wp.abs(_c[0, l]) + wp.abs(_c[1, l]) == 1:
-                    return -wp.vec2i(_c[0, l], _c[1, l])
-
-        @wp.func
-        def get_normal_vectors_3d(
+        def get_normal_vectors(
             missing_mask: Any,
         ):
             for l in range(_q):
@@ -175,34 +167,7 @@ def functional(
             return _f
 
         @wp.func
-        def prepare_bc_auxilary_data_2d(
-            index: Any,
-            timestep: Any,
-            missing_mask: Any,
-            f_0: Any,
-            f_1: Any,
-            f_pre: Any,
-            f_post: Any,
-        ):
-            # Preparing the formulation for this BC using the neighbour's populations stored in f_aux and
-            # f_pre (post-streaming values of the current voxel). We use directions that leave the domain
-            # for storing this prepared data.
-            _f = f_post
-            nv = get_normal_vectors_2d(missing_mask)
-            for l in range(self.velocity_set.q):
-                if missing_mask[l] == wp.uint8(1):
-                    # f_0 is the post-collision values of the current time-step
-                    # Get pull index associated with the "neighbours" pull_index
-                    pull_index = type(index)()
-                    for d in range(self.velocity_set.d):
-                        pull_index[d] = index[d] - (_c[d, l] + nv[d])
-                    # The following is the post-streaming values of the neighbor cell
-                    f_aux = self.compute_dtype(f_0[l, pull_index[0], pull_index[1]])
-                    _f[_opp_indices[l]] = (self.compute_dtype(1.0) - sound_speed) * f_pre[l] + sound_speed * f_aux
-            return _f
-
-        @wp.func
-        def prepare_bc_auxilary_data_3d(
+        def prepare_bc_auxilary_data(
             index: Any,
             timestep: Any,
             missing_mask: Any,
@@ -215,7 +180,7 @@ def prepare_bc_auxilary_data_3d(
             # f_pre (post-streaming values of the current voxel). We use directions that leave the domain
             # for storing this prepared data.
             _f = f_post
-            nv = get_normal_vectors_3d(missing_mask)
+            nv = get_normal_vectors(missing_mask)
             for l in range(self.velocity_set.q):
                 if missing_mask[l] == wp.uint8(1):
                     # f_0 is the post-collision values of the current time-step
@@ -228,73 +193,7 @@ def prepare_bc_auxilary_data_3d(
                     _f[_opp_indices[l]] = (self.compute_dtype(1.0) - sound_speed) * f_pre[l] + sound_speed * f_aux
             return _f
 
-        # Construct the warp kernel
-        @wp.kernel
-        def kernel2d(
-            f_pre: wp.array3d(dtype=Any),
-            f_post: wp.array3d(dtype=Any),
-            bc_mask: wp.array3d(dtype=wp.uint8),
-            missing_mask: wp.array3d(dtype=wp.bool),
-        ):
-            # Get the global index
-            i, j = wp.tid()
-            index = wp.vec2i(i, j)
-            timestep = 0
-
-            # read tid data
-            _f_pre, _f_post, _boundary_id, _missing_mask = self._get_thread_data_2d(f_pre, f_post, bc_mask, missing_mask, index)
-
-            # special preparation of auxiliary data
-            if _boundary_id == wp.uint8(ExtrapolationOutflowBC.id):
-                _f_pre = prepare_bc_auxilary_data_2d(index, timestep, missing_mask, f_pre, f_post, f_pre, f_post)
-
-            # Apply the boundary condition
-            if _boundary_id == wp.uint8(ExtrapolationOutflowBC.id):
-                # TODO: is there any way for this BC to have a meaningful kernel given that it has two steps after both
-                # collision and streaming?
-                _f = functional(index, timestep, _missing_mask, f_pre, f_post, _f_pre, _f_post)
-            else:
-                _f = _f_post
-
-            # Write the distribution function
-            for l in range(self.velocity_set.q):
-                f_post[l, index[0], index[1]] = self.store_dtype(_f[l])
-
-        # Construct the warp kernel
-        @wp.kernel
-        def kernel3d(
-            f_pre: wp.array4d(dtype=Any),
-            f_post: wp.array4d(dtype=Any),
-            bc_mask: wp.array4d(dtype=wp.uint8),
-            missing_mask: wp.array4d(dtype=wp.bool),
-        ):
-            # Get the global index
-            i, j, k = wp.tid()
-            index = wp.vec3i(i, j, k)
-            timestep = 0
-
-            # read tid data
-            _f_pre, _f_post, _boundary_id, _missing_mask = self._get_thread_data_3d(f_pre, f_post, bc_mask, missing_mask, index)
-            _f_aux = _f_vec()
-
-            # special preparation of auxiliary data
-            if _boundary_id == wp.uint8(ExtrapolationOutflowBC.id):
-                _f_pre = prepare_bc_auxilary_data_3d(index, timestep, missing_mask, f_pre, f_post, f_pre, f_post)
-
-            # Apply the boundary condition
-            if _boundary_id == wp.uint8(ExtrapolationOutflowBC.id):
-                # TODO: is there any way for this BC to have a meaninful kernel given that it has two steps after both
-                # collision and streaming?
-                _f = functional(index, timestep, _missing_mask, f_pre, f_post, _f_pre, _f_post)
-            else:
-                _f = _f_post
-
-            # Write the distribution function
-            for l in range(self.velocity_set.q):
-                f_post[l, index[0], index[1], index[2]] = self.store_dtype(_f[l])
-
-        kernel = kernel3d if self.velocity_set.d == 3 else kernel2d
-        prepare_bc_auxilary_data = prepare_bc_auxilary_data_3d if self.velocity_set.d == 3 else prepare_bc_auxilary_data_2d
+        kernel = self._construct_kernel(functional)
 
         return (functional, prepare_bc_auxilary_data), kernel
 
@@ -306,4 +205,4 @@ def warp_implementation(self, f_pre, f_post, bc_mask, missing_mask):
             inputs=[f_pre, f_post, bc_mask, missing_mask],
             dim=f_pre.shape[1:],
         )
-        return f_post
+        return f_post
\ No newline at end of file
diff --git a/xlb/operator/boundary_condition/bc_fullway_bounce_back.py b/xlb/operator/boundary_condition/bc_fullway_bounce_back.py
index 8569e84..19a3013 100644
--- a/xlb/operator/boundary_condition/bc_fullway_bounce_back.py
+++ b/xlb/operator/boundary_condition/bc_fullway_bounce_back.py
@@ -74,57 +74,7 @@ def functional(
                 fliped_f[l] = f_pre[_opp_indices[l]]
             return fliped_f
 
-        @wp.kernel
-        def kernel2d(
-            f_pre: wp.array3d(dtype=Any),
-            f_post: wp.array3d(dtype=Any),
-            bc_mask: wp.array3d(dtype=wp.uint8),
-            missing_mask: wp.array3d(dtype=wp.bool),
-        ):  # Get the global index
-            i, j = wp.tid()
-            index = wp.vec2i(i, j)
-
-            # read tid data
-            _f_pre, _f_post, _boundary_id, _missing_mask = self._get_thread_data_2d(f_pre, f_post, bc_mask, missing_mask, index)
-
-            # Check if the boundary is active
-            if _boundary_id == wp.uint8(FullwayBounceBackBC.id):
-                timestep = 0
-                _f = functional(index, timestep, _missing_mask, f_pre, f_post, _f_pre, _f_post)
-            else:
-                _f = _f_post
-
-            # Write the result to the output
-            for l in range(self.velocity_set.q):
-                f_post[l, index[0], index[1]] = self.store_dtype(_f[l])
-
-        # Construct the warp kernel
-        @wp.kernel
-        def kernel3d(
-            f_pre: wp.array4d(dtype=Any),
-            f_post: wp.array4d(dtype=Any),
-            bc_mask: wp.array4d(dtype=wp.uint8),
-            missing_mask: wp.array4d(dtype=wp.bool),
-        ):
-            # Get the global index
-            i, j, k = wp.tid()
-            index = wp.vec3i(i, j, k)
-
-            # read tid data
-            _f_pre, _f_post, _boundary_id, _missing_mask = self._get_thread_data_3d(f_pre, f_post, bc_mask, missing_mask, index)
-
-            # Check if the boundary is active
-            if _boundary_id == wp.uint8(FullwayBounceBackBC.id):
-                timestep = 0
-                _f = functional(index, timestep, _missing_mask, f_pre, f_post, _f_pre, _f_post)
-            else:
-                _f = _f_post
-
-            # Write the result to the output
-            for l in range(self.velocity_set.q):
-                f_post[l, index[0], index[1], index[2]] = self.store_dtype(_f[l])
-
-        kernel = kernel3d if self.velocity_set.d == 3 else kernel2d
+        kernel = self._construct_kernel(functional)
 
         return functional, kernel
 
@@ -136,4 +86,4 @@ def warp_implementation(self, f_pre, f_post, bc_mask, missing_mask):
             inputs=[f_pre, f_post, bc_mask, missing_mask],
             dim=f_pre.shape[1:],
         )
-        return f_post
+        return f_post
\ No newline at end of file
diff --git a/xlb/operator/boundary_condition/bc_grads_approximation.py b/xlb/operator/boundary_condition/bc_grads_approximation.py
index bc29851..94ddba3 100644
--- a/xlb/operator/boundary_condition/bc_grads_approximation.py
+++ b/xlb/operator/boundary_condition/bc_grads_approximation.py
@@ -307,37 +307,10 @@ def functional_method2(
             f_post = grads_approximate_fpop(missing_mask, rho_target, u_target, f_post)
             return f_post
 
-        # Construct the warp kernel
-        @wp.kernel
-        def kernel(
-            f_pre: wp.array4d(dtype=Any),
-            f_post: wp.array4d(dtype=Any),
-            bc_mask: wp.array4d(dtype=wp.uint8),
-            missing_mask: wp.array4d(dtype=wp.bool),
-        ):
-            # Get the global index
-            i, j, k = wp.tid()
-            index = wp.vec3i(i, j, k)
-            timestep = 0
-
-            # read tid data
-            _f_pre, _f_post, _boundary_id, _missing_mask = self._get_thread_data_3d(f_pre, f_post, bc_mask, missing_mask, index)
-            _f_aux = _f_vec()
-
-            # Apply the boundary condition
-            if _boundary_id == wp.uint8(GradsApproximationBC.id):
-                # TODO: is there any way for this BC to have a meaninful kernel given that it has two steps after both
-                # collision and streaming?
-                _f = functional(index, timestep, _missing_mask, f_pre, f_post, _f_pre, _f_post)
-            else:
-                _f = _f_post
-
-            # Write the distribution function
-            for l in range(self.velocity_set.q):
-                f_post[l, index[0], index[1], index[2]] = self.store_dtype(_f[l])
-
         functional = functional_method1
 
+        kernel = self._construct_kernel(functional)
+
         return functional, kernel
 
     @Operator.register_backend(ComputeBackend.WARP)
@@ -348,4 +321,4 @@ def warp_implementation(self, f_pre, f_post, bc_mask, missing_mask):
             inputs=[f_pre, f_post, bc_mask, missing_mask],
             dim=f_pre.shape[1:],
         )
-        return f_post
+        return f_post
\ No newline at end of file
diff --git a/xlb/operator/boundary_condition/bc_halfway_bounce_back.py b/xlb/operator/boundary_condition/bc_halfway_bounce_back.py
index e8df6b7..bf04af0 100644
--- a/xlb/operator/boundary_condition/bc_halfway_bounce_back.py
+++ b/xlb/operator/boundary_condition/bc_halfway_bounce_back.py
@@ -87,59 +87,7 @@ def functional(
 
             return _f
 
-        # Construct the warp kernel
-        @wp.kernel
-        def kernel2d(
-            f_pre: wp.array3d(dtype=Any),
-            f_post: wp.array3d(dtype=Any),
-            bc_mask: wp.array3d(dtype=wp.uint8),
-            missing_mask: wp.array3d(dtype=wp.bool),
-        ):
-            # Get the global index
-            i, j = wp.tid()
-            index = wp.vec2i(i, j)
-
-            # read tid data
-            _f_pre, _f_post, _boundary_id, _missing_mask = self._get_thread_data_2d(f_pre, f_post, bc_mask, missing_mask, index)
-
-            # Apply the boundary condition
-            if _boundary_id == wp.uint8(HalfwayBounceBackBC.id):
-                timestep = 0
-                _f = functional(index, timestep, _missing_mask, f_pre, f_post, _f_pre, _f_post)
-            else:
-                _f = _f_post
-
-            # Write the distribution function
-            for l in range(self.velocity_set.q):
-                f_post[l, index[0], index[1]] = self.store_dtype(_f[l])
-
-        # Construct the warp kernel
-        @wp.kernel
-        def kernel3d(
-            f_pre: wp.array4d(dtype=Any),
-            f_post: wp.array4d(dtype=Any),
-            bc_mask: wp.array4d(dtype=wp.uint8),
-            missing_mask: wp.array4d(dtype=wp.bool),
-        ):
-            # Get the global index
-            i, j, k = wp.tid()
-            index = wp.vec3i(i, j, k)
-
-            # read tid data
-            _f_pre, _f_post, _boundary_id, _missing_mask = self._get_thread_data_3d(f_pre, f_post, bc_mask, missing_mask, index)
-
-            # Apply the boundary condition
-            if _boundary_id == wp.uint8(HalfwayBounceBackBC.id):
-                timestep = 0
-                _f = functional(index, timestep, _missing_mask, f_pre, f_post, _f_pre, _f_post)
-            else:
-                _f = _f_post
-
-            # Write the distribution function
-            for l in range(self.velocity_set.q):
-                f_post[l, index[0], index[1], index[2]] = self.store_dtype(_f[l])
-
-        kernel = kernel3d if self.velocity_set.d == 3 else kernel2d
+        kernel = self._construct_kernel(functional)
 
         return functional, kernel
 
diff --git a/xlb/operator/boundary_condition/bc_regularized.py b/xlb/operator/boundary_condition/bc_regularized.py
index 065a0b0..af4c783 100644
--- a/xlb/operator/boundary_condition/bc_regularized.py
+++ b/xlb/operator/boundary_condition/bc_regularized.py
@@ -159,15 +159,7 @@ def _get_fsum(
             return fsum_known + fsum_middle
 
         @wp.func
-        def get_normal_vectors_2d(
-            missing_mask: Any,
-        ):
-            for l in range(_q):
-                if missing_mask[l] == wp.uint8(1) and wp.abs(_c[0, l]) + wp.abs(_c[1, l]) == 1:
-                    return -_u_vec(_c_float[0, l], _c_float[1, l])
-
-        @wp.func
-        def get_normal_vectors_3d(
+        def get_normal_vectors(
             missing_mask: Any,
         ):
             for l in range(_q):
@@ -211,7 +203,7 @@ def regularize_fpop(
             return fpop
 
         @wp.func
-        def functional3d_velocity(
+        def functional_velocity(
             index: Any,
             timestep: Any,
             missing_mask: Any,
@@ -224,7 +216,7 @@ def functional3d_velocity(
             _f = f_post
 
             # Find normal vector
-            normals = get_normal_vectors_3d(missing_mask)
+            normals = get_normal_vectors(missing_mask)
 
             # calculate rho
             fsum = _get_fsum(_f, missing_mask)
@@ -242,7 +234,7 @@ def functional3d_velocity(
             return _f
 
         @wp.func
-        def functional3d_pressure(
+        def functional_pressure(
             index: Any,
             timestep: Any,
             missing_mask: Any,
@@ -255,7 +247,7 @@ def functional3d_pressure(
             _f = f_post
 
             # Find normal vector
-            normals = get_normal_vectors_3d(missing_mask)
+            normals = get_normal_vectors(missing_mask)
 
             # calculate velocity
             fsum = _get_fsum(_f, missing_mask)
@@ -270,127 +262,11 @@ def functional3d_pressure(
             _f = regularize_fpop(_f, feq)
             return _f
 
-        @wp.func
-        def functional2d_velocity(
-            index: Any,
-            timestep: Any,
-            missing_mask: Any,
-            f_0: Any,
-            f_1: Any,
-            f_pre: Any,
-            f_post: Any,
-        ):
-            # Post-streaming values are only modified at missing direction
-            _f = f_post
-
-            # Find normal vector
-            normals = get_normal_vectors_2d(missing_mask)
-
-            # calculate rho
-            fsum = _get_fsum(_f, missing_mask)
-            unormal = self.compute_dtype(0.0)
-            for d in range(_d):
-                unormal += _u[d] * normals[d]
-            _rho = fsum / (self.compute_dtype(1.0) + unormal)
-
-            # impose non-equilibrium bounceback
-            feq = self.equilibrium_operator.warp_functional(_rho, _u)
-            _f = bounceback_nonequilibrium(_f, feq, missing_mask)
-
-            # Regularize the boundary fpop
-            _f = regularize_fpop(_f, feq)
-            return _f
-
-        @wp.func
-        def functional2d_pressure(
-            index: Any,
-            timestep: Any,
-            missing_mask: Any,
-            f_0: Any,
-            f_1: Any,
-            f_pre: Any,
-            f_post: Any,
-        ):
-            # Post-streaming values are only modified at missing direction
-            _f = f_post
-
-            # Find normal vector
-            normals = get_normal_vectors_2d(missing_mask)
-
-            # calculate velocity
-            fsum = _get_fsum(_f, missing_mask)
-            unormal = -self.compute_dtype(1.0) + fsum / _rho
-            _u = unormal * normals
-
-            # impose non-equilibrium bounceback
-            feq = self.equilibrium_operator.warp_functional(_rho, _u)
-            _f = bounceback_nonequilibrium(_f, feq, missing_mask)
-
-            # Regularize the boundary fpop
-            _f = regularize_fpop(_f, feq)
-            return _f
-
-        # Construct the warp kernel
-        @wp.kernel
-        def kernel2d(
-            f_pre: wp.array3d(dtype=Any),
-            f_post: wp.array3d(dtype=Any),
-            bc_mask: wp.array3d(dtype=wp.uint8),
-            missing_mask: wp.array3d(dtype=wp.bool),
-        ):
-            # Get the global index
-            i, j = wp.tid()
-            index = wp.vec2i(i, j)
-
-            # read tid data
-            _f_pre, _f_post, _boundary_id, _missing_mask = self._get_thread_data_2d(f_pre, f_post, bc_mask, missing_mask, index)
-
-            # Apply the boundary condition
-            if _boundary_id == wp.uint8(self.id):
-                timestep = 0
-                _f = functional(index, timestep, _missing_mask, f_pre, f_post, _f_pre, _f_post)
-            else:
-                _f = _f_post
-
-            # Write the distribution function
-            for l in range(self.velocity_set.q):
-                f_post[l, index[0], index[1]] = self.store_dtype(_f[l])
-
-        # Construct the warp kernel
-        @wp.kernel
-        def kernel3d(
-            f_pre: wp.array4d(dtype=Any),
-            f_post: wp.array4d(dtype=Any),
-            bc_mask: wp.array4d(dtype=wp.uint8),
-            missing_mask: wp.array4d(dtype=wp.bool),
-        ):
-            # Get the global index
-            i, j, k = wp.tid()
-            index = wp.vec3i(i, j, k)
-
-            # read tid data
-            _f_pre, _f_post, _boundary_id, _missing_mask = self._get_thread_data_3d(f_pre, f_post, bc_mask, missing_mask, index)
-
-            # Apply the boundary condition
-            if _boundary_id == wp.uint8(self.id):
-                timestep = 0
-                _f = functional(index, timestep, _missing_mask, f_pre, f_post, _f_pre, _f_post)
-            else:
-                _f = _f_post
-
-            # Write the distribution function
-            for l in range(self.velocity_set.q):
-                f_post[l, index[0], index[1], index[2]] = self.store_dtype(_f[l])
-
-        kernel = kernel3d if self.velocity_set.d == 3 else kernel2d
-        if self.velocity_set.d == 3 and self.bc_type == "velocity":
-            functional = functional3d_velocity
-        elif self.velocity_set.d == 3 and self.bc_type == "pressure":
-            functional = functional3d_pressure
-        elif self.bc_type == "velocity":
-            functional = functional2d_velocity
-        else:
-            functional = functional2d_pressure
+        if self.bc_type == "velocity":
+            functional = functional_velocity
+        elif self.bc_type == "pressure":
+            functional = functional_pressure
+        kernel = self._construct_kernel(functional)
 
         return functional, kernel
 
diff --git a/xlb/operator/boundary_condition/bc_zouhe.py b/xlb/operator/boundary_condition/bc_zouhe.py
index 4be2cf2..a92d909 100644
--- a/xlb/operator/boundary_condition/bc_zouhe.py
+++ b/xlb/operator/boundary_condition/bc_zouhe.py
@@ -189,15 +189,6 @@ def _construct_warp(self):
         _c_float = self.velocity_set.c_float
         # TODO: this is way less than ideal. we should not be making new types
 
-        @wp.func
-        def get_normal_vectors_2d(
-            lattice_direction: Any,
-        ):
-            l = lattice_direction
-            if wp.abs(_c[0, l]) + wp.abs(_c[1, l]) == 1:
-                normals = -_u_vec(_c_float[0, l], _c_float[1, l])
-            return normals
-
         @wp.func
         def _get_fsum(
             fpop: Any,
@@ -213,7 +204,7 @@ def _get_fsum(
             return fsum_known + fsum_middle
 
         @wp.func
-        def get_normal_vectors_3d(
+        def get_normal_vectors(
             missing_mask: Any,
         ):
             for l in range(_q):
@@ -232,7 +223,7 @@ def bounceback_nonequilibrium(
             return fpop
 
         @wp.func
-        def functional3d_velocity(
+        def functional_velocity(
             index: Any,
             timestep: Any,
             missing_mask: Any,
@@ -245,7 +236,7 @@ def functional3d_velocity(
             _f = f_post
 
             # Find normal vector
-            normals = get_normal_vectors_3d(missing_mask)
+            normals = get_normal_vectors(missing_mask)
 
             # calculate rho
             fsum = _get_fsum(_f, missing_mask)
@@ -260,7 +251,7 @@ def functional3d_velocity(
             return _f
 
         @wp.func
-        def functional3d_pressure(
+        def functional_pressure(
             index: Any,
             timestep: Any,
             missing_mask: Any,
@@ -273,7 +264,7 @@ def functional3d_pressure(
             _f = f_post
 
             # Find normal vector
-            normals = get_normal_vectors_3d(missing_mask)
+            normals = get_normal_vectors(missing_mask)
 
             # calculate velocity
             fsum = _get_fsum(_f, missing_mask)
@@ -285,121 +276,14 @@ def functional3d_pressure(
             _f = bounceback_nonequilibrium(_f, feq, missing_mask)
             return _f
 
-        @wp.func
-        def functional2d_velocity(
-            index: Any,
-            timestep: Any,
-            missing_mask: Any,
-            f_0: Any,
-            f_1: Any,
-            f_pre: Any,
-            f_post: Any,
-        ):
-            # Post-streaming values are only modified at missing direction
-            _f = f_post
-
-            # Find normal vector
-            normals = get_normal_vectors_2d(missing_mask)
-
-            # calculate rho
-            fsum = _get_fsum(_f, missing_mask)
-            unormal = self.compute_dtype(0.0)
-            for d in range(_d):
-                unormal += _u[d] * normals[d]
-            _rho = fsum / (self.compute_dtype(1.0) + unormal)
-
-            # impose non-equilibrium bounceback
-            feq = self.equilibrium_operator.warp_functional(_rho, _u)
-            _f = bounceback_nonequilibrium(_f, feq, missing_mask)
-            return _f
-
-        @wp.func
-        def functional2d_pressure(
-            index: Any,
-            timestep: Any,
-            missing_mask: Any,
-            f_0: Any,
-            f_1: Any,
-            f_pre: Any,
-            f_post: Any,
-        ):
-            # Post-streaming values are only modified at missing direction
-            _f = f_post
-
-            # Find normal vector
-            normals = get_normal_vectors_2d(missing_mask)
-
-            # calculate velocity
-            fsum = _get_fsum(_f, missing_mask)
-            unormal = -self.compute_dtype(1.0) + fsum / _rho
-            _u = unormal * normals
-
-            # impose non-equilibrium bounceback
-            feq = self.equilibrium_operator.warp_functional(_rho, _u)
-            _f = bounceback_nonequilibrium(_f, feq, missing_mask)
-            return _f
-
-        # Construct the warp kernel
-        @wp.kernel
-        def kernel2d(
-            f_pre: wp.array3d(dtype=Any),
-            f_post: wp.array3d(dtype=Any),
-            bc_mask: wp.array3d(dtype=wp.uint8),
-            missing_mask: wp.array3d(dtype=wp.bool),
-        ):
-            # Get the global index
-            i, j = wp.tid()
-            index = wp.vec2i(i, j)
-
-            # read tid data
-            _f_pre, _f_post, _boundary_id, _missing_mask = self._get_thread_data_2d(f_pre, f_post, bc_mask, missing_mask, index)
-
-            # Apply the boundary condition
-            if _boundary_id == wp.uint8(self.id):
-                timestep = 0
-                _f = functional(index, timestep, _missing_mask, f_pre, f_post, _f_pre, _f_post)
-            else:
-                _f = _f_post
-
-            # Write the distribution function
-            for l in range(self.velocity_set.q):
-                f_post[l, index[0], index[1]] = self.store_dtype(_f[l])
-
-        # Construct the warp kernel
-        @wp.kernel
-        def kernel3d(
-            f_pre: wp.array4d(dtype=Any),
-            f_post: wp.array4d(dtype=Any),
-            bc_mask: wp.array4d(dtype=wp.uint8),
-            missing_mask: wp.array4d(dtype=wp.bool),
-        ):
-            # Get the global index
-            i, j, k = wp.tid()
-            index = wp.vec3i(i, j, k)
-
-            # read tid data
-            _f_pre, _f_post, _boundary_id, _missing_mask = self._get_thread_data_3d(f_pre, f_post, bc_mask, missing_mask, index)
-
-            # Apply the boundary condition
-            if _boundary_id == wp.uint8(self.id):
-                timestep = 0
-                _f = functional(index, timestep, _missing_mask, f_pre, f_post, _f_pre, _f_post)
-            else:
-                _f = _f_post
-
-            # Write the distribution function
-            for l in range(self.velocity_set.q):
-                f_post[l, index[0], index[1], index[2]] = self.store_dtype(_f[l])
-
-        kernel = kernel3d if self.velocity_set.d == 3 else kernel2d
-        if self.velocity_set.d == 3 and self.bc_type == "velocity":
-            functional = functional3d_velocity
-        elif self.velocity_set.d == 3 and self.bc_type == "pressure":
-            functional = functional3d_pressure
+        if self.bc_type == "velocity":
+            functional = functional_velocity
+        elif self.bc_type == "pressure":
+            functional = functional_pressure
         elif self.bc_type == "velocity":
-            functional = functional2d_velocity
-        else:
-            functional = functional2d_pressure
+            functional = functional_pressure
+
+        kernel = self._construct_kernel(functional)
 
         return functional, kernel
 
diff --git a/xlb/operator/boundary_condition/boundary_condition.py b/xlb/operator/boundary_condition/boundary_condition.py
index 008cc9a..bf1eef2 100644
--- a/xlb/operator/boundary_condition/boundary_condition.py
+++ b/xlb/operator/boundary_condition/boundary_condition.py
@@ -75,32 +75,7 @@ def prepare_bc_auxilary_data(
             return f_post
 
         @wp.func
-        def _get_thread_data_2d(
-            f_pre: wp.array3d(dtype=Any),
-            f_post: wp.array3d(dtype=Any),
-            bc_mask: wp.array3d(dtype=wp.uint8),
-            missing_mask: wp.array3d(dtype=wp.bool),
-            index: wp.vec2i,
-        ):
-            # Get the boundary id and missing mask
-            _f_pre = _f_vec()
-            _f_post = _f_vec()
-            _boundary_id = bc_mask[0, index[0], index[1]]
-            _missing_mask = _missing_mask_vec()
-            for l in range(self.velocity_set.q):
-                # q-sized vector of populations
-                _f_pre[l] = self.compute_dtype(f_pre[l, index[0], index[1]])
-                _f_post[l] = self.compute_dtype(f_post[l, index[0], index[1]])
-
-                # TODO fix vec bool
-                if missing_mask[l, index[0], index[1]]:
-                    _missing_mask[l] = wp.uint8(1)
-                else:
-                    _missing_mask[l] = wp.uint8(0)
-            return _f_pre, _f_post, _boundary_id, _missing_mask
-
-        @wp.func
-        def _get_thread_data_3d(
+        def _get_thread_data(
             f_pre: wp.array4d(dtype=Any),
             f_post: wp.array4d(dtype=Any),
             bc_mask: wp.array4d(dtype=wp.uint8),
@@ -126,8 +101,7 @@ def _get_thread_data_3d(
 
         # Construct some helper warp functions for getting tid data
         if self.compute_backend == ComputeBackend.WARP:
-            self._get_thread_data_2d = _get_thread_data_2d
-            self._get_thread_data_3d = _get_thread_data_3d
+            self._get_thread_data = _get_thread_data
             self.prepare_bc_auxilary_data = prepare_bc_auxilary_data
 
     @partial(jit, static_argnums=(0,), inline=True)
@@ -137,3 +111,38 @@ def prepare_bc_auxilary_data(self, f_pre, f_post, bc_mask, missing_mask):
         currently being called after collision only.
         """
         return f_post
+
+    def _construct_kernel(self, functional):
+        """
+        Constructs the warp kernel for the boundary condition.
+        The functional is specific to each boundary condition and should be passed as an argument.
+        """
+        _id = wp.uint8(self.id)
+
+        # Construct the warp kernel
+        @wp.kernel
+        def kernel(
+            f_pre: wp.array4d(dtype=Any),
+            f_post: wp.array4d(dtype=Any),
+            bc_mask: wp.array4d(dtype=wp.uint8),
+            missing_mask: wp.array4d(dtype=wp.bool),
+        ):
+            # Get the global index
+            i, j, k = wp.tid()
+            index = wp.vec3i(i, j, k)
+
+            # read tid data
+            _f_pre, _f_post, _boundary_id, _missing_mask = self._get_thread_data(f_pre, f_post, bc_mask, missing_mask, index)
+
+            # Apply the boundary condition
+            if _boundary_id == _id:
+                timestep = 0
+                _f = functional(index, timestep, _missing_mask, f_pre, f_post, _f_pre, _f_post)
+            else:
+                _f = _f_post
+
+            # Write the result
+            for l in range(self.velocity_set.q):
+                f_post[l, index[0], index[1], index[2]] = self.store_dtype(_f[l])
+
+        return kernel
diff --git a/xlb/operator/boundary_masker/indices_boundary_masker.py b/xlb/operator/boundary_masker/indices_boundary_masker.py
index e7e50d7..0c1f7e1 100644
--- a/xlb/operator/boundary_masker/indices_boundary_masker.py
+++ b/xlb/operator/boundary_masker/indices_boundary_masker.py
@@ -31,18 +31,12 @@ def are_indices_in_interior(self, indices, shape):
         Check if each 2D or 3D index is inside the bounds of the domain with the given shape and not
         at its boundary.
 
-        :param indices: List of tuples, where each tuple contains indices for each dimension.
+        :param indices: Array of indices, where each column contains indices for each dimension.
         :param shape: Tuple representing the shape of the domain (nx, ny) for 2D or (nx, ny, nz) for 3D.
-        :return: List of boolean flags where each flag indicates whether the corresponding index is inside the bounds.
+        :return: Array of boolean flags where each flag indicates whether the corresponding index is inside the bounds.
         """
-        # Ensure that the number of dimensions in indices matches the domain shape
-        dim = len(shape)
-        if len(indices) != dim:
-            raise ValueError(f"Indices tuple must have {dim} dimensions to match the domain shape.")
-
-        # Check each index tuple and return a list of boolean flags
-        flags = [all(0 < idx[d] < shape[d] - 1 for d in range(dim)) for idx in np.array(indices).T]
-        return flags
+        shape_array = np.array(shape)
+        return np.all((indices > 0) & (indices < shape_array[:, np.newaxis] - 1), axis=0)
 
     @Operator.register_backend(ComputeBackend.JAX)
     # TODO HS: figure out why uncommenting the line below fails unlike other operators!
@@ -70,11 +64,12 @@ def jax_implementation(self, bclist, bc_mask, missing_mask, start_index=None):
             assert bc.indices is not None, f"Please specify indices associated with the {bc.__class__.__name__} BC!"
             assert bc.mesh_vertices is None, f"Please use MeshBoundaryMasker operator if {bc.__class__.__name__} is imposed on a mesh (e.g. STL)!"
             id_number = bc.id
-            local_indices = np.array(bc.indices) - np.array(start_index)[:, np.newaxis]
+            bc_indices = np.array(bc.indices)
+            local_indices = bc_indices - np.array(start_index)[:, np.newaxis]
             padded_indices = local_indices + np.array(shift_tup)[:, np.newaxis]
             bmap = bmap.at[tuple(padded_indices)].set(id_number)
-            if any(self.are_indices_in_interior(bc.indices, domain_shape)) and bc.needs_padding:
-                # checking if all indices associated with this BC are in the interior of the domain (not at the boundary).
+            if any(self.are_indices_in_interior(bc_indices, domain_shape)) and bc.needs_padding:
+                # checking if all indices associated with this BC are in the interior of the domain.
                 # This flag is needed e.g. if the no-slip geometry is anywhere but at the boundaries of the computational domain.
                 if dim == 2:
                     grid_mask = grid_mask.at[:, padded_indices[0], padded_indices[1]].set(True)
@@ -108,59 +103,9 @@ def check_index_bounds(index: wp.vec3i, shape: wp.vec3i):
             is_in_bounds = index[0] >= 0 and index[0] < shape[0] and index[1] >= 0 and index[1] < shape[1] and index[2] >= 0 and index[2] < shape[2]
             return is_in_bounds
 
-        # Construct the warp 2D kernel
-        @wp.kernel
-        def kernel2d(
-            indices: wp.array2d(dtype=wp.int32),
-            id_number: wp.array1d(dtype=wp.uint8),
-            is_interior: wp.array1d(dtype=wp.bool),
-            bc_mask: wp.array3d(dtype=wp.uint8),
-            missing_mask: wp.array3d(dtype=wp.bool),
-            start_index: wp.vec2i,
-        ):
-            # Get the index of indices
-            ii = wp.tid()
-
-            # Get local indices
-            index = wp.vec2i()
-            index[0] = indices[0, ii] - start_index[0]
-            index[1] = indices[1, ii] - start_index[1]
-
-            # Check if index is in bounds
-            if index[0] >= 0 and index[0] < missing_mask.shape[1] and index[1] >= 0 and index[1] < missing_mask.shape[2]:
-                # Stream indices
-                for l in range(_q):
-                    # Get the index of the streaming direction
-                    pull_index = wp.vec2i()
-                    push_index = wp.vec2i()
-                    for d in range(self.velocity_set.d):
-                        pull_index[d] = index[d] - _c[d, l]
-                        push_index[d] = index[d] + _c[d, l]
-
-                    # set bc_mask for all bc indices
-                    bc_mask[0, index[0], index[1]] = id_number[ii]
-
-                    # check if pull index is out of bound
-                    # These directions will have missing information after streaming
-                    if pull_index[0] < 0 or pull_index[0] >= missing_mask.shape[1] or pull_index[1] < 0 or pull_index[1] >= missing_mask.shape[2]:
-                        # Set the missing mask
-                        missing_mask[l, index[0], index[1]] = True
-
-                    # handling geometries in the interior of the computational domain
-                    elif (
-                        is_interior[ii]
-                        and push_index[0] >= 0
-                        and push_index[0] < missing_mask.shape[1]
-                        and push_index[1] >= 0
-                        and push_index[1] < missing_mask.shape[2]
-                    ):
-                        # Set the missing mask
-                        missing_mask[l, push_index[0], push_index[1]] = True
-                        bc_mask[0, push_index[0], push_index[1]] = id_number[ii]
-
         # Construct the warp 3D kernel
         @wp.kernel
-        def kernel3d(
+        def kernel(
             indices: wp.array2d(dtype=wp.int32),
             id_number: wp.array1d(dtype=wp.uint8),
             is_interior: wp.array1d(dtype=wp.bool),
@@ -204,47 +149,72 @@ def kernel3d(
                         missing_mask[l, push_index[0], push_index[1], push_index[2]] = True
                         bc_mask[0, push_index[0], push_index[1], push_index[2]] = id_number[ii]
 
-        kernel = kernel3d if self.velocity_set.d == 3 else kernel2d
-
         return None, kernel
 
     @Operator.register_backend(ComputeBackend.WARP)
     def warp_implementation(self, bclist, bc_mask, missing_mask, start_index=None):
-        dim = self.velocity_set.d
-        index_list = [[] for _ in range(dim)]
-        id_list = []
-        is_interior = []
+        # Pre-allocate arrays with maximum possible size
+        max_size = sum(len(bc.indices[0]) if isinstance(bc.indices, list) else bc.indices.shape[1] for bc in bclist if bc.indices is not None)
+        indices = np.zeros((3, max_size), dtype=np.int32)
+        id_numbers = np.zeros(max_size, dtype=np.uint8)
+        is_interior = np.zeros(max_size, dtype=bool)
+
+        current_index = 0
         for bc in bclist:
             assert bc.indices is not None, f'Please specify indices associated with the {bc.__class__.__name__} BC using keyword "indices"!'
             assert bc.mesh_vertices is None, f"Please use MeshBoundaryMasker operator if {bc.__class__.__name__} is imposed on a mesh (e.g. STL)!"
-            for d in range(dim):
-                index_list[d] += bc.indices[d]
-            id_list += [bc.id] * len(bc.indices[0])
-            is_interior += self.are_indices_in_interior(bc.indices, bc_mask[0].shape) if bc.needs_padding else [False] * len(bc.indices[0])
 
-            # We are done with bc.indices. Remove them from BC objects
+            bc_indices = np.asarray(bc.indices)
+            num_indices = bc_indices.shape[1]
+
+            # Ensure indices are 3D
+            if bc_indices.shape[0] == 2:
+                bc_indices = np.vstack([bc_indices, np.zeros(num_indices, dtype=int)])
+
+            # Add indices to the pre-allocated array
+            indices[:, current_index : current_index + num_indices] = bc_indices
+
+            # Set id numbers
+            id_numbers[current_index : current_index + num_indices] = bc.id
+
+            # Set is_interior flags
+            if bc.needs_padding:
+                is_interior[current_index : current_index + num_indices] = self.are_indices_in_interior(bc_indices, bc_mask[0].shape)
+            else:
+                is_interior[current_index : current_index + num_indices] = False
+
+            current_index += num_indices
+
+            # Remove indices from BC objects
             bc.__dict__.pop("indices", None)
 
-        # convert to warp arrays
-        indices = wp.array2d(index_list, dtype=wp.int32)
-        id_number = wp.array1d(id_list, dtype=wp.uint8)
-        is_interior = wp.array1d(is_interior, dtype=wp.bool)
+        # Trim arrays to actual size
+        indices = indices[:, :current_index]
+        id_numbers = id_numbers[:current_index]
+        is_interior = is_interior[:current_index]
+
+        # Convert to Warp arrays
+        wp_indices = wp.array(indices, dtype=wp.int32)
+        wp_id_numbers = wp.array(id_numbers, dtype=wp.uint8)
+        wp_is_interior = wp.array(is_interior, dtype=wp.bool)
 
         if start_index is None:
-            start_index = (0,) * dim
+            start_index = wp.vec3i(0, 0, 0)
+        else:
+            start_index = wp.vec3i(*start_index)
 
         # Launch the warp kernel
         wp.launch(
             self.warp_kernel,
+            dim=current_index,
             inputs=[
-                indices,
-                id_number,
-                is_interior,
+                wp_indices,
+                wp_id_numbers,
+                wp_is_interior,
                 bc_mask,
                 missing_mask,
                 start_index,
             ],
-            dim=indices.shape[1],
         )
 
         return bc_mask, missing_mask
diff --git a/xlb/operator/collision/bgk.py b/xlb/operator/collision/bgk.py
index 60f63ef..115ed9a 100644
--- a/xlb/operator/collision/bgk.py
+++ b/xlb/operator/collision/bgk.py
@@ -36,34 +36,7 @@ def functional(f: Any, feq: Any, rho: Any, u: Any):
 
         # Construct the warp kernel
         @wp.kernel
-        def kernel2d(
-            f: wp.array3d(dtype=Any),
-            feq: wp.array3d(dtype=Any),
-            fout: wp.array3d(dtype=Any),
-            rho: wp.array3d(dtype=Any),
-            u: wp.array3d(dtype=Any),
-        ):
-            # Get the global index
-            i, j = wp.tid()
-            index = wp.vec2i(i, j)
-
-            # Load needed values
-            _f = _f_vec()
-            _feq = _f_vec()
-            for l in range(self.velocity_set.q):
-                _f[l] = f[l, index[0], index[1]]
-                _feq[l] = feq[l, index[0], index[1]]
-
-            # Compute the collision
-            _fout = functional(_f, _feq, rho, u)
-
-            # Write the result
-            for l in range(self.velocity_set.q):
-                fout[l, index[0], index[1]] = self.store_dtype(_fout[l])
-
-        # Construct the warp kernel
-        @wp.kernel
-        def kernel3d(
+        def kernel(
             f: wp.array4d(dtype=Any),
             feq: wp.array4d(dtype=Any),
             fout: wp.array4d(dtype=Any),
@@ -88,8 +61,6 @@ def kernel3d(
             for l in range(self.velocity_set.q):
                 fout[l, index[0], index[1], index[2]] = self.store_dtype(_fout[l])
 
-        kernel = kernel3d if self.velocity_set.d == 3 else kernel2d
-
         return functional, kernel
 
     @Operator.register_backend(ComputeBackend.WARP)
diff --git a/xlb/operator/collision/forced_collision.py b/xlb/operator/collision/forced_collision.py
index 31ef392..2036bab 100644
--- a/xlb/operator/collision/forced_collision.py
+++ b/xlb/operator/collision/forced_collision.py
@@ -52,39 +52,7 @@ def functional(f: Any, feq: Any, rho: Any, u: Any):
 
         # Construct the warp kernel
         @wp.kernel
-        def kernel2d(
-            f: wp.array3d(dtype=Any),
-            feq: wp.array3d(dtype=Any),
-            fout: wp.array3d(dtype=Any),
-            rho: wp.array3d(dtype=Any),
-            u: wp.array3d(dtype=Any),
-        ):
-            # Get the global index
-            i, j = wp.tid()
-            index = wp.vec2i(i, j)  # TODO: Warp needs to fix this
-
-            # Load needed values
-            _f = _f_vec()
-            _feq = _f_vec()
-            _d = self.velocity_set.d
-            for l in range(self.velocity_set.q):
-                _f[l] = f[l, index[0], index[1]]
-                _feq[l] = feq[l, index[0], index[1]]
-            _u = _u_vec()
-            for l in range(_d):
-                _u[l] = u[l, index[0], index[1]]
-            _rho = rho[0, index[0], index[1]]
-
-            # Compute the collision
-            _fout = functional(_f, _feq, _rho, _u)
-
-            # Write the result
-            for l in range(self.velocity_set.q):
-                fout[l, index[0], index[1]] = _fout[l]
-
-        # Construct the warp kernel
-        @wp.kernel
-        def kernel3d(
+        def kernel(
             f: wp.array4d(dtype=Any),
             feq: wp.array4d(dtype=Any),
             fout: wp.array4d(dtype=Any),
@@ -114,8 +82,6 @@ def kernel3d(
             for l in range(self.velocity_set.q):
                 fout[l, index[0], index[1], index[2]] = _fout[l]
 
-        kernel = kernel3d if self.velocity_set.d == 3 else kernel2d
-
         return functional, kernel
 
     @Operator.register_backend(ComputeBackend.WARP)
diff --git a/xlb/operator/collision/kbc.py b/xlb/operator/collision/kbc.py
index bbc5d15..5c5c29e 100644
--- a/xlb/operator/collision/kbc.py
+++ b/xlb/operator/collision/kbc.py
@@ -263,30 +263,7 @@ def entropic_scalar_product(
 
         # Construct the functional
         @wp.func
-        def functional2d(
-            f: Any,
-            feq: Any,
-            rho: Any,
-            u: Any,
-        ):
-            # Compute shear and delta_s
-            fneq = f - feq
-            shear = decompose_shear_d2q9(fneq)
-            delta_s = shear * rho  # TODO: Check this
-
-            # Perform collision
-            delta_h = fneq - delta_s
-            two = self.compute_dtype(2.0)
-            gamma = _inv_beta - (two - _inv_beta) * entropic_scalar_product(delta_s, delta_h, feq) / (
-                _epsilon + entropic_scalar_product(delta_h, delta_h, feq)
-            )
-            fout = f - _beta * (two * delta_s + gamma * delta_h)
-
-            return fout
-
-        # Construct the functional
-        @wp.func
-        def functional3d(
+        def functional(
             f: Any,
             feq: Any,
             rho: Any,
@@ -309,39 +286,7 @@ def functional3d(
 
         # Construct the warp kernel
         @wp.kernel
-        def kernel2d(
-            f: wp.array3d(dtype=Any),
-            feq: wp.array3d(dtype=Any),
-            fout: wp.array3d(dtype=Any),
-            rho: wp.array3d(dtype=Any),
-            u: wp.array3d(dtype=Any),
-        ):
-            # Get the global index
-            i, j = wp.tid()
-            index = wp.vec2i(i, j)  # TODO: Warp needs to fix this
-
-            # Load needed values
-            _f = _f_vec()
-            _feq = _f_vec()
-            _d = self.velocity_set.d
-            for l in range(self.velocity_set.q):
-                _f[l] = f[l, index[0], index[1]]
-                _feq[l] = feq[l, index[0], index[1]]
-            _u = _u_vec()
-            for l in range(_d):
-                _u[l] = u[l, index[0], index[1]]
-            _rho = rho[0, index[0], index[1]]
-
-            # Compute the collision
-            _fout = functional(_f, _feq, _rho, _u)
-
-            # Write the result
-            for l in range(self.velocity_set.q):
-                fout[l, index[0], index[1]] = self.store_dtype(_fout[l])
-
-        # Construct the warp kernel
-        @wp.kernel
-        def kernel3d(
+        def kernel(
             f: wp.array4d(dtype=Any),
             feq: wp.array4d(dtype=Any),
             fout: wp.array4d(dtype=Any),
@@ -371,9 +316,6 @@ def kernel3d(
             for l in range(self.velocity_set.q):
                 fout[l, index[0], index[1], index[2]] = self.store_dtype(_fout[l])
 
-        functional = functional3d if self.velocity_set.d == 3 else functional2d
-        kernel = kernel3d if self.velocity_set.d == 3 else kernel2d
-
         return functional, kernel
 
     @Operator.register_backend(ComputeBackend.WARP)
diff --git a/xlb/operator/equilibrium/quadratic_equilibrium.py b/xlb/operator/equilibrium/quadratic_equilibrium.py
index ba337f0..62cc041 100644
--- a/xlb/operator/equilibrium/quadratic_equilibrium.py
+++ b/xlb/operator/equilibrium/quadratic_equilibrium.py
@@ -61,7 +61,7 @@ def functional(
 
         # Construct the warp kernel
         @wp.kernel
-        def kernel3d(
+        def kernel(
             rho: wp.array4d(dtype=Any),
             u: wp.array4d(dtype=Any),
             f: wp.array4d(dtype=Any),
@@ -81,29 +81,6 @@ def kernel3d(
             for l in range(self.velocity_set.q):
                 f[l, index[0], index[1], index[2]] = self.store_dtype(feq[l])
 
-        @wp.kernel
-        def kernel2d(
-            rho: wp.array3d(dtype=Any),
-            u: wp.array3d(dtype=Any),
-            f: wp.array3d(dtype=Any),
-        ):
-            # Get the global index
-            i, j = wp.tid()
-            index = wp.vec2i(i, j)
-
-            # Get the equilibrium
-            _u = _u_vec()
-            for d in range(self.velocity_set.d):
-                _u[d] = u[d, index[0], index[1]]
-            _rho = rho[0, index[0], index[1]]
-            feq = functional(_rho, _u)
-
-            # Set the output
-            for l in range(self.velocity_set.q):
-                f[l, index[0], index[1]] = self.store_dtype(feq[l])
-
-        kernel = kernel3d if self.velocity_set.d == 3 else kernel2d
-
         return functional, kernel
 
     @Operator.register_backend(ComputeBackend.WARP)
diff --git a/xlb/operator/force/exact_difference_force.py b/xlb/operator/force/exact_difference_force.py
index b4da602..ec1ef5b 100644
--- a/xlb/operator/force/exact_difference_force.py
+++ b/xlb/operator/force/exact_difference_force.py
@@ -86,33 +86,7 @@ def functional(f_postcollision: Any, feq: Any, rho: Any, u: Any):
 
         # Construct the warp kernel
         @wp.kernel
-        def kernel2d(
-            f_postcollision: Any,
-            feq: Any,
-            fout: wp.array3d(dtype=Any),
-            rho: wp.array3d(dtype=Any),
-            u: wp.array3d(dtype=Any),
-        ):
-            # Get the global index
-            i, j = wp.tid()
-            index = wp.vec2i(i, j)
-
-            # Load needed values
-            _u = _u_vec()
-            for l in range(_d):
-                _u[l] = u[l, index[0], index[1]]
-            _rho = rho[0, index[0], index[1]]
-
-            # Compute the collision
-            _fout = functional(f_postcollision, feq, _rho, _u)
-
-            # Write the result
-            for l in range(self.velocity_set.q):
-                fout[l, index[0], index[1]] = self.store_dtype(_fout[l])
-
-        # Construct the warp kernel
-        @wp.kernel
-        def kernel3d(
+        def kernel(
             f_postcollision: Any,
             feq: Any,
             fout: wp.array4d(dtype=Any),
@@ -136,7 +110,6 @@ def kernel3d(
             for l in range(self.velocity_set.q):
                 fout[l, index[0], index[1], index[2]] = self.store_dtype(_fout[l])
 
-        kernel = kernel3d if self.velocity_set.d == 3 else kernel2d
         return functional, kernel
 
     @Operator.register_backend(ComputeBackend.WARP)
diff --git a/xlb/operator/force/momentum_transfer.py b/xlb/operator/force/momentum_transfer.py
index 8b0aacf..dbd5307 100644
--- a/xlb/operator/force/momentum_transfer.py
+++ b/xlb/operator/force/momentum_transfer.py
@@ -102,62 +102,7 @@ def _construct_warp(self):
 
         # Construct the warp kernel
         @wp.kernel
-        def kernel2d(
-            f_0: wp.array3d(dtype=Any),
-            f_1: wp.array3d(dtype=Any),
-            bc_mask: wp.array3d(dtype=wp.uint8),
-            missing_mask: wp.array3d(dtype=wp.bool),
-            force: wp.array(dtype=Any),
-        ):
-            # Get the global index
-            i, j = wp.tid()
-            index = wp.vec2i(i, j)
-
-            # Get the boundary id
-            _boundary_id = bc_mask[0, index[0], index[1]]
-            _missing_mask = _missing_mask_vec()
-            for l in range(self.velocity_set.q):
-                # TODO fix vec bool
-                if missing_mask[l, index[0], index[1]]:
-                    _missing_mask[l] = wp.uint8(1)
-                else:
-                    _missing_mask[l] = wp.uint8(0)
-
-            # Determin if boundary is an edge by checking if center is missing
-            is_edge = wp.bool(False)
-            if _boundary_id == wp.uint8(_no_slip_id):
-                if _missing_mask[_zero_index] == wp.uint8(0):
-                    is_edge = wp.bool(True)
-
-            # If the boundary is an edge then add the momentum transfer
-            m = _u_vec()
-            if is_edge:
-                # Get the distribution function
-                f_post_collision = _f_vec()
-                for l in range(self.velocity_set.q):
-                    f_post_collision[l] = f_0[l, index[0], index[1]]
-
-                # Apply streaming (pull method)
-                timestep = 0
-                f_post_stream = self.stream.warp_functional(f_0, index)
-                f_post_stream = self.no_slip_bc_instance.warp_functional(index, timestep, _missing_mask, f_0, f_1, f_post_collision, f_post_stream)
-
-                # Compute the momentum transfer
-                for d in range(self.velocity_set.d):
-                    m[d] = self.compute_dtype(0.0)
-                    for l in range(self.velocity_set.q):
-                        if _missing_mask[l] == wp.uint8(1):
-                            phi = f_post_collision[_opp_indices[l]] + f_post_stream[l]
-                            if _c[d, _opp_indices[l]] == 1:
-                                m[d] += phi
-                            elif _c[d, _opp_indices[l]] == -1:
-                                m[d] -= phi
-
-            wp.atomic_add(force, 0, m)
-
-        # Construct the warp kernel
-        @wp.kernel
-        def kernel3d(
+        def kernel(
             f_0: wp.array4d(dtype=Any),
             f_1: wp.array4d(dtype=Any),
             bc_mask: wp.array4d(dtype=wp.uint8),
@@ -210,9 +155,6 @@ def kernel3d(
 
             wp.atomic_add(force, 0, m)
 
-        # Return the correct kernel
-        kernel = kernel3d if self.velocity_set.d == 3 else kernel2d
-
         return None, kernel
 
     @Operator.register_backend(ComputeBackend.WARP)
diff --git a/xlb/operator/macroscopic/first_moment.py b/xlb/operator/macroscopic/first_moment.py
index 329a71f..cb99a9f 100644
--- a/xlb/operator/macroscopic/first_moment.py
+++ b/xlb/operator/macroscopic/first_moment.py
@@ -38,7 +38,7 @@ def functional(
             return u
 
         @wp.kernel
-        def kernel3d(
+        def kernel(
             f: wp.array4d(dtype=Any),
             rho: wp.array4d(dtype=Any),
             u: wp.array4d(dtype=Any),
@@ -55,26 +55,6 @@ def kernel3d(
             for d in range(self.velocity_set.d):
                 u[d, index[0], index[1], index[2]] = self.store_dtype(_u[d])
 
-        @wp.kernel
-        def kernel2d(
-            f: wp.array3d(dtype=Any),
-            rho: wp.array3d(dtype=Any),
-            u: wp.array3d(dtype=Any),
-        ):
-            i, j = wp.tid()
-            index = wp.vec2i(i, j)
-
-            _f = _f_vec()
-            for l in range(self.velocity_set.q):
-                _f[l] = f[l, index[0], index[1]]
-            _rho = rho[0, index[0], index[1]]
-            _u = functional(_f, _rho)
-
-            for d in range(self.velocity_set.d):
-                u[d, index[0], index[1]] = self.store_dtype(_u[d])
-
-        kernel = kernel3d if self.velocity_set.d == 3 else kernel2d
-
         return functional, kernel
 
     @Operator.register_backend(ComputeBackend.WARP)
diff --git a/xlb/operator/macroscopic/macroscopic.py b/xlb/operator/macroscopic/macroscopic.py
index b574436..ab1193b 100644
--- a/xlb/operator/macroscopic/macroscopic.py
+++ b/xlb/operator/macroscopic/macroscopic.py
@@ -37,7 +37,7 @@ def functional(f: _f_vec):
             return rho, u
 
         @wp.kernel
-        def kernel3d(
+        def kernel(
             f: wp.array4d(dtype=Any),
             rho: wp.array4d(dtype=Any),
             u: wp.array4d(dtype=Any),
@@ -54,26 +54,6 @@ def kernel3d(
             for d in range(self.velocity_set.d):
                 u[d, index[0], index[1], index[2]] = self.store_dtype(_u[d])
 
-        @wp.kernel
-        def kernel2d(
-            f: wp.array3d(dtype=Any),
-            rho: wp.array3d(dtype=Any),
-            u: wp.array3d(dtype=Any),
-        ):
-            i, j = wp.tid()
-            index = wp.vec2i(i, j)
-
-            _f = _f_vec()
-            for l in range(self.velocity_set.q):
-                _f[l] = f[l, index[0], index[1]]
-            _rho, _u = functional(_f)
-
-            rho[0, index[0], index[1]] = self.store_dtype(_rho)
-            for d in range(self.velocity_set.d):
-                u[d, index[0], index[1]] = self.store_dtype(_u[d])
-
-        kernel = kernel3d if self.velocity_set.d == 3 else kernel2d
-
         return functional, kernel
 
     @Operator.register_backend(ComputeBackend.WARP)
diff --git a/xlb/operator/macroscopic/second_moment.py b/xlb/operator/macroscopic/second_moment.py
index 687b38a..6c7e70e 100644
--- a/xlb/operator/macroscopic/second_moment.py
+++ b/xlb/operator/macroscopic/second_moment.py
@@ -79,7 +79,7 @@ def functional(
 
         # Construct the kernel
         @wp.kernel
-        def kernel3d(
+        def kernel(
             f: wp.array4d(dtype=Any),
             pi: wp.array4d(dtype=Any),
         ):
@@ -97,27 +97,6 @@ def kernel3d(
             for d in range(_pi_dim):
                 pi[d, index[0], index[1], index[2]] = self.store_dtype(_pi[d])
 
-        @wp.kernel
-        def kernel2d(
-            f: wp.array3d(dtype=Any),
-            pi: wp.array3d(dtype=Any),
-        ):
-            # Get the global index
-            i, j = wp.tid()
-            index = wp.vec2i(i, j)
-
-            # Get the equilibrium
-            _f = _f_vec()
-            for l in range(self.velocity_set.q):
-                _f[l] = f[l, index[0], index[1]]
-            _pi = functional(_f)
-
-            # Set the output
-            for d in range(_pi_dim):
-                pi[d, index[0], index[1]] = self.store_dtype(_pi[d])
-
-        kernel = kernel3d if self.velocity_set.d == 3 else kernel2d
-
         return functional, kernel
 
     @Operator.register_backend(ComputeBackend.WARP)
diff --git a/xlb/operator/macroscopic/zero_moment.py b/xlb/operator/macroscopic/zero_moment.py
index d0fbf51..8abb4de 100644
--- a/xlb/operator/macroscopic/zero_moment.py
+++ b/xlb/operator/macroscopic/zero_moment.py
@@ -27,7 +27,7 @@ def functional(f: _f_vec):
             return rho
 
         @wp.kernel
-        def kernel3d(
+        def kernel(
             f: wp.array4d(dtype=Any),
             rho: wp.array4d(dtype=Any),
         ):
@@ -41,23 +41,6 @@ def kernel3d(
 
             rho[0, index[0], index[1], index[2]] = _rho
 
-        @wp.kernel
-        def kernel2d(
-            f: wp.array3d(dtype=Any),
-            rho: wp.array3d(dtype=Any),
-        ):
-            i, j = wp.tid()
-            index = wp.vec2i(i, j)
-
-            _f = _f_vec()
-            for l in range(self.velocity_set.q):
-                _f[l] = f[l, index[0], index[1]]
-            _rho = functional(_f)
-
-            rho[0, index[0], index[1]] = _rho
-
-        kernel = kernel3d if self.velocity_set.d == 3 else kernel2d
-
         return functional, kernel
 
     @Operator.register_backend(ComputeBackend.WARP)
diff --git a/xlb/operator/stepper/nse_stepper.py b/xlb/operator/stepper/nse_stepper.py
index 99431eb..e08e95c 100644
--- a/xlb/operator/stepper/nse_stepper.py
+++ b/xlb/operator/stepper/nse_stepper.py
@@ -140,27 +140,7 @@ def apply_bc(
             return f_result
 
         @wp.func
-        def get_thread_data_2d(
-            f0_buffer: wp.array3d(dtype=Any),
-            f1_buffer: wp.array3d(dtype=Any),
-            missing_mask: wp.array3d(dtype=Any),
-            index: Any,
-        ):
-            # Read thread data for populations and missing mask
-            _f0_thread = _f_vec()
-            _f1_thread = _f_vec()
-            _missing_mask = _missing_mask_vec()
-            for l in range(self.velocity_set.q):
-                _f0_thread[l] = self.compute_dtype(f0_buffer[l, index[0], index[1]])
-                _f1_thread[l] = self.compute_dtype(f1_buffer[l, index[0], index[1]])
-                if missing_mask[l, index[0], index[1]]:
-                    _missing_mask[l] = wp.uint8(1)
-                else:
-                    _missing_mask[l] = wp.uint8(0)
-            return _f0_thread, _f1_thread, _missing_mask
-
-        @wp.func
-        def get_thread_data_3d(
+        def get_thread_data(
             f0_buffer: wp.array4d(dtype=Any),
             f1_buffer: wp.array4d(dtype=Any),
             missing_mask: wp.array4d(dtype=Any),
@@ -182,47 +162,7 @@ def get_thread_data_3d(
             return _f0_thread, _f1_thread, _missing_mask
 
         @wp.kernel
-        def kernel2d(
-            f_0: wp.array3d(dtype=Any),
-            f_1: wp.array3d(dtype=Any),
-            bc_mask: wp.array3d(dtype=Any),
-            missing_mask: wp.array3d(dtype=Any),
-            timestep: int,
-        ):
-            i, j = wp.tid()
-            index = wp.vec2i(i, j)
-
-            _boundary_id = bc_mask[0, index[0], index[1]]
-            if _boundary_id == wp.uint8(255):
-                return
-
-            # Apply streaming
-            _f_post_stream = self.stream.warp_functional(f_0, index)
-
-            _f0_thread, _f1_thread, _missing_mask = get_thread_data_2d(f_0, f_1, missing_mask, index)
-            _f_post_collision = _f0_thread
-
-            # Apply post-streaming boundary conditions
-            _f_post_stream = apply_bc(index, timestep, _boundary_id, _missing_mask, f_0, f_1, _f_post_collision, _f_post_stream, True)
-
-            # Compute rho and u
-            _rho, _u = self.macroscopic.warp_functional(_f_post_stream)
-
-            # Compute equilibrium
-            _feq = self.equilibrium.warp_functional(_rho, _u)
-
-            # Apply collision
-            _f_post_collision = self.collision.warp_functional(_f_post_stream, _feq, _rho, _u)
-
-            # Apply post-collision boundary conditions
-            _f_post_collision = apply_bc(index, timestep, _boundary_id, _missing_mask, f_0, f_1, _f_post_stream, _f_post_collision, False)
-
-            # Store the result in f_1
-            for l in range(self.velocity_set.q):
-                f_1[l, index[0], index[1]] = self.store_dtype(_f_post_collision[l])
-
-        @wp.kernel
-        def kernel3d(
+        def kernel(
             f_0: wp.array4d(dtype=Any),
             f_1: wp.array4d(dtype=Any),
             bc_mask: wp.array4d(dtype=Any),
@@ -239,7 +179,7 @@ def kernel3d(
             # Apply streaming
             _f_post_stream = self.stream.warp_functional(f_0, index)
 
-            _f0_thread, _f1_thread, _missing_mask = get_thread_data_3d(f_0, f_1, missing_mask, index)
+            _f0_thread, _f1_thread, _missing_mask = get_thread_data(f_0, f_1, missing_mask, index)
             _f_post_collision = _f0_thread
 
             # Apply post-streaming boundary conditions
@@ -261,9 +201,6 @@ def kernel3d(
                             f_0[_opp_indices[l], index[0], index[1], index[2]] = self.store_dtype(_f1_thread[_opp_indices[l]])
                 f_1[l, index[0], index[1], index[2]] = self.store_dtype(_f_post_collision[l])
 
-        # Return the correct kernel
-        kernel = kernel3d if self.velocity_set.d == 3 else kernel2d
-
         return None, kernel
 
     @Operator.register_backend(ComputeBackend.WARP)
diff --git a/xlb/operator/stream/stream.py b/xlb/operator/stream/stream.py
index dc2417a..247fa5a 100644
--- a/xlb/operator/stream/stream.py
+++ b/xlb/operator/stream/stream.py
@@ -55,50 +55,9 @@ def _construct_warp(self):
         _c = self.velocity_set.c
         _f_vec = wp.vec(self.velocity_set.q, dtype=self.compute_dtype)
 
-        # Construct the warp functional
-        @wp.func
-        def functional2d(
-            f: wp.array3d(dtype=Any),
-            index: Any,
-        ):
-            # Pull the distribution function
-            _f = _f_vec()
-            for l in range(self.velocity_set.q):
-                # Get pull index
-                pull_index = type(index)()
-                for d in range(self.velocity_set.d):
-                    pull_index[d] = index[d] - _c[d, l]
-
-                    # impose periodicity for out of bound values
-                    if pull_index[d] < 0:
-                        pull_index[d] = f.shape[d + 1] - 1
-                    elif pull_index[d] >= f.shape[d + 1]:
-                        pull_index[d] = 0
-
-                # Read the distribution function
-                _f[l] = self.compute_dtype(f[l, pull_index[0], pull_index[1]])
-
-            return _f
-
-        @wp.kernel
-        def kernel2d(
-            f_0: wp.array3d(dtype=Any),
-            f_1: wp.array3d(dtype=Any),
-        ):
-            # Get the global index
-            i, j = wp.tid()
-            index = wp.vec2i(i, j)
-
-            # Set the output
-            _f = functional2d(f_0, index)
-
-            # Write the output
-            for l in range(self.velocity_set.q):
-                f_1[l, index[0], index[1]] = self.store_dtype(_f[l])
-
         # Construct the funcional to get streamed indices
         @wp.func
-        def functional3d(
+        def functional(
             f: wp.array4d(dtype=Any),
             index: Any,
         ):
@@ -124,7 +83,7 @@ def functional3d(
 
         # Construct the warp kernel
         @wp.kernel
-        def kernel3d(
+        def kernel(
             f_0: wp.array4d(dtype=Any),
             f_1: wp.array4d(dtype=Any),
         ):
@@ -133,15 +92,12 @@ def kernel3d(
             index = wp.vec3i(i, j, k)
 
             # Set the output
-            _f = functional3d(f_0, index)
+            _f = functional(f_0, index)
 
             # Write the output
             for l in range(self.velocity_set.q):
                 f_1[l, index[0], index[1], index[2]] = self.store_dtype(_f[l])
 
-        functional = functional3d if self.velocity_set.d == 3 else functional2d
-        kernel = kernel3d if self.velocity_set.d == 3 else kernel2d
-
         return functional, kernel
 
     @Operator.register_backend(ComputeBackend.WARP)