From de67c099e5c224186e828495ca7d8de4a8e31e0b Mon Sep 17 00:00:00 2001
From: Mehdi Ataei <mehdi.ataei@autodesk.com>
Date: Fri, 18 Oct 2024 17:53:22 -0400
Subject: [PATCH 1/4] merged 2D and 3D Warp kernels

---
 examples/cfd/lid_driven_cavity_2d.py          |   3 +-
 requirements.txt                              |  25 ++-
 tests/grids/test_grid_warp.py                 |   7 +-
 tests/kernels/stream/test_stream_warp.py      |   7 +-
 xlb/grid/warp_grid.py                         |   4 +-
 .../boundary_condition/bc_do_nothing.py       |  65 +-------
 .../boundary_condition/bc_equilibrium.py      |  66 +-------
 .../bc_extrapolation_outflow.py               | 121 +-------------
 .../bc_fullway_bounce_back.py                 |  64 +------
 .../bc_grads_approximation.py                 |  41 +----
 .../bc_halfway_bounce_back.py                 |  65 +-------
 .../boundary_condition/bc_regularized.py      | 156 ++----------------
 xlb/operator/boundary_condition/bc_zouhe.py   | 150 ++---------------
 .../boundary_condition/boundary_condition.py  |  30 +---
 .../indices_boundary_masker.py                | 137 ++++++---------
 xlb/operator/collision/bgk.py                 |  31 +---
 xlb/operator/collision/forced_collision.py    |  36 +---
 xlb/operator/collision/kbc.py                 |  62 +------
 .../equilibrium/quadratic_equilibrium.py      |  25 +--
 xlb/operator/force/exact_difference_force.py  |  29 +---
 xlb/operator/force/momentum_transfer.py       |  60 +------
 xlb/operator/macroscopic/first_moment.py      |  22 +--
 xlb/operator/macroscopic/macroscopic.py       |  22 +--
 xlb/operator/macroscopic/second_moment.py     |  23 +--
 xlb/operator/macroscopic/zero_moment.py       |  19 +--
 xlb/operator/stepper/nse_stepper.py           |  69 +-------
 xlb/operator/stream/stream.py                 |  50 +-----
 27 files changed, 127 insertions(+), 1262 deletions(-)

diff --git a/examples/cfd/lid_driven_cavity_2d.py b/examples/cfd/lid_driven_cavity_2d.py
index 20f3b7c..383a110 100644
--- a/examples/cfd/lid_driven_cavity_2d.py
+++ b/examples/cfd/lid_driven_cavity_2d.py
@@ -77,7 +77,8 @@ def run(self, num_steps, post_process_interval=100):
     def post_process(self, i):
         # Write the results. We'll use JAX backend for the post-processing
         if not isinstance(self.f_0, jnp.ndarray):
-            f_0 = wp.to_jax(self.f_0)
+            # The Warp backend stores 2D fields with a trailing singleton dimension; drop it before post-processing
+            f_0 = wp.to_jax(self.f_0)[..., 0]
         else:
             f_0 = self.f_0
 
diff --git a/requirements.txt b/requirements.txt
index ee107af..4d0cd2c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,15 +1,10 @@
-jax==0.4.20
-jaxlib==0.4.20
-matplotlib==3.8.0
-numpy==1.26.1
-pyvista==0.43.4
-Rtree==1.0.1
-trimesh==4.4.1
-orbax-checkpoint==0.4.1
-termcolor==2.3.0
-PhantomGaze @ git+https://github.com/loliverhennigh/PhantomGaze.git
-tqdm==4.66.2
-warp-lang==1.0.2
-numpy-stl==3.1.1
-pydantic==2.7.0
-ruff==0.5.6
\ No newline at end of file
+jax[cuda]
+matplotlib
+numpy
+pyvista
+Rtree
+trimesh
+warp-lang
+numpy-stl
+pydantic
+ruff
\ No newline at end of file
diff --git a/tests/grids/test_grid_warp.py b/tests/grids/test_grid_warp.py
index 782434d..61b27d4 100644
--- a/tests/grids/test_grid_warp.py
+++ b/tests/grids/test_grid_warp.py
@@ -22,8 +22,10 @@ def test_warp_grid_create_field(grid_size):
         init_xlb_env(xlb.velocity_set.D3Q19)
         my_grid = grid_factory(grid_shape)
         f = my_grid.create_field(cardinality=9, dtype=Precision.FP32)
-
-        assert f.shape == (9,) + grid_shape, "Field shape is incorrect"
+        if len(grid_shape) == 2:
+            assert f.shape == (9,) + grid_shape + (1,), "Field shape is incorrect got {}".format(f.shape)
+        else:
+            assert f.shape == (9,) + grid_shape, "Field shape is incorrect got {}".format(f.shape)
         assert isinstance(f, wp.array), "Field should be a Warp ndarray"
 
 
@@ -37,7 +39,6 @@ def test_warp_grid_create_field_fill_value():
     assert isinstance(f, wp.array), "Field should be a Warp ndarray"
 
     f = f.numpy()
-    assert f.shape == (9,) + grid_shape, "Field shape is incorrect"
     assert np.allclose(f, fill_value), "Field not properly initialized with fill_value"
 
 
diff --git a/tests/kernels/stream/test_stream_warp.py b/tests/kernels/stream/test_stream_warp.py
index 0d100cf..95fcc05 100644
--- a/tests/kernels/stream/test_stream_warp.py
+++ b/tests/kernels/stream/test_stream_warp.py
@@ -61,7 +61,7 @@ def test_stream_operator_warp(dim, velocity_set, grid_shape):
     expected = jnp.stack(expected, axis=0)
 
     if dim == 2:
-        f_initial_warp = wp.array(f_initial)
+        f_initial_warp = wp.array(f_initial[..., np.newaxis])
 
     elif dim == 3:
         f_initial_warp = wp.array(f_initial)
@@ -71,7 +71,10 @@ def test_stream_operator_warp(dim, velocity_set, grid_shape):
     f_streamed = my_grid_warp.create_field(cardinality=velocity_set.q)
     f_streamed = stream_op(f_initial_warp, f_streamed)
 
-    assert jnp.allclose(f_streamed.numpy(), np.array(expected)), "Streaming did not occur as expected"
+    if len(grid_shape) == 2:
+        assert jnp.allclose(f_streamed.numpy()[..., 0], np.array(expected)), "Streaming did not occur as expected"
+    else:
+        assert jnp.allclose(f_streamed.numpy(), np.array(expected)), "Streaming did not occur as expected"
 
 
 if __name__ == "__main__":
diff --git a/xlb/grid/warp_grid.py b/xlb/grid/warp_grid.py
index 5018962..c74fc2f 100644
--- a/xlb/grid/warp_grid.py
+++ b/xlb/grid/warp_grid.py
@@ -21,7 +21,9 @@ def create_field(
         fill_value=None,
     ):
         dtype = dtype.wp_dtype if dtype else DefaultConfig.default_precision_policy.store_precision.wp_dtype
-        shape = (cardinality,) + (self.shape)
+
+        # Pad a 2D grid shape with a trailing singleton dimension so 2D fields have the same rank as 3D fields
+        shape = (cardinality,) + (self.shape if len(self.shape) != 2 else self.shape + (1,))
 
         if fill_value is None:
             f = wp.zeros(shape, dtype=dtype)
diff --git a/xlb/operator/boundary_condition/bc_do_nothing.py b/xlb/operator/boundary_condition/bc_do_nothing.py
index 4b7ac90..67b343d 100644
--- a/xlb/operator/boundary_condition/bc_do_nothing.py
+++ b/xlb/operator/boundary_condition/bc_do_nothing.py
@@ -64,67 +64,4 @@ def functional(
         ):
             return f_pre
 
-        @wp.kernel
-        def kernel2d(
-            f_pre: wp.array3d(dtype=Any),
-            f_post: wp.array3d(dtype=Any),
-            bc_mask: wp.array3d(dtype=wp.uint8),
-            missing_mask: wp.array3d(dtype=wp.uint8),
-        ):
-            # Get the global index
-            i, j = wp.tid()
-            index = wp.vec2i(i, j)
-
-            # read tid data
-            _f_pre, _f_post, _boundary_id, _missing_mask = self._get_thread_data_2d(f_pre, f_post, bc_mask, missing_mask, index)
-
-            # Apply the boundary condition
-            if _boundary_id == wp.uint8(DoNothingBC.id):
-                timestep = 0
-                _f = functional(index, timestep, _missing_mask, f_pre, f_post, _f_pre, _f_post)
-            else:
-                _f = _f_post
-
-            # Write the result
-            for l in range(self.velocity_set.q):
-                f_post[l, index[0], index[1]] = self.store_dtype(_f[l])
-
-        # Construct the warp kernel
-        @wp.kernel
-        def kernel3d(
-            f_pre: wp.array4d(dtype=Any),
-            f_post: wp.array4d(dtype=Any),
-            bc_mask: wp.array4d(dtype=wp.uint8),
-            missing_mask: wp.array4d(dtype=wp.bool),
-        ):
-            # Get the global index
-            i, j, k = wp.tid()
-            index = wp.vec3i(i, j, k)
-
-            # read tid data
-            _f_pre, _f_post, _boundary_id, _missing_mask = self._get_thread_data_3d(f_pre, f_post, bc_mask, missing_mask, index)
-
-            # Apply the boundary condition
-            if _boundary_id == wp.uint8(DoNothingBC.id):
-                timestep = 0
-                _f = functional(index, timestep, _missing_mask, f_pre, f_post, _f_pre, _f_post)
-            else:
-                _f = _f_post
-
-            # Write the result
-            for l in range(self.velocity_set.q):
-                f_post[l, index[0], index[1], index[2]] = self.store_dtype(_f[l])
-
-        kernel = kernel3d if self.velocity_set.d == 3 else kernel2d
-
-        return functional, kernel
-
-    @Operator.register_backend(ComputeBackend.WARP)
-    def warp_implementation(self, f_pre, f_post, bc_mask, missing_mask):
-        # Launch the warp kernel
-        wp.launch(
-            self.warp_kernel,
-            inputs=[f_pre, f_post, bc_mask, missing_mask],
-            dim=f_pre.shape[1:],
-        )
-        return f_post
+        return functional, None
diff --git a/xlb/operator/boundary_condition/bc_equilibrium.py b/xlb/operator/boundary_condition/bc_equilibrium.py
index 4dd4b9e..b4b957a 100644
--- a/xlb/operator/boundary_condition/bc_equilibrium.py
+++ b/xlb/operator/boundary_condition/bc_equilibrium.py
@@ -88,68 +88,4 @@ def functional(
             _f = self.equilibrium_operator.warp_functional(_rho, _u)
             return _f
 
-        # Construct the warp kernel
-        @wp.kernel
-        def kernel2d(
-            f_pre: wp.array3d(dtype=Any),
-            f_post: wp.array3d(dtype=Any),
-            bc_mask: wp.array3d(dtype=wp.uint8),
-            missing_mask: wp.array3d(dtype=wp.bool),
-        ):
-            # Get the global index
-            i, j = wp.tid()
-            index = wp.vec2i(i, j)
-
-            # read tid data
-            _f_pre, _f_post, _boundary_id, _missing_mask = self._get_thread_data_2d(f_pre, f_post, bc_mask, missing_mask, index)
-
-            # Apply the boundary condition
-            if _boundary_id == wp.uint8(EquilibriumBC.id):
-                timestep = 0
-                _f = functional(index, timestep, _missing_mask, f_pre, f_post, _f_pre, _f_post)
-            else:
-                _f = _f_post
-
-            # Write the result
-            for l in range(self.velocity_set.q):
-                f_post[l, index[0], index[1]] = self.store_dtype(_f[l])
-
-        # Construct the warp kernel
-        @wp.kernel
-        def kernel3d(
-            f_pre: wp.array4d(dtype=Any),
-            f_post: wp.array4d(dtype=Any),
-            bc_mask: wp.array4d(dtype=wp.uint8),
-            missing_mask: wp.array4d(dtype=wp.bool),
-        ):
-            # Get the global index
-            i, j, k = wp.tid()
-            index = wp.vec3i(i, j, k)
-
-            # read tid data
-            _f_pre, _f_post, _boundary_id, _missing_mask = self._get_thread_data_3d(f_pre, f_post, bc_mask, missing_mask, index)
-
-            # Apply the boundary condition
-            if _boundary_id == wp.uint8(EquilibriumBC.id):
-                timestep = 0
-                _f = functional(index, timestep, _missing_mask, f_pre, f_post, _f_pre, _f_post)
-            else:
-                _f = _f_post
-
-            # Write the result
-            for l in range(self.velocity_set.q):
-                f_post[l, index[0], index[1], index[2]] = self.store_dtype(_f[l])
-
-        kernel = kernel3d if self.velocity_set.d == 3 else kernel2d
-
-        return functional, kernel
-
-    @Operator.register_backend(ComputeBackend.WARP)
-    def warp_implementation(self, f_pre, f_post, bc_mask, missing_mask):
-        # Launch the warp kernel
-        wp.launch(
-            self.warp_kernel,
-            inputs=[f_pre, f_post, bc_mask, missing_mask],
-            dim=f_pre.shape[1:],
-        )
-        return f_post
+        return functional, None
diff --git a/xlb/operator/boundary_condition/bc_extrapolation_outflow.py b/xlb/operator/boundary_condition/bc_extrapolation_outflow.py
index 53645c6..4a96c73 100644
--- a/xlb/operator/boundary_condition/bc_extrapolation_outflow.py
+++ b/xlb/operator/boundary_condition/bc_extrapolation_outflow.py
@@ -140,15 +140,7 @@ def _construct_warp(self):
         _opp_indices = self.velocity_set.opp_indices
 
         @wp.func
-        def get_normal_vectors_2d(
-            missing_mask: Any,
-        ):
-            for l in range(_q):
-                if missing_mask[l] == wp.uint8(1) and wp.abs(_c[0, l]) + wp.abs(_c[1, l]) == 1:
-                    return -wp.vec2i(_c[0, l], _c[1, l])
-
-        @wp.func
-        def get_normal_vectors_3d(
+        def get_normal_vectors(
             missing_mask: Any,
         ):
             for l in range(_q):
@@ -175,7 +167,7 @@ def functional(
             return _f
 
         @wp.func
-        def prepare_bc_auxilary_data_2d(
+        def prepare_bc_auxilary_data(
             index: Any,
             timestep: Any,
             missing_mask: Any,
@@ -188,34 +180,7 @@ def prepare_bc_auxilary_data_2d(
             # f_pre (post-streaming values of the current voxel). We use directions that leave the domain
             # for storing this prepared data.
             _f = f_post
-            nv = get_normal_vectors_2d(missing_mask)
-            for l in range(self.velocity_set.q):
-                if missing_mask[l] == wp.uint8(1):
-                    # f_0 is the post-collision values of the current time-step
-                    # Get pull index associated with the "neighbours" pull_index
-                    pull_index = type(index)()
-                    for d in range(self.velocity_set.d):
-                        pull_index[d] = index[d] - (_c[d, l] + nv[d])
-                    # The following is the post-streaming values of the neighbor cell
-                    f_aux = self.compute_dtype(f_0[l, pull_index[0], pull_index[1]])
-                    _f[_opp_indices[l]] = (self.compute_dtype(1.0) - sound_speed) * f_pre[l] + sound_speed * f_aux
-            return _f
-
-        @wp.func
-        def prepare_bc_auxilary_data_3d(
-            index: Any,
-            timestep: Any,
-            missing_mask: Any,
-            f_0: Any,
-            f_1: Any,
-            f_pre: Any,
-            f_post: Any,
-        ):
-            # Preparing the formulation for this BC using the neighbour's populations stored in f_aux and
-            # f_pre (post-streaming values of the current voxel). We use directions that leave the domain
-            # for storing this prepared data.
-            _f = f_post
-            nv = get_normal_vectors_3d(missing_mask)
+            nv = get_normal_vectors(missing_mask)
             for l in range(self.velocity_set.q):
                 if missing_mask[l] == wp.uint8(1):
                     # f_0 is the post-collision values of the current time-step
@@ -228,82 +193,4 @@ def prepare_bc_auxilary_data_3d(
                     _f[_opp_indices[l]] = (self.compute_dtype(1.0) - sound_speed) * f_pre[l] + sound_speed * f_aux
             return _f
 
-        # Construct the warp kernel
-        @wp.kernel
-        def kernel2d(
-            f_pre: wp.array3d(dtype=Any),
-            f_post: wp.array3d(dtype=Any),
-            bc_mask: wp.array3d(dtype=wp.uint8),
-            missing_mask: wp.array3d(dtype=wp.bool),
-        ):
-            # Get the global index
-            i, j = wp.tid()
-            index = wp.vec2i(i, j)
-            timestep = 0
-
-            # read tid data
-            _f_pre, _f_post, _boundary_id, _missing_mask = self._get_thread_data_2d(f_pre, f_post, bc_mask, missing_mask, index)
-
-            # special preparation of auxiliary data
-            if _boundary_id == wp.uint8(ExtrapolationOutflowBC.id):
-                _f_pre = prepare_bc_auxilary_data_2d(index, timestep, missing_mask, f_pre, f_post, f_pre, f_post)
-
-            # Apply the boundary condition
-            if _boundary_id == wp.uint8(ExtrapolationOutflowBC.id):
-                # TODO: is there any way for this BC to have a meaningful kernel given that it has two steps after both
-                # collision and streaming?
-                _f = functional(index, timestep, _missing_mask, f_pre, f_post, _f_pre, _f_post)
-            else:
-                _f = _f_post
-
-            # Write the distribution function
-            for l in range(self.velocity_set.q):
-                f_post[l, index[0], index[1]] = self.store_dtype(_f[l])
-
-        # Construct the warp kernel
-        @wp.kernel
-        def kernel3d(
-            f_pre: wp.array4d(dtype=Any),
-            f_post: wp.array4d(dtype=Any),
-            bc_mask: wp.array4d(dtype=wp.uint8),
-            missing_mask: wp.array4d(dtype=wp.bool),
-        ):
-            # Get the global index
-            i, j, k = wp.tid()
-            index = wp.vec3i(i, j, k)
-            timestep = 0
-
-            # read tid data
-            _f_pre, _f_post, _boundary_id, _missing_mask = self._get_thread_data_3d(f_pre, f_post, bc_mask, missing_mask, index)
-            _f_aux = _f_vec()
-
-            # special preparation of auxiliary data
-            if _boundary_id == wp.uint8(ExtrapolationOutflowBC.id):
-                _f_pre = prepare_bc_auxilary_data_3d(index, timestep, missing_mask, f_pre, f_post, f_pre, f_post)
-
-            # Apply the boundary condition
-            if _boundary_id == wp.uint8(ExtrapolationOutflowBC.id):
-                # TODO: is there any way for this BC to have a meaninful kernel given that it has two steps after both
-                # collision and streaming?
-                _f = functional(index, timestep, _missing_mask, f_pre, f_post, _f_pre, _f_post)
-            else:
-                _f = _f_post
-
-            # Write the distribution function
-            for l in range(self.velocity_set.q):
-                f_post[l, index[0], index[1], index[2]] = self.store_dtype(_f[l])
-
-        kernel = kernel3d if self.velocity_set.d == 3 else kernel2d
-        prepare_bc_auxilary_data = prepare_bc_auxilary_data_3d if self.velocity_set.d == 3 else prepare_bc_auxilary_data_2d
-
-        return (functional, prepare_bc_auxilary_data), kernel
-
-    @Operator.register_backend(ComputeBackend.WARP)
-    def warp_implementation(self, f_pre, f_post, bc_mask, missing_mask):
-        # Launch the warp kernel
-        wp.launch(
-            self.warp_kernel,
-            inputs=[f_pre, f_post, bc_mask, missing_mask],
-            dim=f_pre.shape[1:],
-        )
-        return f_post
+        return (functional, prepare_bc_auxilary_data), None
\ No newline at end of file
diff --git a/xlb/operator/boundary_condition/bc_fullway_bounce_back.py b/xlb/operator/boundary_condition/bc_fullway_bounce_back.py
index 8569e84..afe05de 100644
--- a/xlb/operator/boundary_condition/bc_fullway_bounce_back.py
+++ b/xlb/operator/boundary_condition/bc_fullway_bounce_back.py
@@ -74,66 +74,4 @@ def functional(
                 fliped_f[l] = f_pre[_opp_indices[l]]
             return fliped_f
 
-        @wp.kernel
-        def kernel2d(
-            f_pre: wp.array3d(dtype=Any),
-            f_post: wp.array3d(dtype=Any),
-            bc_mask: wp.array3d(dtype=wp.uint8),
-            missing_mask: wp.array3d(dtype=wp.bool),
-        ):  # Get the global index
-            i, j = wp.tid()
-            index = wp.vec2i(i, j)
-
-            # read tid data
-            _f_pre, _f_post, _boundary_id, _missing_mask = self._get_thread_data_2d(f_pre, f_post, bc_mask, missing_mask, index)
-
-            # Check if the boundary is active
-            if _boundary_id == wp.uint8(FullwayBounceBackBC.id):
-                timestep = 0
-                _f = functional(index, timestep, _missing_mask, f_pre, f_post, _f_pre, _f_post)
-            else:
-                _f = _f_post
-
-            # Write the result to the output
-            for l in range(self.velocity_set.q):
-                f_post[l, index[0], index[1]] = self.store_dtype(_f[l])
-
-        # Construct the warp kernel
-        @wp.kernel
-        def kernel3d(
-            f_pre: wp.array4d(dtype=Any),
-            f_post: wp.array4d(dtype=Any),
-            bc_mask: wp.array4d(dtype=wp.uint8),
-            missing_mask: wp.array4d(dtype=wp.bool),
-        ):
-            # Get the global index
-            i, j, k = wp.tid()
-            index = wp.vec3i(i, j, k)
-
-            # read tid data
-            _f_pre, _f_post, _boundary_id, _missing_mask = self._get_thread_data_3d(f_pre, f_post, bc_mask, missing_mask, index)
-
-            # Check if the boundary is active
-            if _boundary_id == wp.uint8(FullwayBounceBackBC.id):
-                timestep = 0
-                _f = functional(index, timestep, _missing_mask, f_pre, f_post, _f_pre, _f_post)
-            else:
-                _f = _f_post
-
-            # Write the result to the output
-            for l in range(self.velocity_set.q):
-                f_post[l, index[0], index[1], index[2]] = self.store_dtype(_f[l])
-
-        kernel = kernel3d if self.velocity_set.d == 3 else kernel2d
-
-        return functional, kernel
-
-    @Operator.register_backend(ComputeBackend.WARP)
-    def warp_implementation(self, f_pre, f_post, bc_mask, missing_mask):
-        # Launch the warp kernel
-        wp.launch(
-            self.warp_kernel,
-            inputs=[f_pre, f_post, bc_mask, missing_mask],
-            dim=f_pre.shape[1:],
-        )
-        return f_post
+        return functional, None
\ No newline at end of file
diff --git a/xlb/operator/boundary_condition/bc_grads_approximation.py b/xlb/operator/boundary_condition/bc_grads_approximation.py
index 3d60879..f5dc343 100644
--- a/xlb/operator/boundary_condition/bc_grads_approximation.py
+++ b/xlb/operator/boundary_condition/bc_grads_approximation.py
@@ -308,45 +308,6 @@ def functional_method2(
             f_post = grads_approximate_fpop(missing_mask, rho_target, u_target, f_post)
             return f_post
 
-        # Construct the warp kernel
-        @wp.kernel
-        def kernel(
-            f_pre: wp.array4d(dtype=Any),
-            f_post: wp.array4d(dtype=Any),
-            bc_mask: wp.array4d(dtype=wp.uint8),
-            missing_mask: wp.array4d(dtype=wp.bool),
-        ):
-            # Get the global index
-            i, j, k = wp.tid()
-            index = wp.vec3i(i, j, k)
-            timestep = 0
-
-            # read tid data
-            _f_pre, _f_post, _boundary_id, _missing_mask = self._get_thread_data_3d(f_pre, f_post, bc_mask, missing_mask, index)
-            _f_aux = _f_vec()
-
-            # Apply the boundary condition
-            if _boundary_id == wp.uint8(GradsApproximationBC.id):
-                # TODO: is there any way for this BC to have a meaninful kernel given that it has two steps after both
-                # collision and streaming?
-                _f = functional(index, timestep, _missing_mask, f_pre, f_post, _f_pre, _f_post)
-            else:
-                _f = _f_post
-
-            # Write the distribution function
-            for l in range(self.velocity_set.q):
-                f_post[l, index[0], index[1], index[2]] = self.store_dtype(_f[l])
-
         functional = functional_method1
 
-        return functional, kernel
-
-    @Operator.register_backend(ComputeBackend.WARP)
-    def warp_implementation(self, f_pre, f_post, bc_mask, missing_mask):
-        # Launch the warp kernel
-        wp.launch(
-            self.warp_kernel,
-            inputs=[f_pre, f_post, bc_mask, missing_mask],
-            dim=f_pre.shape[1:],
-        )
-        return f_post
+        return functional, None
\ No newline at end of file
diff --git a/xlb/operator/boundary_condition/bc_halfway_bounce_back.py b/xlb/operator/boundary_condition/bc_halfway_bounce_back.py
index e8df6b7..ee68b50 100644
--- a/xlb/operator/boundary_condition/bc_halfway_bounce_back.py
+++ b/xlb/operator/boundary_condition/bc_halfway_bounce_back.py
@@ -87,68 +87,5 @@ def functional(
 
             return _f
 
-        # Construct the warp kernel
-        @wp.kernel
-        def kernel2d(
-            f_pre: wp.array3d(dtype=Any),
-            f_post: wp.array3d(dtype=Any),
-            bc_mask: wp.array3d(dtype=wp.uint8),
-            missing_mask: wp.array3d(dtype=wp.bool),
-        ):
-            # Get the global index
-            i, j = wp.tid()
-            index = wp.vec2i(i, j)
-
-            # read tid data
-            _f_pre, _f_post, _boundary_id, _missing_mask = self._get_thread_data_2d(f_pre, f_post, bc_mask, missing_mask, index)
-
-            # Apply the boundary condition
-            if _boundary_id == wp.uint8(HalfwayBounceBackBC.id):
-                timestep = 0
-                _f = functional(index, timestep, _missing_mask, f_pre, f_post, _f_pre, _f_post)
-            else:
-                _f = _f_post
-
-            # Write the distribution function
-            for l in range(self.velocity_set.q):
-                f_post[l, index[0], index[1]] = self.store_dtype(_f[l])
-
-        # Construct the warp kernel
-        @wp.kernel
-        def kernel3d(
-            f_pre: wp.array4d(dtype=Any),
-            f_post: wp.array4d(dtype=Any),
-            bc_mask: wp.array4d(dtype=wp.uint8),
-            missing_mask: wp.array4d(dtype=wp.bool),
-        ):
-            # Get the global index
-            i, j, k = wp.tid()
-            index = wp.vec3i(i, j, k)
+        return functional, None
 
-            # read tid data
-            _f_pre, _f_post, _boundary_id, _missing_mask = self._get_thread_data_3d(f_pre, f_post, bc_mask, missing_mask, index)
-
-            # Apply the boundary condition
-            if _boundary_id == wp.uint8(HalfwayBounceBackBC.id):
-                timestep = 0
-                _f = functional(index, timestep, _missing_mask, f_pre, f_post, _f_pre, _f_post)
-            else:
-                _f = _f_post
-
-            # Write the distribution function
-            for l in range(self.velocity_set.q):
-                f_post[l, index[0], index[1], index[2]] = self.store_dtype(_f[l])
-
-        kernel = kernel3d if self.velocity_set.d == 3 else kernel2d
-
-        return functional, kernel
-
-    @Operator.register_backend(ComputeBackend.WARP)
-    def warp_implementation(self, f_pre, f_post, bc_mask, missing_mask):
-        # Launch the warp kernel
-        wp.launch(
-            self.warp_kernel,
-            inputs=[f_pre, f_post, bc_mask, missing_mask],
-            dim=f_pre.shape[1:],
-        )
-        return f_post
diff --git a/xlb/operator/boundary_condition/bc_regularized.py b/xlb/operator/boundary_condition/bc_regularized.py
index 065a0b0..12622e2 100644
--- a/xlb/operator/boundary_condition/bc_regularized.py
+++ b/xlb/operator/boundary_condition/bc_regularized.py
@@ -159,15 +159,7 @@ def _get_fsum(
             return fsum_known + fsum_middle
 
         @wp.func
-        def get_normal_vectors_2d(
-            missing_mask: Any,
-        ):
-            for l in range(_q):
-                if missing_mask[l] == wp.uint8(1) and wp.abs(_c[0, l]) + wp.abs(_c[1, l]) == 1:
-                    return -_u_vec(_c_float[0, l], _c_float[1, l])
-
-        @wp.func
-        def get_normal_vectors_3d(
+        def get_normal_vectors(
             missing_mask: Any,
         ):
             for l in range(_q):
@@ -211,7 +203,7 @@ def regularize_fpop(
             return fpop
 
         @wp.func
-        def functional3d_velocity(
+        def functional_velocity(
             index: Any,
             timestep: Any,
             missing_mask: Any,
@@ -224,7 +216,7 @@ def functional3d_velocity(
             _f = f_post
 
             # Find normal vector
-            normals = get_normal_vectors_3d(missing_mask)
+            normals = get_normal_vectors(missing_mask)
 
             # calculate rho
             fsum = _get_fsum(_f, missing_mask)
@@ -242,7 +234,7 @@ def functional3d_velocity(
             return _f
 
         @wp.func
-        def functional3d_pressure(
+        def functional_pressure(
             index: Any,
             timestep: Any,
             missing_mask: Any,
@@ -255,7 +247,7 @@ def functional3d_pressure(
             _f = f_post
 
             # Find normal vector
-            normals = get_normal_vectors_3d(missing_mask)
+            normals = get_normal_vectors(missing_mask)
 
             # calculate velocity
             fsum = _get_fsum(_f, missing_mask)
@@ -270,136 +262,8 @@ def functional3d_pressure(
             _f = regularize_fpop(_f, feq)
             return _f
 
-        @wp.func
-        def functional2d_velocity(
-            index: Any,
-            timestep: Any,
-            missing_mask: Any,
-            f_0: Any,
-            f_1: Any,
-            f_pre: Any,
-            f_post: Any,
-        ):
-            # Post-streaming values are only modified at missing direction
-            _f = f_post
-
-            # Find normal vector
-            normals = get_normal_vectors_2d(missing_mask)
-
-            # calculate rho
-            fsum = _get_fsum(_f, missing_mask)
-            unormal = self.compute_dtype(0.0)
-            for d in range(_d):
-                unormal += _u[d] * normals[d]
-            _rho = fsum / (self.compute_dtype(1.0) + unormal)
-
-            # impose non-equilibrium bounceback
-            feq = self.equilibrium_operator.warp_functional(_rho, _u)
-            _f = bounceback_nonequilibrium(_f, feq, missing_mask)
-
-            # Regularize the boundary fpop
-            _f = regularize_fpop(_f, feq)
-            return _f
-
-        @wp.func
-        def functional2d_pressure(
-            index: Any,
-            timestep: Any,
-            missing_mask: Any,
-            f_0: Any,
-            f_1: Any,
-            f_pre: Any,
-            f_post: Any,
-        ):
-            # Post-streaming values are only modified at missing direction
-            _f = f_post
-
-            # Find normal vector
-            normals = get_normal_vectors_2d(missing_mask)
-
-            # calculate velocity
-            fsum = _get_fsum(_f, missing_mask)
-            unormal = -self.compute_dtype(1.0) + fsum / _rho
-            _u = unormal * normals
-
-            # impose non-equilibrium bounceback
-            feq = self.equilibrium_operator.warp_functional(_rho, _u)
-            _f = bounceback_nonequilibrium(_f, feq, missing_mask)
-
-            # Regularize the boundary fpop
-            _f = regularize_fpop(_f, feq)
-            return _f
-
-        # Construct the warp kernel
-        @wp.kernel
-        def kernel2d(
-            f_pre: wp.array3d(dtype=Any),
-            f_post: wp.array3d(dtype=Any),
-            bc_mask: wp.array3d(dtype=wp.uint8),
-            missing_mask: wp.array3d(dtype=wp.bool),
-        ):
-            # Get the global index
-            i, j = wp.tid()
-            index = wp.vec2i(i, j)
-
-            # read tid data
-            _f_pre, _f_post, _boundary_id, _missing_mask = self._get_thread_data_2d(f_pre, f_post, bc_mask, missing_mask, index)
-
-            # Apply the boundary condition
-            if _boundary_id == wp.uint8(self.id):
-                timestep = 0
-                _f = functional(index, timestep, _missing_mask, f_pre, f_post, _f_pre, _f_post)
-            else:
-                _f = _f_post
-
-            # Write the distribution function
-            for l in range(self.velocity_set.q):
-                f_post[l, index[0], index[1]] = self.store_dtype(_f[l])
-
-        # Construct the warp kernel
-        @wp.kernel
-        def kernel3d(
-            f_pre: wp.array4d(dtype=Any),
-            f_post: wp.array4d(dtype=Any),
-            bc_mask: wp.array4d(dtype=wp.uint8),
-            missing_mask: wp.array4d(dtype=wp.bool),
-        ):
-            # Get the global index
-            i, j, k = wp.tid()
-            index = wp.vec3i(i, j, k)
-
-            # read tid data
-            _f_pre, _f_post, _boundary_id, _missing_mask = self._get_thread_data_3d(f_pre, f_post, bc_mask, missing_mask, index)
-
-            # Apply the boundary condition
-            if _boundary_id == wp.uint8(self.id):
-                timestep = 0
-                _f = functional(index, timestep, _missing_mask, f_pre, f_post, _f_pre, _f_post)
-            else:
-                _f = _f_post
-
-            # Write the distribution function
-            for l in range(self.velocity_set.q):
-                f_post[l, index[0], index[1], index[2]] = self.store_dtype(_f[l])
-
-        kernel = kernel3d if self.velocity_set.d == 3 else kernel2d
-        if self.velocity_set.d == 3 and self.bc_type == "velocity":
-            functional = functional3d_velocity
-        elif self.velocity_set.d == 3 and self.bc_type == "pressure":
-            functional = functional3d_pressure
-        elif self.bc_type == "velocity":
-            functional = functional2d_velocity
-        else:
-            functional = functional2d_pressure
-
-        return functional, kernel
-
-    @Operator.register_backend(ComputeBackend.WARP)
-    def warp_implementation(self, f_pre, f_post, bc_mask, missing_mask):
-        # Launch the warp kernel
-        wp.launch(
-            self.warp_kernel,
-            inputs=[f_pre, f_post, bc_mask, missing_mask],
-            dim=f_pre.shape[1:],
-        )
-        return f_post
+        if self.bc_type == "velocity":
+            functional = functional_velocity
+        elif self.bc_type == "pressure":
+            functional = functional_pressure
+        return functional, None
\ No newline at end of file
diff --git a/xlb/operator/boundary_condition/bc_zouhe.py b/xlb/operator/boundary_condition/bc_zouhe.py
index 4be2cf2..c5d9498 100644
--- a/xlb/operator/boundary_condition/bc_zouhe.py
+++ b/xlb/operator/boundary_condition/bc_zouhe.py
@@ -189,15 +189,6 @@ def _construct_warp(self):
         _c_float = self.velocity_set.c_float
         # TODO: this is way less than ideal. we should not be making new types
 
-        @wp.func
-        def get_normal_vectors_2d(
-            lattice_direction: Any,
-        ):
-            l = lattice_direction
-            if wp.abs(_c[0, l]) + wp.abs(_c[1, l]) == 1:
-                normals = -_u_vec(_c_float[0, l], _c_float[1, l])
-            return normals
-
         @wp.func
         def _get_fsum(
             fpop: Any,
@@ -213,7 +204,7 @@ def _get_fsum(
             return fsum_known + fsum_middle
 
         @wp.func
-        def get_normal_vectors_3d(
+        def get_normal_vectors(
             missing_mask: Any,
         ):
             for l in range(_q):
@@ -232,7 +223,7 @@ def bounceback_nonequilibrium(
             return fpop
 
         @wp.func
-        def functional3d_velocity(
+        def functional_velocity(
             index: Any,
             timestep: Any,
             missing_mask: Any,
@@ -245,7 +236,7 @@ def functional3d_velocity(
             _f = f_post
 
             # Find normal vector
-            normals = get_normal_vectors_3d(missing_mask)
+            normals = get_normal_vectors(missing_mask)
 
             # calculate rho
             fsum = _get_fsum(_f, missing_mask)
@@ -260,7 +251,7 @@ def functional3d_velocity(
             return _f
 
         @wp.func
-        def functional3d_pressure(
+        def functional_pressure(
             index: Any,
             timestep: Any,
             missing_mask: Any,
@@ -273,7 +264,7 @@ def functional3d_pressure(
             _f = f_post
 
             # Find normal vector
-            normals = get_normal_vectors_3d(missing_mask)
+            normals = get_normal_vectors(missing_mask)
 
             # calculate velocity
             fsum = _get_fsum(_f, missing_mask)
@@ -285,130 +276,11 @@ def functional3d_pressure(
             _f = bounceback_nonequilibrium(_f, feq, missing_mask)
             return _f
 
-        @wp.func
-        def functional2d_velocity(
-            index: Any,
-            timestep: Any,
-            missing_mask: Any,
-            f_0: Any,
-            f_1: Any,
-            f_pre: Any,
-            f_post: Any,
-        ):
-            # Post-streaming values are only modified at missing direction
-            _f = f_post
-
-            # Find normal vector
-            normals = get_normal_vectors_2d(missing_mask)
-
-            # calculate rho
-            fsum = _get_fsum(_f, missing_mask)
-            unormal = self.compute_dtype(0.0)
-            for d in range(_d):
-                unormal += _u[d] * normals[d]
-            _rho = fsum / (self.compute_dtype(1.0) + unormal)
-
-            # impose non-equilibrium bounceback
-            feq = self.equilibrium_operator.warp_functional(_rho, _u)
-            _f = bounceback_nonequilibrium(_f, feq, missing_mask)
-            return _f
-
-        @wp.func
-        def functional2d_pressure(
-            index: Any,
-            timestep: Any,
-            missing_mask: Any,
-            f_0: Any,
-            f_1: Any,
-            f_pre: Any,
-            f_post: Any,
-        ):
-            # Post-streaming values are only modified at missing direction
-            _f = f_post
-
-            # Find normal vector
-            normals = get_normal_vectors_2d(missing_mask)
-
-            # calculate velocity
-            fsum = _get_fsum(_f, missing_mask)
-            unormal = -self.compute_dtype(1.0) + fsum / _rho
-            _u = unormal * normals
-
-            # impose non-equilibrium bounceback
-            feq = self.equilibrium_operator.warp_functional(_rho, _u)
-            _f = bounceback_nonequilibrium(_f, feq, missing_mask)
-            return _f
-
-        # Construct the warp kernel
-        @wp.kernel
-        def kernel2d(
-            f_pre: wp.array3d(dtype=Any),
-            f_post: wp.array3d(dtype=Any),
-            bc_mask: wp.array3d(dtype=wp.uint8),
-            missing_mask: wp.array3d(dtype=wp.bool),
-        ):
-            # Get the global index
-            i, j = wp.tid()
-            index = wp.vec2i(i, j)
-
-            # read tid data
-            _f_pre, _f_post, _boundary_id, _missing_mask = self._get_thread_data_2d(f_pre, f_post, bc_mask, missing_mask, index)
-
-            # Apply the boundary condition
-            if _boundary_id == wp.uint8(self.id):
-                timestep = 0
-                _f = functional(index, timestep, _missing_mask, f_pre, f_post, _f_pre, _f_post)
-            else:
-                _f = _f_post
-
-            # Write the distribution function
-            for l in range(self.velocity_set.q):
-                f_post[l, index[0], index[1]] = self.store_dtype(_f[l])
-
-        # Construct the warp kernel
-        @wp.kernel
-        def kernel3d(
-            f_pre: wp.array4d(dtype=Any),
-            f_post: wp.array4d(dtype=Any),
-            bc_mask: wp.array4d(dtype=wp.uint8),
-            missing_mask: wp.array4d(dtype=wp.bool),
-        ):
-            # Get the global index
-            i, j, k = wp.tid()
-            index = wp.vec3i(i, j, k)
-
-            # read tid data
-            _f_pre, _f_post, _boundary_id, _missing_mask = self._get_thread_data_3d(f_pre, f_post, bc_mask, missing_mask, index)
-
-            # Apply the boundary condition
-            if _boundary_id == wp.uint8(self.id):
-                timestep = 0
-                _f = functional(index, timestep, _missing_mask, f_pre, f_post, _f_pre, _f_post)
-            else:
-                _f = _f_post
-
-            # Write the distribution function
-            for l in range(self.velocity_set.q):
-                f_post[l, index[0], index[1], index[2]] = self.store_dtype(_f[l])
-
-        kernel = kernel3d if self.velocity_set.d == 3 else kernel2d
-        if self.velocity_set.d == 3 and self.bc_type == "velocity":
-            functional = functional3d_velocity
-        elif self.velocity_set.d == 3 and self.bc_type == "pressure":
-            functional = functional3d_pressure
-        elif self.bc_type == "velocity":
-            functional = functional2d_velocity
-        else:
-            functional = functional2d_pressure
+        # Select the functional for this bc_type; anything other than "velocity" uses the
+        # pressure variant, matching the final `else` of the dispatch chain removed above.
+        if self.bc_type == "velocity":
+            functional = functional_velocity
+        else:
+            functional = functional_pressure
 
-        return functional, kernel
-
-    @Operator.register_backend(ComputeBackend.WARP)
-    def warp_implementation(self, f_pre, f_post, bc_mask, missing_mask):
-        # Launch the warp kernel
-        wp.launch(
-            self.warp_kernel,
-            inputs=[f_pre, f_post, bc_mask, missing_mask],
-            dim=f_pre.shape[1:],
-        )
-        return f_post
+        return functional, None
\ No newline at end of file
diff --git a/xlb/operator/boundary_condition/boundary_condition.py b/xlb/operator/boundary_condition/boundary_condition.py
index 6d72fc0..e724b27 100644
--- a/xlb/operator/boundary_condition/boundary_condition.py
+++ b/xlb/operator/boundary_condition/boundary_condition.py
@@ -74,32 +74,7 @@ def prepare_bc_auxilary_data(
             return f_post
 
         @wp.func
-        def _get_thread_data_2d(
-            f_pre: wp.array3d(dtype=Any),
-            f_post: wp.array3d(dtype=Any),
-            bc_mask: wp.array3d(dtype=wp.uint8),
-            missing_mask: wp.array3d(dtype=wp.bool),
-            index: wp.vec2i,
-        ):
-            # Get the boundary id and missing mask
-            _f_pre = _f_vec()
-            _f_post = _f_vec()
-            _boundary_id = bc_mask[0, index[0], index[1]]
-            _missing_mask = _missing_mask_vec()
-            for l in range(self.velocity_set.q):
-                # q-sized vector of populations
-                _f_pre[l] = self.compute_dtype(f_pre[l, index[0], index[1]])
-                _f_post[l] = self.compute_dtype(f_post[l, index[0], index[1]])
-
-                # TODO fix vec bool
-                if missing_mask[l, index[0], index[1]]:
-                    _missing_mask[l] = wp.uint8(1)
-                else:
-                    _missing_mask[l] = wp.uint8(0)
-            return _f_pre, _f_post, _boundary_id, _missing_mask
-
-        @wp.func
-        def _get_thread_data_3d(
+        def _get_thread_data(
             f_pre: wp.array4d(dtype=Any),
             f_post: wp.array4d(dtype=Any),
             bc_mask: wp.array4d(dtype=wp.uint8),
@@ -125,8 +100,7 @@ def _get_thread_data_3d(
 
         # Construct some helper warp functions for getting tid data
         if self.compute_backend == ComputeBackend.WARP:
-            self._get_thread_data_2d = _get_thread_data_2d
-            self._get_thread_data_3d = _get_thread_data_3d
+            self._get_thread_data = _get_thread_data
             self.prepare_bc_auxilary_data = prepare_bc_auxilary_data
 
     @partial(jit, static_argnums=(0,), inline=True)
diff --git a/xlb/operator/boundary_masker/indices_boundary_masker.py b/xlb/operator/boundary_masker/indices_boundary_masker.py
index fdc4331..16650fe 100644
--- a/xlb/operator/boundary_masker/indices_boundary_masker.py
+++ b/xlb/operator/boundary_masker/indices_boundary_masker.py
@@ -31,18 +31,12 @@ def are_indices_in_interior(self, indices, shape):
         Check if each 2D or 3D index is inside the bounds of the domain with the given shape and not
         at its boundary.
 
-        :param indices: List of tuples, where each tuple contains indices for each dimension.
+        :param indices: Array of indices, where each column contains indices for each dimension.
         :param shape: Tuple representing the shape of the domain (nx, ny) for 2D or (nx, ny, nz) for 3D.
-        :return: List of boolean flags where each flag indicates whether the corresponding index is inside the bounds.
+        :return: Array of boolean flags where each flag indicates whether the corresponding index is inside the bounds.
         """
-        # Ensure that the number of dimensions in indices matches the domain shape
-        dim = len(shape)
-        if len(indices) != dim:
-            raise ValueError(f"Indices tuple must have {dim} dimensions to match the domain shape.")
-
-        # Check each index tuple and return a list of boolean flags
-        flags = [all(0 < idx[d] < shape[d] - 1 for d in range(dim)) for idx in np.array(indices).T]
-        return flags
+        d = self.velocity_set.d  # spatial axes only: the Warp path pads 2D indices with a z = 0 row that can never be "interior"
+        return np.all((np.asarray(indices)[:d] > 0) & (np.asarray(indices)[:d] < np.asarray(shape)[:d, np.newaxis] - 1), axis=0)
 
     @Operator.register_backend(ComputeBackend.JAX)
     # TODO HS: figure out why uncommenting the line below fails unlike other operators!
@@ -70,11 +64,12 @@ def jax_implementation(self, bclist, bc_mask, missing_mask, start_index=None):
             assert bc.indices is not None, f"Please specify indices associated with the {bc.__class__.__name__} BC!"
             assert bc.mesh_vertices is None, f"Please use MeshBoundaryMasker operator if {bc.__class__.__name__} is imposed on a mesh (e.g. STL)!"
             id_number = bc.id
-            local_indices = np.array(bc.indices) - np.array(start_index)[:, np.newaxis]
+            bc_indices = np.array(bc.indices)
+            local_indices = bc_indices - np.array(start_index)[:, np.newaxis]
             padded_indices = local_indices + np.array(shift_tup)[:, np.newaxis]
             bmap = bmap.at[tuple(padded_indices)].set(id_number)
-            if any(self.are_indices_in_interior(bc.indices, domain_shape)) and bc.needs_padding:
-                # checking if all indices associated with this BC are in the interior of the domain (not at the boundary).
+            if any(self.are_indices_in_interior(bc_indices, domain_shape)) and bc.needs_padding:
+                # checking if any index associated with this BC is in the interior of the domain.
                 # This flag is needed e.g. if the no-slip geometry is anywhere but at the boundaries of the computational domain.
                 if dim == 2:
                     grid_mask = grid_mask.at[:, padded_indices[0], padded_indices[1]].set(True)
@@ -103,59 +98,9 @@ def _construct_warp(self):
         _c = self.velocity_set.c
         _q = wp.constant(self.velocity_set.q)
 
-        # Construct the warp 2D kernel
-        @wp.kernel
-        def kernel2d(
-            indices: wp.array2d(dtype=wp.int32),
-            id_number: wp.array1d(dtype=wp.uint8),
-            is_interior: wp.array1d(dtype=wp.bool),
-            bc_mask: wp.array3d(dtype=wp.uint8),
-            missing_mask: wp.array3d(dtype=wp.bool),
-            start_index: wp.vec2i,
-        ):
-            # Get the index of indices
-            ii = wp.tid()
-
-            # Get local indices
-            index = wp.vec2i()
-            index[0] = indices[0, ii] - start_index[0]
-            index[1] = indices[1, ii] - start_index[1]
-
-            # Check if index is in bounds
-            if index[0] >= 0 and index[0] < missing_mask.shape[1] and index[1] >= 0 and index[1] < missing_mask.shape[2]:
-                # Stream indices
-                for l in range(_q):
-                    # Get the index of the streaming direction
-                    pull_index = wp.vec2i()
-                    push_index = wp.vec2i()
-                    for d in range(self.velocity_set.d):
-                        pull_index[d] = index[d] - _c[d, l]
-                        push_index[d] = index[d] + _c[d, l]
-
-                    # set bc_mask for all bc indices
-                    bc_mask[0, index[0], index[1]] = id_number[ii]
-
-                    # check if pull index is out of bound
-                    # These directions will have missing information after streaming
-                    if pull_index[0] < 0 or pull_index[0] >= missing_mask.shape[1] or pull_index[1] < 0 or pull_index[1] >= missing_mask.shape[2]:
-                        # Set the missing mask
-                        missing_mask[l, index[0], index[1]] = True
-
-                    # handling geometries in the interior of the computational domain
-                    elif (
-                        is_interior[ii]
-                        and push_index[0] >= 0
-                        and push_index[0] < missing_mask.shape[1]
-                        and push_index[1] >= 0
-                        and push_index[1] < missing_mask.shape[2]
-                    ):
-                        # Set the missing mask
-                        missing_mask[l, push_index[0], push_index[1]] = True
-                        bc_mask[0, push_index[0], push_index[1]] = id_number[ii]
-
         # Construct the warp 3D kernel
         @wp.kernel
-        def kernel3d(
+        def kernel(
             indices: wp.array2d(dtype=wp.int32),
             id_number: wp.array1d(dtype=wp.uint8),
             is_interior: wp.array1d(dtype=wp.bool),
@@ -220,46 +165,72 @@ def kernel3d(
                         missing_mask[l, push_index[0], push_index[1], push_index[2]] = True
                         bc_mask[0, push_index[0], push_index[1], push_index[2]] = id_number[ii]
 
-        kernel = kernel3d if self.velocity_set.d == 3 else kernel2d
-
         return None, kernel
 
     @Operator.register_backend(ComputeBackend.WARP)
     def warp_implementation(self, bclist, bc_mask, missing_mask, start_index=None):
-        dim = self.velocity_set.d
-        index_list = [[] for _ in range(dim)]
-        id_list = []
-        is_interior = []
+        # Pre-allocate for the total number of indices; np.asarray handles lists, tuples and arrays alike
+        max_size = sum(np.asarray(bc.indices).shape[1] for bc in bclist if bc.indices is not None)
+        indices = np.zeros((3, max_size), dtype=np.int32)
+        id_numbers = np.zeros(max_size, dtype=np.uint8)
+        is_interior = np.zeros(max_size, dtype=bool)
+
+        current_index = 0
         for bc in bclist:
             assert bc.indices is not None, f'Please specify indices associated with the {bc.__class__.__name__} BC using keyword "indices"!'
             assert bc.mesh_vertices is None, f"Please use MeshBoundaryMasker operator if {bc.__class__.__name__} is imposed on a mesh (e.g. STL)!"
-            for d in range(dim):
-                index_list[d] += bc.indices[d]
-            id_list += [bc.id] * len(bc.indices[0])
-            is_interior += self.are_indices_in_interior(bc.indices, bc_mask[0].shape) if bc.needs_padding else [False] * len(bc.indices[0])
 
-            # We are done with bc.indices. Remove them from BC objects
+            bc_indices = np.asarray(bc.indices)
+            num_indices = bc_indices.shape[1]
+
+            # Ensure indices are 3D
+            if bc_indices.shape[0] == 2:
+                bc_indices = np.vstack([bc_indices, np.zeros(num_indices, dtype=int)])
+
+            # Add indices to the pre-allocated array
+            indices[:, current_index : current_index + num_indices] = bc_indices
+
+            # Set id numbers
+            id_numbers[current_index : current_index + num_indices] = bc.id
+
+            # Set is_interior flags
+            if bc.needs_padding:
+                is_interior[current_index : current_index + num_indices] = self.are_indices_in_interior(bc_indices, bc_mask[0].shape)
+            else:
+                is_interior[current_index : current_index + num_indices] = False
+
+            current_index += num_indices
+
+            # Remove indices from BC objects
             bc.__dict__.pop("indices", None)
 
-        indices = wp.array2d(index_list, dtype=wp.int32)
-        id_number = wp.array1d(id_list, dtype=wp.uint8)
-        is_interior = wp.array1d(is_interior, dtype=wp.bool)
+        # Trim arrays to actual size
+        indices = indices[:, :current_index]
+        id_numbers = id_numbers[:current_index]
+        is_interior = is_interior[:current_index]
+
+        # Convert to Warp arrays
+        wp_indices = wp.array(indices, dtype=wp.int32)
+        wp_id_numbers = wp.array(id_numbers, dtype=wp.uint8)
+        wp_is_interior = wp.array(is_interior, dtype=wp.bool)
 
         if start_index is None:
-            start_index = (0,) * dim
+            start_index = wp.vec3i(0, 0, 0)
+        else:
+            start_index = wp.vec3i(*(tuple(start_index) + (0, 0))[:3])  # zero-pad a 2D start index, as done for bc.indices above
 
         # Launch the warp kernel
         wp.launch(
             self.warp_kernel,
+            dim=current_index,
             inputs=[
-                indices,
-                id_number,
-                is_interior,
+                wp_indices,
+                wp_id_numbers,
+                wp_is_interior,
                 bc_mask,
                 missing_mask,
                 start_index,
             ],
-            dim=indices.shape[1],
         )
 
         return bc_mask, missing_mask
diff --git a/xlb/operator/collision/bgk.py b/xlb/operator/collision/bgk.py
index 60f63ef..115ed9a 100644
--- a/xlb/operator/collision/bgk.py
+++ b/xlb/operator/collision/bgk.py
@@ -36,34 +36,7 @@ def functional(f: Any, feq: Any, rho: Any, u: Any):
 
         # Construct the warp kernel
         @wp.kernel
-        def kernel2d(
-            f: wp.array3d(dtype=Any),
-            feq: wp.array3d(dtype=Any),
-            fout: wp.array3d(dtype=Any),
-            rho: wp.array3d(dtype=Any),
-            u: wp.array3d(dtype=Any),
-        ):
-            # Get the global index
-            i, j = wp.tid()
-            index = wp.vec2i(i, j)
-
-            # Load needed values
-            _f = _f_vec()
-            _feq = _f_vec()
-            for l in range(self.velocity_set.q):
-                _f[l] = f[l, index[0], index[1]]
-                _feq[l] = feq[l, index[0], index[1]]
-
-            # Compute the collision
-            _fout = functional(_f, _feq, rho, u)
-
-            # Write the result
-            for l in range(self.velocity_set.q):
-                fout[l, index[0], index[1]] = self.store_dtype(_fout[l])
-
-        # Construct the warp kernel
-        @wp.kernel
-        def kernel3d(
+        def kernel(
             f: wp.array4d(dtype=Any),
             feq: wp.array4d(dtype=Any),
             fout: wp.array4d(dtype=Any),
@@ -88,8 +61,6 @@ def kernel3d(
             for l in range(self.velocity_set.q):
                 fout[l, index[0], index[1], index[2]] = self.store_dtype(_fout[l])
 
-        kernel = kernel3d if self.velocity_set.d == 3 else kernel2d
-
         return functional, kernel
 
     @Operator.register_backend(ComputeBackend.WARP)
diff --git a/xlb/operator/collision/forced_collision.py b/xlb/operator/collision/forced_collision.py
index 31ef392..2036bab 100644
--- a/xlb/operator/collision/forced_collision.py
+++ b/xlb/operator/collision/forced_collision.py
@@ -52,39 +52,7 @@ def functional(f: Any, feq: Any, rho: Any, u: Any):
 
         # Construct the warp kernel
         @wp.kernel
-        def kernel2d(
-            f: wp.array3d(dtype=Any),
-            feq: wp.array3d(dtype=Any),
-            fout: wp.array3d(dtype=Any),
-            rho: wp.array3d(dtype=Any),
-            u: wp.array3d(dtype=Any),
-        ):
-            # Get the global index
-            i, j = wp.tid()
-            index = wp.vec2i(i, j)  # TODO: Warp needs to fix this
-
-            # Load needed values
-            _f = _f_vec()
-            _feq = _f_vec()
-            _d = self.velocity_set.d
-            for l in range(self.velocity_set.q):
-                _f[l] = f[l, index[0], index[1]]
-                _feq[l] = feq[l, index[0], index[1]]
-            _u = _u_vec()
-            for l in range(_d):
-                _u[l] = u[l, index[0], index[1]]
-            _rho = rho[0, index[0], index[1]]
-
-            # Compute the collision
-            _fout = functional(_f, _feq, _rho, _u)
-
-            # Write the result
-            for l in range(self.velocity_set.q):
-                fout[l, index[0], index[1]] = _fout[l]
-
-        # Construct the warp kernel
-        @wp.kernel
-        def kernel3d(
+        def kernel(
             f: wp.array4d(dtype=Any),
             feq: wp.array4d(dtype=Any),
             fout: wp.array4d(dtype=Any),
@@ -114,8 +82,6 @@ def kernel3d(
             for l in range(self.velocity_set.q):
                 fout[l, index[0], index[1], index[2]] = _fout[l]
 
-        kernel = kernel3d if self.velocity_set.d == 3 else kernel2d
-
         return functional, kernel
 
     @Operator.register_backend(ComputeBackend.WARP)
diff --git a/xlb/operator/collision/kbc.py b/xlb/operator/collision/kbc.py
index cc2fb04..d94f5eb 100644
--- a/xlb/operator/collision/kbc.py
+++ b/xlb/operator/collision/kbc.py
@@ -263,30 +263,7 @@ def entropic_scalar_product(
 
         # Construct the functional
         @wp.func
-        def functional2d(
-            f: Any,
-            feq: Any,
-            rho: Any,
-            u: Any,
-        ):
-            # Compute shear and delta_s
-            fneq = f - feq
-            shear = decompose_shear_d2q9(fneq)
-            delta_s = shear * rho  # TODO: Check this
-
-            # Perform collision
-            delta_h = fneq - delta_s
-            two = self.compute_dtype(2.0)
-            gamma = _inv_beta - (two - _inv_beta) * entropic_scalar_product(delta_s, delta_h, feq) / (
-                _epsilon + entropic_scalar_product(delta_h, delta_h, feq)
-            )
-            fout = f - _beta * (two * delta_s + gamma * delta_h)
-
-            return fout
-
-        # Construct the functional
-        @wp.func
-        def functional3d(
+        def functional(
             f: Any,
             feq: Any,
             rho: Any,
@@ -309,39 +286,7 @@ def functional3d(
 
         # Construct the warp kernel
         @wp.kernel
-        def kernel2d(
-            f: wp.array3d(dtype=Any),
-            feq: wp.array3d(dtype=Any),
-            fout: wp.array3d(dtype=Any),
-            rho: wp.array3d(dtype=Any),
-            u: wp.array3d(dtype=Any),
-        ):
-            # Get the global index
-            i, j = wp.tid()
-            index = wp.vec2i(i, j)  # TODO: Warp needs to fix this
-
-            # Load needed values
-            _f = _f_vec()
-            _feq = _f_vec()
-            _d = self.velocity_set.d
-            for l in range(self.velocity_set.q):
-                _f[l] = f[l, index[0], index[1]]
-                _feq[l] = feq[l, index[0], index[1]]
-            _u = _u_vec()
-            for l in range(_d):
-                _u[l] = u[l, index[0], index[1]]
-            _rho = rho[0, index[0], index[1]]
-
-            # Compute the collision
-            _fout = functional(_f, _feq, _rho, _u)
-
-            # Write the result
-            for l in range(self.velocity_set.q):
-                fout[l, index[0], index[1]] = self.store_dtype(_fout[l])
-
-        # Construct the warp kernel
-        @wp.kernel
-        def kernel3d(
+        def kernel(
             f: wp.array4d(dtype=Any),
             feq: wp.array4d(dtype=Any),
             fout: wp.array4d(dtype=Any),
@@ -371,9 +316,6 @@ def kernel3d(
             for l in range(self.velocity_set.q):
                 fout[l, index[0], index[1], index[2]] = self.store_dtype(_fout[l])
 
-        functional = functional3d if self.velocity_set.d == 3 else functional2d
-        kernel = kernel3d if self.velocity_set.d == 3 else kernel2d
-
         return functional, kernel
 
     @Operator.register_backend(ComputeBackend.WARP)
diff --git a/xlb/operator/equilibrium/quadratic_equilibrium.py b/xlb/operator/equilibrium/quadratic_equilibrium.py
index ba337f0..62cc041 100644
--- a/xlb/operator/equilibrium/quadratic_equilibrium.py
+++ b/xlb/operator/equilibrium/quadratic_equilibrium.py
@@ -61,7 +61,7 @@ def functional(
 
         # Construct the warp kernel
         @wp.kernel
-        def kernel3d(
+        def kernel(
             rho: wp.array4d(dtype=Any),
             u: wp.array4d(dtype=Any),
             f: wp.array4d(dtype=Any),
@@ -81,29 +81,6 @@ def kernel3d(
             for l in range(self.velocity_set.q):
                 f[l, index[0], index[1], index[2]] = self.store_dtype(feq[l])
 
-        @wp.kernel
-        def kernel2d(
-            rho: wp.array3d(dtype=Any),
-            u: wp.array3d(dtype=Any),
-            f: wp.array3d(dtype=Any),
-        ):
-            # Get the global index
-            i, j = wp.tid()
-            index = wp.vec2i(i, j)
-
-            # Get the equilibrium
-            _u = _u_vec()
-            for d in range(self.velocity_set.d):
-                _u[d] = u[d, index[0], index[1]]
-            _rho = rho[0, index[0], index[1]]
-            feq = functional(_rho, _u)
-
-            # Set the output
-            for l in range(self.velocity_set.q):
-                f[l, index[0], index[1]] = self.store_dtype(feq[l])
-
-        kernel = kernel3d if self.velocity_set.d == 3 else kernel2d
-
         return functional, kernel
 
     @Operator.register_backend(ComputeBackend.WARP)
diff --git a/xlb/operator/force/exact_difference_force.py b/xlb/operator/force/exact_difference_force.py
index b4da602..ec1ef5b 100644
--- a/xlb/operator/force/exact_difference_force.py
+++ b/xlb/operator/force/exact_difference_force.py
@@ -86,33 +86,7 @@ def functional(f_postcollision: Any, feq: Any, rho: Any, u: Any):
 
         # Construct the warp kernel
         @wp.kernel
-        def kernel2d(
-            f_postcollision: Any,
-            feq: Any,
-            fout: wp.array3d(dtype=Any),
-            rho: wp.array3d(dtype=Any),
-            u: wp.array3d(dtype=Any),
-        ):
-            # Get the global index
-            i, j = wp.tid()
-            index = wp.vec2i(i, j)
-
-            # Load needed values
-            _u = _u_vec()
-            for l in range(_d):
-                _u[l] = u[l, index[0], index[1]]
-            _rho = rho[0, index[0], index[1]]
-
-            # Compute the collision
-            _fout = functional(f_postcollision, feq, _rho, _u)
-
-            # Write the result
-            for l in range(self.velocity_set.q):
-                fout[l, index[0], index[1]] = self.store_dtype(_fout[l])
-
-        # Construct the warp kernel
-        @wp.kernel
-        def kernel3d(
+        def kernel(
             f_postcollision: Any,
             feq: Any,
             fout: wp.array4d(dtype=Any),
@@ -136,7 +110,6 @@ def kernel3d(
             for l in range(self.velocity_set.q):
                 fout[l, index[0], index[1], index[2]] = self.store_dtype(_fout[l])
 
-        kernel = kernel3d if self.velocity_set.d == 3 else kernel2d
         return functional, kernel
 
     @Operator.register_backend(ComputeBackend.WARP)
diff --git a/xlb/operator/force/momentum_transfer.py b/xlb/operator/force/momentum_transfer.py
index 8b0aacf..dbd5307 100644
--- a/xlb/operator/force/momentum_transfer.py
+++ b/xlb/operator/force/momentum_transfer.py
@@ -102,62 +102,7 @@ def _construct_warp(self):
 
         # Construct the warp kernel
         @wp.kernel
-        def kernel2d(
-            f_0: wp.array3d(dtype=Any),
-            f_1: wp.array3d(dtype=Any),
-            bc_mask: wp.array3d(dtype=wp.uint8),
-            missing_mask: wp.array3d(dtype=wp.bool),
-            force: wp.array(dtype=Any),
-        ):
-            # Get the global index
-            i, j = wp.tid()
-            index = wp.vec2i(i, j)
-
-            # Get the boundary id
-            _boundary_id = bc_mask[0, index[0], index[1]]
-            _missing_mask = _missing_mask_vec()
-            for l in range(self.velocity_set.q):
-                # TODO fix vec bool
-                if missing_mask[l, index[0], index[1]]:
-                    _missing_mask[l] = wp.uint8(1)
-                else:
-                    _missing_mask[l] = wp.uint8(0)
-
-            # Determin if boundary is an edge by checking if center is missing
-            is_edge = wp.bool(False)
-            if _boundary_id == wp.uint8(_no_slip_id):
-                if _missing_mask[_zero_index] == wp.uint8(0):
-                    is_edge = wp.bool(True)
-
-            # If the boundary is an edge then add the momentum transfer
-            m = _u_vec()
-            if is_edge:
-                # Get the distribution function
-                f_post_collision = _f_vec()
-                for l in range(self.velocity_set.q):
-                    f_post_collision[l] = f_0[l, index[0], index[1]]
-
-                # Apply streaming (pull method)
-                timestep = 0
-                f_post_stream = self.stream.warp_functional(f_0, index)
-                f_post_stream = self.no_slip_bc_instance.warp_functional(index, timestep, _missing_mask, f_0, f_1, f_post_collision, f_post_stream)
-
-                # Compute the momentum transfer
-                for d in range(self.velocity_set.d):
-                    m[d] = self.compute_dtype(0.0)
-                    for l in range(self.velocity_set.q):
-                        if _missing_mask[l] == wp.uint8(1):
-                            phi = f_post_collision[_opp_indices[l]] + f_post_stream[l]
-                            if _c[d, _opp_indices[l]] == 1:
-                                m[d] += phi
-                            elif _c[d, _opp_indices[l]] == -1:
-                                m[d] -= phi
-
-            wp.atomic_add(force, 0, m)
-
-        # Construct the warp kernel
-        @wp.kernel
-        def kernel3d(
+        def kernel(
             f_0: wp.array4d(dtype=Any),
             f_1: wp.array4d(dtype=Any),
             bc_mask: wp.array4d(dtype=wp.uint8),
@@ -210,9 +155,6 @@ def kernel3d(
 
             wp.atomic_add(force, 0, m)
 
-        # Return the correct kernel
-        kernel = kernel3d if self.velocity_set.d == 3 else kernel2d
-
         return None, kernel
 
     @Operator.register_backend(ComputeBackend.WARP)
diff --git a/xlb/operator/macroscopic/first_moment.py b/xlb/operator/macroscopic/first_moment.py
index 329a71f..cb99a9f 100644
--- a/xlb/operator/macroscopic/first_moment.py
+++ b/xlb/operator/macroscopic/first_moment.py
@@ -38,7 +38,7 @@ def functional(
             return u
 
         @wp.kernel
-        def kernel3d(
+        def kernel(
             f: wp.array4d(dtype=Any),
             rho: wp.array4d(dtype=Any),
             u: wp.array4d(dtype=Any),
@@ -55,26 +55,6 @@ def kernel3d(
             for d in range(self.velocity_set.d):
                 u[d, index[0], index[1], index[2]] = self.store_dtype(_u[d])
 
-        @wp.kernel
-        def kernel2d(
-            f: wp.array3d(dtype=Any),
-            rho: wp.array3d(dtype=Any),
-            u: wp.array3d(dtype=Any),
-        ):
-            i, j = wp.tid()
-            index = wp.vec2i(i, j)
-
-            _f = _f_vec()
-            for l in range(self.velocity_set.q):
-                _f[l] = f[l, index[0], index[1]]
-            _rho = rho[0, index[0], index[1]]
-            _u = functional(_f, _rho)
-
-            for d in range(self.velocity_set.d):
-                u[d, index[0], index[1]] = self.store_dtype(_u[d])
-
-        kernel = kernel3d if self.velocity_set.d == 3 else kernel2d
-
         return functional, kernel
 
     @Operator.register_backend(ComputeBackend.WARP)
diff --git a/xlb/operator/macroscopic/macroscopic.py b/xlb/operator/macroscopic/macroscopic.py
index b574436..ab1193b 100644
--- a/xlb/operator/macroscopic/macroscopic.py
+++ b/xlb/operator/macroscopic/macroscopic.py
@@ -37,7 +37,7 @@ def functional(f: _f_vec):
             return rho, u
 
         @wp.kernel
-        def kernel3d(
+        def kernel(
             f: wp.array4d(dtype=Any),
             rho: wp.array4d(dtype=Any),
             u: wp.array4d(dtype=Any),
@@ -54,26 +54,6 @@ def kernel3d(
             for d in range(self.velocity_set.d):
                 u[d, index[0], index[1], index[2]] = self.store_dtype(_u[d])
 
-        @wp.kernel
-        def kernel2d(
-            f: wp.array3d(dtype=Any),
-            rho: wp.array3d(dtype=Any),
-            u: wp.array3d(dtype=Any),
-        ):
-            i, j = wp.tid()
-            index = wp.vec2i(i, j)
-
-            _f = _f_vec()
-            for l in range(self.velocity_set.q):
-                _f[l] = f[l, index[0], index[1]]
-            _rho, _u = functional(_f)
-
-            rho[0, index[0], index[1]] = self.store_dtype(_rho)
-            for d in range(self.velocity_set.d):
-                u[d, index[0], index[1]] = self.store_dtype(_u[d])
-
-        kernel = kernel3d if self.velocity_set.d == 3 else kernel2d
-
         return functional, kernel
 
     @Operator.register_backend(ComputeBackend.WARP)
diff --git a/xlb/operator/macroscopic/second_moment.py b/xlb/operator/macroscopic/second_moment.py
index 687b38a..6c7e70e 100644
--- a/xlb/operator/macroscopic/second_moment.py
+++ b/xlb/operator/macroscopic/second_moment.py
@@ -79,7 +79,7 @@ def functional(
 
         # Construct the kernel
         @wp.kernel
-        def kernel3d(
+        def kernel(
             f: wp.array4d(dtype=Any),
             pi: wp.array4d(dtype=Any),
         ):
@@ -97,27 +97,6 @@ def kernel3d(
             for d in range(_pi_dim):
                 pi[d, index[0], index[1], index[2]] = self.store_dtype(_pi[d])
 
-        @wp.kernel
-        def kernel2d(
-            f: wp.array3d(dtype=Any),
-            pi: wp.array3d(dtype=Any),
-        ):
-            # Get the global index
-            i, j = wp.tid()
-            index = wp.vec2i(i, j)
-
-            # Get the equilibrium
-            _f = _f_vec()
-            for l in range(self.velocity_set.q):
-                _f[l] = f[l, index[0], index[1]]
-            _pi = functional(_f)
-
-            # Set the output
-            for d in range(_pi_dim):
-                pi[d, index[0], index[1]] = self.store_dtype(_pi[d])
-
-        kernel = kernel3d if self.velocity_set.d == 3 else kernel2d
-
         return functional, kernel
 
     @Operator.register_backend(ComputeBackend.WARP)
diff --git a/xlb/operator/macroscopic/zero_moment.py b/xlb/operator/macroscopic/zero_moment.py
index d0fbf51..8abb4de 100644
--- a/xlb/operator/macroscopic/zero_moment.py
+++ b/xlb/operator/macroscopic/zero_moment.py
@@ -27,7 +27,7 @@ def functional(f: _f_vec):
             return rho
 
         @wp.kernel
-        def kernel3d(
+        def kernel(
             f: wp.array4d(dtype=Any),
             rho: wp.array4d(dtype=Any),
         ):
@@ -41,23 +41,6 @@ def kernel3d(
 
             rho[0, index[0], index[1], index[2]] = _rho
 
-        @wp.kernel
-        def kernel2d(
-            f: wp.array3d(dtype=Any),
-            rho: wp.array3d(dtype=Any),
-        ):
-            i, j = wp.tid()
-            index = wp.vec2i(i, j)
-
-            _f = _f_vec()
-            for l in range(self.velocity_set.q):
-                _f[l] = f[l, index[0], index[1]]
-            _rho = functional(_f)
-
-            rho[0, index[0], index[1]] = _rho
-
-        kernel = kernel3d if self.velocity_set.d == 3 else kernel2d
-
         return functional, kernel
 
     @Operator.register_backend(ComputeBackend.WARP)
diff --git a/xlb/operator/stepper/nse_stepper.py b/xlb/operator/stepper/nse_stepper.py
index 99431eb..e08e95c 100644
--- a/xlb/operator/stepper/nse_stepper.py
+++ b/xlb/operator/stepper/nse_stepper.py
@@ -140,27 +140,7 @@ def apply_bc(
             return f_result
 
         @wp.func
-        def get_thread_data_2d(
-            f0_buffer: wp.array3d(dtype=Any),
-            f1_buffer: wp.array3d(dtype=Any),
-            missing_mask: wp.array3d(dtype=Any),
-            index: Any,
-        ):
-            # Read thread data for populations and missing mask
-            _f0_thread = _f_vec()
-            _f1_thread = _f_vec()
-            _missing_mask = _missing_mask_vec()
-            for l in range(self.velocity_set.q):
-                _f0_thread[l] = self.compute_dtype(f0_buffer[l, index[0], index[1]])
-                _f1_thread[l] = self.compute_dtype(f1_buffer[l, index[0], index[1]])
-                if missing_mask[l, index[0], index[1]]:
-                    _missing_mask[l] = wp.uint8(1)
-                else:
-                    _missing_mask[l] = wp.uint8(0)
-            return _f0_thread, _f1_thread, _missing_mask
-
-        @wp.func
-        def get_thread_data_3d(
+        def get_thread_data(
             f0_buffer: wp.array4d(dtype=Any),
             f1_buffer: wp.array4d(dtype=Any),
             missing_mask: wp.array4d(dtype=Any),
@@ -182,47 +162,7 @@ def get_thread_data_3d(
             return _f0_thread, _f1_thread, _missing_mask
 
         @wp.kernel
-        def kernel2d(
-            f_0: wp.array3d(dtype=Any),
-            f_1: wp.array3d(dtype=Any),
-            bc_mask: wp.array3d(dtype=Any),
-            missing_mask: wp.array3d(dtype=Any),
-            timestep: int,
-        ):
-            i, j = wp.tid()
-            index = wp.vec2i(i, j)
-
-            _boundary_id = bc_mask[0, index[0], index[1]]
-            if _boundary_id == wp.uint8(255):
-                return
-
-            # Apply streaming
-            _f_post_stream = self.stream.warp_functional(f_0, index)
-
-            _f0_thread, _f1_thread, _missing_mask = get_thread_data_2d(f_0, f_1, missing_mask, index)
-            _f_post_collision = _f0_thread
-
-            # Apply post-streaming boundary conditions
-            _f_post_stream = apply_bc(index, timestep, _boundary_id, _missing_mask, f_0, f_1, _f_post_collision, _f_post_stream, True)
-
-            # Compute rho and u
-            _rho, _u = self.macroscopic.warp_functional(_f_post_stream)
-
-            # Compute equilibrium
-            _feq = self.equilibrium.warp_functional(_rho, _u)
-
-            # Apply collision
-            _f_post_collision = self.collision.warp_functional(_f_post_stream, _feq, _rho, _u)
-
-            # Apply post-collision boundary conditions
-            _f_post_collision = apply_bc(index, timestep, _boundary_id, _missing_mask, f_0, f_1, _f_post_stream, _f_post_collision, False)
-
-            # Store the result in f_1
-            for l in range(self.velocity_set.q):
-                f_1[l, index[0], index[1]] = self.store_dtype(_f_post_collision[l])
-
-        @wp.kernel
-        def kernel3d(
+        def kernel(
             f_0: wp.array4d(dtype=Any),
             f_1: wp.array4d(dtype=Any),
             bc_mask: wp.array4d(dtype=Any),
@@ -239,7 +179,7 @@ def kernel3d(
             # Apply streaming
             _f_post_stream = self.stream.warp_functional(f_0, index)
 
-            _f0_thread, _f1_thread, _missing_mask = get_thread_data_3d(f_0, f_1, missing_mask, index)
+            _f0_thread, _f1_thread, _missing_mask = get_thread_data(f_0, f_1, missing_mask, index)
             _f_post_collision = _f0_thread
 
             # Apply post-streaming boundary conditions
@@ -261,9 +201,6 @@ def kernel3d(
                             f_0[_opp_indices[l], index[0], index[1], index[2]] = self.store_dtype(_f1_thread[_opp_indices[l]])
                 f_1[l, index[0], index[1], index[2]] = self.store_dtype(_f_post_collision[l])
 
-        # Return the correct kernel
-        kernel = kernel3d if self.velocity_set.d == 3 else kernel2d
-
         return None, kernel
 
     @Operator.register_backend(ComputeBackend.WARP)
diff --git a/xlb/operator/stream/stream.py b/xlb/operator/stream/stream.py
index dc2417a..247fa5a 100644
--- a/xlb/operator/stream/stream.py
+++ b/xlb/operator/stream/stream.py
@@ -55,50 +55,9 @@ def _construct_warp(self):
         _c = self.velocity_set.c
         _f_vec = wp.vec(self.velocity_set.q, dtype=self.compute_dtype)
 
-        # Construct the warp functional
-        @wp.func
-        def functional2d(
-            f: wp.array3d(dtype=Any),
-            index: Any,
-        ):
-            # Pull the distribution function
-            _f = _f_vec()
-            for l in range(self.velocity_set.q):
-                # Get pull index
-                pull_index = type(index)()
-                for d in range(self.velocity_set.d):
-                    pull_index[d] = index[d] - _c[d, l]
-
-                    # impose periodicity for out of bound values
-                    if pull_index[d] < 0:
-                        pull_index[d] = f.shape[d + 1] - 1
-                    elif pull_index[d] >= f.shape[d + 1]:
-                        pull_index[d] = 0
-
-                # Read the distribution function
-                _f[l] = self.compute_dtype(f[l, pull_index[0], pull_index[1]])
-
-            return _f
-
-        @wp.kernel
-        def kernel2d(
-            f_0: wp.array3d(dtype=Any),
-            f_1: wp.array3d(dtype=Any),
-        ):
-            # Get the global index
-            i, j = wp.tid()
-            index = wp.vec2i(i, j)
-
-            # Set the output
-            _f = functional2d(f_0, index)
-
-            # Write the output
-            for l in range(self.velocity_set.q):
-                f_1[l, index[0], index[1]] = self.store_dtype(_f[l])
-
         # Construct the funcional to get streamed indices
         @wp.func
-        def functional3d(
+        def functional(
             f: wp.array4d(dtype=Any),
             index: Any,
         ):
@@ -124,7 +83,7 @@ def functional3d(
 
         # Construct the warp kernel
         @wp.kernel
-        def kernel3d(
+        def kernel(
             f_0: wp.array4d(dtype=Any),
             f_1: wp.array4d(dtype=Any),
         ):
@@ -133,15 +92,12 @@ def kernel3d(
             index = wp.vec3i(i, j, k)
 
             # Set the output
-            _f = functional3d(f_0, index)
+            _f = functional(f_0, index)
 
             # Write the output
             for l in range(self.velocity_set.q):
                 f_1[l, index[0], index[1], index[2]] = self.store_dtype(_f[l])
 
-        functional = functional3d if self.velocity_set.d == 3 else functional2d
-        kernel = kernel3d if self.velocity_set.d == 3 else kernel2d
-
         return functional, kernel
 
     @Operator.register_backend(ComputeBackend.WARP)

From c0ea2a5be3ee92586d9293b7d9d243082caf4493 Mon Sep 17 00:00:00 2001
From: Mehdi Ataei <mehdi.ataei@autodesk.com>
Date: Fri, 18 Oct 2024 18:00:43 -0400
Subject: [PATCH 2/4] Changed the example mesh address

---
 examples/cfd/windtunnel_3d.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/cfd/windtunnel_3d.py b/examples/cfd/windtunnel_3d.py
index c83e2c9..96c79f7 100644
--- a/examples/cfd/windtunnel_3d.py
+++ b/examples/cfd/windtunnel_3d.py
@@ -74,8 +74,8 @@ def define_boundary_indices(self):
         walls = [box["bottom"][i] + box["top"][i] + box["front"][i] + box["back"][i] for i in range(self.velocity_set.d)]
         walls = np.unique(np.array(walls), axis=-1).tolist()
 
-        # Load the mesh
-        stl_filename = "examples/cfd/stl-files/DrivAer-Notchback.stl"
+        # Load the mesh (replace with your own mesh)
+        stl_filename = "../stl-files/DrivAer-Notchback.stl"
         mesh = trimesh.load_mesh(stl_filename, process=False)
         mesh_vertices = mesh.vertices
 

From 27c0205af5782b107ccde5a1ac133489b07873ec Mon Sep 17 00:00:00 2001
From: Mehdi Ataei <mehdi.ataei@autodesk.com>
Date: Fri, 18 Oct 2024 18:02:26 -0400
Subject: [PATCH 3/4] Fixed ruff issue

---
 xlb/operator/boundary_masker/indices_boundary_masker.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/xlb/operator/boundary_masker/indices_boundary_masker.py b/xlb/operator/boundary_masker/indices_boundary_masker.py
index 36f9a36..0c1f7e1 100644
--- a/xlb/operator/boundary_masker/indices_boundary_masker.py
+++ b/xlb/operator/boundary_masker/indices_boundary_masker.py
@@ -93,7 +93,6 @@ def jax_implementation(self, bclist, bc_mask, missing_mask, start_index=None):
             bc_mask = bc_mask.at[0].set(bmap[pad_x:-pad_x, pad_y:-pad_y, pad_z:-pad_z])
         return bc_mask, missing_mask
 
-
     def _construct_warp(self):
         # Make constants for warp
         _c = self.velocity_set.c

From ed5f6435182718a76b40e14c0a2a94dddc3b802d Mon Sep 17 00:00:00 2001
From: Mehdi Ataei <mehdi.ataei@autodesk.com>
Date: Mon, 21 Oct 2024 17:06:23 -0400
Subject: [PATCH 4/4] Added back BC kernels in an abstract manner

---
 .../test_bc_equilibrium_warp.py               |  2 +-
 .../test_bc_fullway_bounce_back_warp.py       |  7 ++--
 .../boundary_condition/bc_do_nothing.py       | 14 +++++++-
 .../boundary_condition/bc_equilibrium.py      | 15 +++++++-
 .../bc_extrapolation_outflow.py               | 14 +++++++-
 .../bc_fullway_bounce_back.py                 | 14 +++++++-
 .../bc_grads_approximation.py                 | 14 +++++++-
 .../bc_halfway_bounce_back.py                 | 13 ++++++-
 .../boundary_condition/bc_regularized.py      | 14 +++++++-
 xlb/operator/boundary_condition/bc_zouhe.py   | 14 +++++++-
 .../boundary_condition/boundary_condition.py  | 35 +++++++++++++++++++
 11 files changed, 145 insertions(+), 11 deletions(-)

diff --git a/tests/boundary_conditions/bc_equilibrium/test_bc_equilibrium_warp.py b/tests/boundary_conditions/bc_equilibrium/test_bc_equilibrium_warp.py
index 5eb0c10..6bd9311 100644
--- a/tests/boundary_conditions/bc_equilibrium/test_bc_equilibrium_warp.py
+++ b/tests/boundary_conditions/bc_equilibrium/test_bc_equilibrium_warp.py
@@ -72,7 +72,7 @@ def test_bc_equilibrium_warp(dim, velocity_set, grid_shape):
     f = f.numpy()
     f_post = f_post.numpy()
 
-    assert f.shape == (velocity_set.q,) + grid_shape
+    assert f.shape == ((velocity_set.q,) + grid_shape if dim == 3 else (velocity_set.q, grid_shape[0], grid_shape[1], 1))
 
     # Assert that the values are correct in the indices of the sphere
     weights = velocity_set.w
diff --git a/tests/boundary_conditions/bc_fullway_bounce_back/test_bc_fullway_bounce_back_warp.py b/tests/boundary_conditions/bc_fullway_bounce_back/test_bc_fullway_bounce_back_warp.py
index 10b9244..59c6c9d 100644
--- a/tests/boundary_conditions/bc_fullway_bounce_back/test_bc_fullway_bounce_back_warp.py
+++ b/tests/boundary_conditions/bc_fullway_bounce_back/test_bc_fullway_bounce_back_warp.py
@@ -58,7 +58,10 @@ def test_fullway_bounce_back_warp(dim, velocity_set, grid_shape):
     bc_mask, missing_mask = indices_boundary_masker([fullway_bc], bc_mask, missing_mask, start_index=None)
 
     # Generate a random field with the same shape
-    random_field = np.random.rand(velocity_set.q, *grid_shape).astype(np.float32)
+    if dim == 2:
+        random_field = np.random.rand(velocity_set.q, grid_shape[0], grid_shape[1], 1).astype(np.float32)
+    else:
+        random_field = np.random.rand(velocity_set.q, grid_shape[0], grid_shape[1], grid_shape[2]).astype(np.float32)
     # Add the random field to f_pre
     f_pre = wp.array(random_field)
 
@@ -71,7 +74,7 @@ def test_fullway_bounce_back_warp(dim, velocity_set, grid_shape):
     f = f_pre.numpy()
     f_post = f_post.numpy()
 
-    assert f.shape == (velocity_set.q,) + grid_shape
+    assert f.shape == ((velocity_set.q,) + grid_shape if dim == 3 else (velocity_set.q, grid_shape[0], grid_shape[1], 1))
 
     for i in range(velocity_set.q):
         np.allclose(
diff --git a/xlb/operator/boundary_condition/bc_do_nothing.py b/xlb/operator/boundary_condition/bc_do_nothing.py
index 67b343d..56a332f 100644
--- a/xlb/operator/boundary_condition/bc_do_nothing.py
+++ b/xlb/operator/boundary_condition/bc_do_nothing.py
@@ -64,4 +64,16 @@ def functional(
         ):
             return f_pre
 
-        return functional, None
+        kernel = self._construct_kernel(functional)
+
+        return functional, kernel
+
+    @Operator.register_backend(ComputeBackend.WARP)
+    def warp_implementation(self, f_pre, f_post, bc_mask, missing_mask):
+        # Launch the warp kernel
+        wp.launch(
+            self.warp_kernel,
+            inputs=[f_pre, f_post, bc_mask, missing_mask],
+            dim=f_pre.shape[1:],
+        )
+        return f_post
\ No newline at end of file
diff --git a/xlb/operator/boundary_condition/bc_equilibrium.py b/xlb/operator/boundary_condition/bc_equilibrium.py
index b4b957a..77f408f 100644
--- a/xlb/operator/boundary_condition/bc_equilibrium.py
+++ b/xlb/operator/boundary_condition/bc_equilibrium.py
@@ -88,4 +88,17 @@ def functional(
             _f = self.equilibrium_operator.warp_functional(_rho, _u)
             return _f
 
-        return functional, None
+        # Use the parent class's kernel and pass the functional
+        kernel = self._construct_kernel(functional)
+
+        return functional, kernel
+
+    @Operator.register_backend(ComputeBackend.WARP)
+    def warp_implementation(self, f_pre, f_post, bc_mask, missing_mask):
+        # Launch the warp kernel
+        wp.launch(
+            self.warp_kernel,
+            inputs=[f_pre, f_post, bc_mask, missing_mask],
+            dim=f_pre.shape[1:],
+        )
+        return f_post
\ No newline at end of file
diff --git a/xlb/operator/boundary_condition/bc_extrapolation_outflow.py b/xlb/operator/boundary_condition/bc_extrapolation_outflow.py
index 4a96c73..38657e5 100644
--- a/xlb/operator/boundary_condition/bc_extrapolation_outflow.py
+++ b/xlb/operator/boundary_condition/bc_extrapolation_outflow.py
@@ -193,4 +193,16 @@ def prepare_bc_auxilary_data(
                     _f[_opp_indices[l]] = (self.compute_dtype(1.0) - sound_speed) * f_pre[l] + sound_speed * f_aux
             return _f
 
-        return (functional, prepare_bc_auxilary_data), None
\ No newline at end of file
+        kernel = self._construct_kernel(functional)
+
+        return (functional, prepare_bc_auxilary_data), kernel
+
+    @Operator.register_backend(ComputeBackend.WARP)
+    def warp_implementation(self, f_pre, f_post, bc_mask, missing_mask):
+        # Launch the warp kernel
+        wp.launch(
+            self.warp_kernel,
+            inputs=[f_pre, f_post, bc_mask, missing_mask],
+            dim=f_pre.shape[1:],
+        )
+        return f_post
\ No newline at end of file
diff --git a/xlb/operator/boundary_condition/bc_fullway_bounce_back.py b/xlb/operator/boundary_condition/bc_fullway_bounce_back.py
index afe05de..19a3013 100644
--- a/xlb/operator/boundary_condition/bc_fullway_bounce_back.py
+++ b/xlb/operator/boundary_condition/bc_fullway_bounce_back.py
@@ -74,4 +74,16 @@ def functional(
                 fliped_f[l] = f_pre[_opp_indices[l]]
             return fliped_f
 
-        return functional, None
\ No newline at end of file
+        kernel = self._construct_kernel(functional)
+
+        return functional, kernel
+
+    @Operator.register_backend(ComputeBackend.WARP)
+    def warp_implementation(self, f_pre, f_post, bc_mask, missing_mask):
+        # Launch the warp kernel
+        wp.launch(
+            self.warp_kernel,
+            inputs=[f_pre, f_post, bc_mask, missing_mask],
+            dim=f_pre.shape[1:],
+        )
+        return f_post
\ No newline at end of file
diff --git a/xlb/operator/boundary_condition/bc_grads_approximation.py b/xlb/operator/boundary_condition/bc_grads_approximation.py
index 5806375..94ddba3 100644
--- a/xlb/operator/boundary_condition/bc_grads_approximation.py
+++ b/xlb/operator/boundary_condition/bc_grads_approximation.py
@@ -309,4 +309,16 @@ def functional_method2(
 
         functional = functional_method1
 
-        return functional, None
\ No newline at end of file
+        kernel = self._construct_kernel(functional)
+
+        return functional, kernel
+
+    @Operator.register_backend(ComputeBackend.WARP)
+    def warp_implementation(self, f_pre, f_post, bc_mask, missing_mask):
+        # Launch the warp kernel
+        wp.launch(
+            self.warp_kernel,
+            inputs=[f_pre, f_post, bc_mask, missing_mask],
+            dim=f_pre.shape[1:],
+        )
+        return f_post
\ No newline at end of file
diff --git a/xlb/operator/boundary_condition/bc_halfway_bounce_back.py b/xlb/operator/boundary_condition/bc_halfway_bounce_back.py
index ee68b50..bf04af0 100644
--- a/xlb/operator/boundary_condition/bc_halfway_bounce_back.py
+++ b/xlb/operator/boundary_condition/bc_halfway_bounce_back.py
@@ -87,5 +87,16 @@ def functional(
 
             return _f
 
-        return functional, None
+        kernel = self._construct_kernel(functional)
 
+        return functional, kernel
+
+    @Operator.register_backend(ComputeBackend.WARP)
+    def warp_implementation(self, f_pre, f_post, bc_mask, missing_mask):
+        # Launch the warp kernel
+        wp.launch(
+            self.warp_kernel,
+            inputs=[f_pre, f_post, bc_mask, missing_mask],
+            dim=f_pre.shape[1:],
+        )
+        return f_post
diff --git a/xlb/operator/boundary_condition/bc_regularized.py b/xlb/operator/boundary_condition/bc_regularized.py
index 12622e2..af4c783 100644
--- a/xlb/operator/boundary_condition/bc_regularized.py
+++ b/xlb/operator/boundary_condition/bc_regularized.py
@@ -266,4 +266,16 @@ def functional_pressure(
             functional = functional_velocity
         elif self.bc_type == "pressure":
             functional = functional_pressure
-        return functional, None
\ No newline at end of file
+        kernel = self._construct_kernel(functional)
+
+        return functional, kernel
+
+    @Operator.register_backend(ComputeBackend.WARP)
+    def warp_implementation(self, f_pre, f_post, bc_mask, missing_mask):
+        # Launch the warp kernel
+        wp.launch(
+            self.warp_kernel,
+            inputs=[f_pre, f_post, bc_mask, missing_mask],
+            dim=f_pre.shape[1:],
+        )
+        return f_post
diff --git a/xlb/operator/boundary_condition/bc_zouhe.py b/xlb/operator/boundary_condition/bc_zouhe.py
index c5d9498..a92d909 100644
--- a/xlb/operator/boundary_condition/bc_zouhe.py
+++ b/xlb/operator/boundary_condition/bc_zouhe.py
@@ -283,4 +283,16 @@ def functional_pressure(
         elif self.bc_type == "velocity":
             functional = functional_pressure
 
-        return functional, None
\ No newline at end of file
+        kernel = self._construct_kernel(functional)
+
+        return functional, kernel
+
+    @Operator.register_backend(ComputeBackend.WARP)
+    def warp_implementation(self, f_pre, f_post, bc_mask, missing_mask):
+        # Launch the warp kernel
+        wp.launch(
+            self.warp_kernel,
+            inputs=[f_pre, f_post, bc_mask, missing_mask],
+            dim=f_pre.shape[1:],
+        )
+        return f_post
diff --git a/xlb/operator/boundary_condition/boundary_condition.py b/xlb/operator/boundary_condition/boundary_condition.py
index 2cd2a11..bf1eef2 100644
--- a/xlb/operator/boundary_condition/boundary_condition.py
+++ b/xlb/operator/boundary_condition/boundary_condition.py
@@ -111,3 +111,38 @@ def prepare_bc_auxilary_data(self, f_pre, f_post, bc_mask, missing_mask):
         currently being called after collision only.
         """
         return f_post
+
+    def _construct_kernel(self, functional):
+        """
+        Constructs the warp kernel for the boundary condition.
+        The functional is specific to each boundary condition and should be passed as an argument.
+        """
+        _id = wp.uint8(self.id)
+
+        # Construct the warp kernel
+        @wp.kernel
+        def kernel(
+            f_pre: wp.array4d(dtype=Any),
+            f_post: wp.array4d(dtype=Any),
+            bc_mask: wp.array4d(dtype=wp.uint8),
+            missing_mask: wp.array4d(dtype=wp.bool),
+        ):
+            # Get the global index
+            i, j, k = wp.tid()
+            index = wp.vec3i(i, j, k)
+
+            # read tid data
+            _f_pre, _f_post, _boundary_id, _missing_mask = self._get_thread_data(f_pre, f_post, bc_mask, missing_mask, index)
+
+            # Apply the boundary condition
+            if _boundary_id == _id:
+                timestep = 0
+                _f = functional(index, timestep, _missing_mask, f_pre, f_post, _f_pre, _f_post)
+            else:
+                _f = _f_post
+
+            # Write the result
+            for l in range(self.velocity_set.q):
+                f_post[l, index[0], index[1], index[2]] = self.store_dtype(_f[l])
+
+        return kernel