diff --git a/.github/workflows/tests_unit.yml b/.github/workflows/tests_unit.yml index 28262cf16..5233b8fe6 100644 --- a/.github/workflows/tests_unit.yml +++ b/.github/workflows/tests_unit.yml @@ -16,6 +16,12 @@ on: # Allow manual triggers workflow_dispatch: +env: + XDG_CACHE_HOME: /home/runner/work/milabench/cache + XDG_DATA_HOME: /home/runner/work/milabench/data + XDG_CONFIG_HOME: /home/runner/work/milabench/config + XDG_STATE_HOME: /home/runner/work/milabench/state + jobs: tests: @@ -27,6 +33,15 @@ jobs: cancel-in-progress: true steps: + - uses: easimon/maximize-build-space@master + with: + remove-dotnet: 'true' + remove-codeql: 'true' + remove-haskell: 'true' + remove-android: 'true' + build-mount-path: /home/runner/work/milabench/ + root-reserve-mb: 20000 + - uses: actions/checkout@v3 - uses: actions/setup-python@v5 @@ -35,24 +50,25 @@ jobs: - name: dependencies run: | + cd /home/runner/work/milabench/milabench + pip install virtualenv + virtualenv ./env + source ./env/bin/activate + # pip install -U pip pip install poetry - poetry env use python3.10 - source $(poetry env info -p)/bin/activate + poetry export --dev -f requirements.txt --output requirements-dev.txt # # poetry does not work when installing those !? # pip install antlr4-python3-runtime==4.9.3 pip install -e . pip install -e benchmate - # - # - # - poetry install --with dev + pip install coverage pytest-regressions pytest-cov pytest - name: Simple Template run: | - source $(poetry env info -p)/bin/activate + source ./env/bin/activate milabench new --name simplebench --template simple cd benchmarks/simplebench make tests @@ -61,7 +77,7 @@ jobs: - name: Voir Template run: | - source $(poetry env info -p)/bin/activate + source ./env/bin/activate milabench new --name voirbench --template voir cd benchmarks/voirbench make tests @@ -74,10 +90,10 @@ jobs: - name: tests env: - MILABENCH_HF_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN}} + HUGGING_FACE_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN}} run: | - source $(poetry env info -p)/bin/activate - coverage run --source=milabench -m pytest --ignore=tests/integration tests/ + source ./env/bin/activate + coverage run --source=milabench -m pytest --ignore=tests/integration tests/ -vv -x coverage report -m coverage xml diff --git a/.gitignore b/.gitignore index 1bc7f879c..265dda6de 100644 --- a/.gitignore +++ b/.gitignore @@ -35,6 +35,7 @@ dependencies/ benchmarks/recursiongfn/gflownet benchmarks/recursiongfn/logs/ benchmarks/llm/tune/ +benchmarks/vjepa/jepa scripts/inventory.yaml output/ diff --git a/.pin/constraints-cuda-gnn.txt b/.pin/constraints-cuda-gnn.txt index ce1c65e30..cacbdfeae 100644 --- a/.pin/constraints-cuda-gnn.txt +++ b/.pin/constraints-cuda-gnn.txt @@ -104,9 +104,9 @@ mpmath==1.3.0 # via # botorch # gpytorch # sympy -msgpack==1.0.8 +msgpack==1.1.0 # via blosc2 -multidict==6.0.5 +multidict==6.1.0 # via # aiohttp # yarl @@ -236,7 +236,7 @@ requests==2.32.3 # via # torch-geometric # wandb -rich==13.8.0 +rich==13.8.1 # via voir scikit-learn==1.5.1 # via @@ -310,6 +310,7 @@ typeguard==2.13.3 # linear-operator typing-extensions==4.12.2 # via + # multidict # reactivex # tables # torch @@ -330,7 +331,7 @@ wandb==0.17.9 # via -r benchmarks/recursiongfn/requirements.in werkzeug==3.0.4 # via tensorboard -yarl==1.11.0 +yarl==1.11.1 # via aiohttp # The following packages are considered to be unsafe in a requirements file: diff --git a/.pin/constraints-cuda-torch.txt b/.pin/constraints-cuda-torch.txt index d0b1aa709..2717ed4ef 100644 --- a/.pin/constraints-cuda-torch.txt +++
b/.pin/constraints-cuda-torch.txt @@ -2,36 +2,43 @@ # This file is autogenerated by pip-compile with Python 3.10 # by the following command: # -# pip-compile --output-file=.pin/constraints-cuda-torch.txt .pin/tmp-constraints.txt benchmarks/brax/requirements.in benchmarks/diffusion/requirements.in benchmarks/dinov2/requirements.in benchmarks/flops/requirements.in benchmarks/huggingface/requirements.in benchmarks/lightning/requirements.in benchmarks/llama/requirements.in benchmarks/llava/requirements.in benchmarks/llm/requirements.in benchmarks/super-slomo/requirements.in benchmarks/timm/requirements.in benchmarks/torchatari/requirements.in benchmarks/torchvision/requirements.in benchmarks/torchvision_ddp/requirements.in constraints/extra/torch.cuda.txt +# pip-compile --output-file=.pin/constraints-cuda-torch.txt .pin/tmp-constraints.txt benchmarks/brax/requirements.in benchmarks/diffusion/requirements.in benchmarks/dinov2/requirements.in benchmarks/flops/requirements.in benchmarks/geo_gnn/requirements-pre.in benchmarks/geo_gnn/requirements.in benchmarks/huggingface/requirements.in benchmarks/lightning/requirements.in benchmarks/llama/requirements.in benchmarks/llava/requirements.in benchmarks/llm/requirements.in benchmarks/purejaxrl/requirements.in benchmarks/recursiongfn/requirements.in benchmarks/rlhf/requirements.in benchmarks/timm/requirements.in benchmarks/torchatari/requirements.in benchmarks/torchvision/requirements.in benchmarks/torchvision_ddp/requirements.in benchmarks/vjepa/requirements.in constraints/extra/torch.cuda.txt # --extra-index-url https://pypi.ngc.nvidia.com --extra-index-url https://download.pytorch.org/whl/cu121 --find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html +--find-links https://data.pyg.org/whl/torch-2.4.0+cu121.html --trusted-host pypi.ngc.nvidia.com absl-py==2.1.0 # via # brax # chex + # distrax # dm-env # ml-collections # mujoco # mujoco-mjx # optax # orbax-checkpoint + # rlax # tensorboard + # tensorflow-probability accelerate==0.34.2 # via # -r benchmarks/diffusion/requirements.in # -r benchmarks/llava/requirements.in # -r benchmarks/llm/requirements.in + # -r benchmarks/rlhf/requirements.in # diffusers + # trl aiohappyeyeballs==2.4.0 # via aiohttp aiohttp==3.10.5 # via # datasets # fsspec + # torch-geometric aiosignal==1.3.1 # via aiohttp antlr4-python3-runtime==4.9.3 @@ -42,59 +49,114 @@ argklass==1.4.4 # via # -r benchmarks/diffusion/requirements.in # -r benchmarks/llm/requirements.in + # -r benchmarks/purejaxrl/requirements.in +astroid==3.2.4 + # via pylint asttokens==2.4.1 # via giving async-timeout==4.0.3 # via aiohttp attrs==24.2.0 # via aiohttp +beartype==0.18.5 + # via -r benchmarks/vjepa/requirements.in +black==24.8.0 + # via navix blinker==1.8.2 # via flask blobfile==3.0.0 # via # -r benchmarks/llm/requirements.txt # torchtune +blosc2==2.7.1 + # via tables +botorch==0.11.3 + # via -r benchmarks/recursiongfn/requirements.in +braceexpand==0.1.7 + # via + # -r benchmarks/vjepa/requirements.in + # webdataset brax==0.10.5 - # via -r benchmarks/brax/requirements.in + # via + # -r benchmarks/brax/requirements.in + # -r benchmarks/purejaxrl/requirements.in cantilever==0.1.0 # via -r benchmarks/torchatari/requirements.in certifi==2024.8.30 - # via requests + # via + # requests + # sentry-sdk charset-normalizer==3.3.2 # via requests chex==0.1.86 - # via optax + # via + # distrax + # evosax + # flashbax + # gymnax + # optax + # rlax click==8.1.7 - # via flask + # via + # black + # flask + # wandb cloudpickle==3.0.0 # via # gym 
# gymnasium # submitit + # tensorflow-probability codefind==0.1.7 # via ptera contextlib2==21.6.0 # via ml-collections -datasets==2.21.0 +contourpy==1.3.0 + # via matplotlib +cvxopt==1.3.2 + # via -r benchmarks/recursiongfn/requirements.in +cycler==0.12.1 + # via matplotlib +datasets==3.0.0 # via # -r benchmarks/diffusion/requirements.in # -r benchmarks/llama/requirements.in # -r benchmarks/llava/requirements.in + # -r benchmarks/rlhf/requirements.in # torchtune + # trl +decorator==5.1.1 + # via tensorflow-probability +decord==0.6.0 + # via -r benchmarks/vjepa/requirements.in diffusers[torch]==0.30.2 # via -r benchmarks/diffusion/requirements.in dill==0.3.8 # via # datasets # multiprocess + # pylint +distrax==0.1.5 + # via + # -r benchmarks/purejaxrl/requirements.in + # rlax dm-env==1.6 # via # brax # envpool + # rlax dm-tree==0.1.8 - # via dm-env + # via + # dm-env + # tensorflow-probability +docker-pycreds==0.4.0 + # via wandb docstring-parser==0.16 # via tyro +dotmap==1.3.30 + # via evosax +einops==0.8.0 + # via -r benchmarks/vjepa/requirements.in envpool==0.8.4 # via -r benchmarks/torchatari/requirements.in etils[epath,epy]==1.9.4 @@ -104,7 +166,11 @@ etils[epath,epy]==1.9.4 # mujoco-mjx # optax # orbax-checkpoint -executing==1.2.0 +evosax==0.1.6 + # via -r benchmarks/purejaxrl/requirements.in +exceptiongroup==1.2.2 + # via pytest +executing==2.1.0 # via varname fairscale==0.4.13 # via @@ -126,6 +192,10 @@ fire==0.6.0 # via # -r benchmarks/llama/requirements.in # -r benchmarks/llm/requirements.txt +flake8==7.1.1 + # via navix +flashbax==0.1.2 + # via -r benchmarks/purejaxrl/requirements.in flask==3.0.3 # via # brax @@ -133,7 +203,15 @@ flask==3.0.3 flask-cors==5.0.0 # via brax flax==0.9.0 - # via brax + # via + # -r benchmarks/purejaxrl/requirements.in + # brax + # evosax + # flashbax + # gymnax + # navix +fonttools==4.53.1 + # via matplotlib frozenlist==1.4.1 # via # aiohttp @@ -146,45 +224,66 @@ fsspec[http]==2024.6.1 # lightning # pytorch-lightning # torch + # torch-geometric fvcore==0.1.5.post20221221 # via -r benchmarks/dinov2/requirements.in -giving==0.4.2 +gast==0.6.0 + # via tensorflow-probability +gitdb==4.0.11 + # via gitpython +gitpython==3.1.43 + # via + # -r benchmarks/recursiongfn/requirements.in + # wandb +giving==0.4.3 # via # ptera # voir glfw==2.7.0 # via mujoco +gpytorch==1.12 + # via + # -r benchmarks/recursiongfn/requirements.in + # botorch grpcio==1.66.1 # via # brax # tensorboard -gym==0.23.1 +gym==0.26.2 # via # -r benchmarks/torchatari/requirements.in # brax # envpool + # gymnax gym-notices==0.0.8 # via gym gymnasium==0.29.1 - # via envpool + # via + # envpool + # gymnax +gymnax==0.0.8 + # via + # -c .pin/../constraints/cuda.txt + # -r benchmarks/purejaxrl/requirements.in hjson==3.1.0 # via argklass -huggingface-hub==0.24.6 +huggingface-hub==0.24.7 # via # -r benchmarks/timm/requirements.in # accelerate # datasets # diffusers + # timm # tokenizers # torchtune # transformers humanize==4.10.0 # via orbax-checkpoint -idna==3.8 +idna==3.10 # via # requests # yarl -importlib-metadata==8.4.0 +importlib-metadata==8.5.0 # via diffusers importlib-resources==6.4.5 # via @@ -192,23 +291,33 @@ importlib-resources==6.4.5 # cantilever # etils # torchcompat +iniconfig==2.0.0 + # via pytest iopath==0.1.10 # via # -r benchmarks/dinov2/requirements.in # fvcore +isort==5.13.2 + # via pylint itsdangerous==2.2.0 # via flask jax[cuda12]==0.4.31 # via # -r benchmarks/brax/requirements.in + # -r benchmarks/purejaxrl/requirements.in # -r constraints/extra/torch.cuda.txt # brax # chex + 
# distrax + # evosax + # flashbax # flax + # gymnax # jaxopt # mujoco-mjx # optax # orbax-checkpoint + # rlax jax-cuda12-pjrt==0.4.31 # via jax-cuda12-plugin jax-cuda12-plugin[with-cuda]==0.4.31 @@ -217,18 +326,30 @@ jaxlib==0.4.31 # via # brax # chex + # distrax + # evosax + # flashbax + # gymnax # jax # jaxopt # mujoco-mjx # optax # orbax-checkpoint + # rlax jaxopt==0.8.3 # via brax +jaxtyping==0.2.34 + # via linear-operator jinja2==3.1.4 # via # brax # flask # torch + # torch-geometric +joblib==1.4.2 + # via scikit-learn +kiwisolver==1.4.7 + # via matplotlib lightning==2.4.0 # via -r benchmarks/lightning/requirements.in lightning-utilities==0.11.7 @@ -236,6 +357,10 @@ lightning-utilities==0.11.7 # lightning # pytorch-lightning # torchmetrics +linear-operator==0.5.2 + # via + # botorch + # gpytorch lxml==5.3.0 # via blobfile markdown==3.7 @@ -246,19 +371,32 @@ markupsafe==2.1.5 # via # jinja2 # werkzeug +matplotlib==3.9.2 + # via + # evosax + # gymnax + # seaborn +mccabe==0.7.0 + # via + # flake8 + # pylint mdurl==0.1.2 # via markdown-it-py ml-collections==0.1.1 # via brax -ml-dtypes==0.4.0 +ml-dtypes==0.5.0 # via # jax # jaxlib # tensorstore mpmath==1.3.0 - # via sympy -msgpack==1.0.8 # via + # botorch + # gpytorch + # sympy +msgpack==1.1.0 + # via + # blosc2 # flax # orbax-checkpoint mujoco==3.2.2 @@ -267,52 +405,88 @@ mujoco==3.2.2 # mujoco-mjx mujoco-mjx==3.2.2 # via brax -multidict==6.0.5 +multidict==6.1.0 # via # aiohttp # yarl +multipledispatch==1.0.0 + # via botorch multiprocess==0.70.16 # via datasets +mypy-extensions==1.0.0 + # via black +navix==0.7.0 + # via -r benchmarks/purejaxrl/requirements.in +ndindex==1.8 + # via blosc2 nest-asyncio==1.6.0 # via orbax-checkpoint networkx==3.3 - # via torch + # via + # -r benchmarks/recursiongfn/requirements.in + # torch +numexpr==2.10.1 + # via + # blosc2 + # tables numpy==1.26.4 # via + # -r benchmarks/geo_gnn/requirements.in # -r benchmarks/llava/requirements.in - # -r benchmarks/super-slomo/requirements.in + # -r benchmarks/purejaxrl/requirements.in # -r benchmarks/torchatari/requirements.in + # -r benchmarks/vjepa/requirements.in # accelerate + # blosc2 + # botorch # brax # chex + # contourpy # datasets + # decord # diffusers + # distrax # dm-env # envpool + # evosax # fairscale + # flashbax # fvcore # gym # gymnasium # jax # jaxlib # jaxopt + # matplotlib # ml-dtypes # mujoco + # navix + # numexpr # opencv-python # opt-einsum # optax # orbax-checkpoint # pandas # pyarrow + # pyro-ppl + # rdkit + # rlax + # scikit-learn # scipy + # seaborn + # tables # tensorboard # tensorboardx + # tensorflow-probability # tensorstore + # torch-geometric # torchmetrics # torchtune # torchvision # transformers # trimesh + # trl + # webdataset # xformers nvidia-cublas-cu12==12.1.3.1 # via @@ -367,19 +541,23 @@ nvidia-nvtx-cu12==12.1.105 omegaconf==2.3.0 # via # -r benchmarks/dinov2/requirements.in + # -r benchmarks/recursiongfn/requirements.in # torchtune # voir opencv-python==4.10.0.84 - # via -r benchmarks/super-slomo/requirements.in + # via -r benchmarks/vjepa/requirements.in opt-einsum==3.3.0 - # via jax + # via + # jax + # pyro-ppl optax==0.2.3 # via + # -r benchmarks/purejaxrl/requirements.in # brax # flax optree==0.12.1 # via envpool -orbax-checkpoint==0.6.1 +orbax-checkpoint==0.6.3 # via # brax # flax @@ -388,18 +566,30 @@ ovld==0.3.9 packaging==24.1 # via # accelerate + # black # datasets # envpool # huggingface-hub # lightning # lightning-utilities + # matplotlib + # pytest # pytorch-lightning + # setuptools-scm + # tables # tensorboard # 
tensorboardx # torchmetrics # transformers pandas==2.2.2 - # via datasets + # via + # -r benchmarks/geo_gnn/requirements.in + # -r benchmarks/recursiongfn/requirements.in + # -r benchmarks/vjepa/requirements.in + # datasets + # seaborn +pathspec==0.12.1 + # via black pillow==10.4.0 # via # -r benchmarks/huggingface/requirements.in @@ -407,55 +597,104 @@ pillow==10.4.0 # brax # diffusers # fvcore + # matplotlib + # navix + # rdkit # torchvision +platformdirs==4.3.3 + # via + # black + # pylint + # wandb +pluggy==1.5.0 + # via pytest portalocker==2.10.1 # via iopath -protobuf==5.28.0 +protobuf==5.28.1 # via # orbax-checkpoint # tensorboard # tensorboardx + # wandb psutil==5.9.8 # via # accelerate + # torch-geometric # voir + # wandb ptera==1.4.1 # via voir +py-cpuinfo==9.0.0 + # via + # blosc2 + # tables pyarrow==17.0.0 - # via datasets + # via + # -r benchmarks/recursiongfn/requirements.in + # datasets +pycodestyle==2.12.1 + # via flake8 pycryptodomex==3.20.0 # via blobfile +pyflakes==3.2.0 + # via flake8 pygments==2.18.0 # via rich +pylint==3.2.7 + # via navix pyopengl==3.1.7 # via mujoco +pyparsing==3.1.4 + # via + # matplotlib + # torch-geometric +pyro-api==0.1.2 + # via pyro-ppl +pyro-ppl==1.9.1 + # via + # -r benchmarks/recursiongfn/requirements.in + # botorch +pytest==8.3.3 + # via navix python-dateutil==2.9.0.post0 - # via pandas + # via + # matplotlib + # pandas pytinyrenderer==0.0.14 # via brax pytorch-lightning==2.4.0 # via lightning -pytz==2024.1 +pytz==2024.2 # via pandas pyyaml==6.0.2 # via # -r benchmarks/llm/requirements.in # -r benchmarks/timm/requirements.in + # -r benchmarks/vjepa/requirements.in # accelerate # datasets + # evosax # flax # fvcore + # gymnax # huggingface-hub # lightning # ml-collections # omegaconf # orbax-checkpoint # pytorch-lightning + # timm # transformers + # wandb + # webdataset # yacs +rdkit==2024.3.5 + # via + # -r benchmarks/geo_gnn/requirements.in + # -r benchmarks/recursiongfn/requirements.in reactivex==4.0.4 # via giving -regex==2024.7.24 +regex==2024.9.11 # via # diffusers # tiktoken @@ -466,64 +705,111 @@ requests==2.32.3 # diffusers # huggingface-hub # tiktoken + # torch-geometric # transformers -rich==13.8.0 + # wandb +rich==13.8.1 # via # flax # tyro # voir +rlax==0.1.6 + # via navix safetensors==0.4.5 # via # -r benchmarks/timm/requirements.in # accelerate # diffusers + # timm # torchtune # transformers +scikit-learn==1.5.2 + # via gpytorch scipy==1.14.1 # via # -r benchmarks/dinov2/requirements.in + # -r benchmarks/recursiongfn/requirements.in + # botorch # brax + # gpytorch # jax # jaxlib # jaxopt + # linear-operator # mujoco-mjx + # scikit-learn + # torch-cluster + # torch-sparse +seaborn==0.13.2 + # via gymnax sentencepiece==0.2.0 # via # -r benchmarks/llama/requirements.in # torchtune +sentry-sdk==2.14.0 + # via wandb +setproctitle==1.3.3 + # via wandb +setuptools-scm==8.1.0 + # via navix shtab==1.7.1 # via tyro six==1.16.0 # via # asttokens + # docker-pycreds # fire # ml-collections # python-dateutil # tensorboard + # tensorflow-probability +smmap==5.0.1 + # via gitdb submitit==1.5.1 - # via -r benchmarks/dinov2/requirements.in + # via + # -r benchmarks/dinov2/requirements.in + # -r benchmarks/vjepa/requirements.in sympy==1.13.2 # via torch +tables==3.10.1 + # via -r benchmarks/recursiongfn/requirements.in tabulate==0.9.0 # via fvcore tensorboard==2.17.1 - # via -r benchmarks/torchatari/requirements.in + # via + # -r benchmarks/recursiongfn/requirements.in + # -r benchmarks/torchatari/requirements.in tensorboard-data-server==0.7.2 # 
via tensorboard tensorboardx==2.6.2.2 # via brax +tensorflow-probability==0.24.0 + # via distrax tensorstore==0.1.65 # via + # flashbax # flax # orbax-checkpoint termcolor==2.4.0 # via # fire # fvcore +threadpoolctl==3.5.0 + # via scikit-learn tiktoken==0.7.0 # via torchtune +timm==1.0.9 + # via -r benchmarks/vjepa/requirements.in tokenizers==0.19.1 # via transformers +tomli==2.0.1 + # via + # black + # pylint + # pytest + # setuptools-scm +tomlkit==0.13.2 + # via pylint toolz==0.12.1 # via chex torch==2.4.0+cu121 @@ -531,25 +817,50 @@ torch==2.4.0+cu121 # -r benchmarks/brax/requirements.in # -r benchmarks/dinov2/requirements.in # -r benchmarks/flops/requirements.in + # -r benchmarks/geo_gnn/requirements-pre.in # -r benchmarks/huggingface/requirements.in # -r benchmarks/lightning/requirements.in # -r benchmarks/llama/requirements.in # -r benchmarks/llava/requirements.in # -r benchmarks/llm/requirements.in # -r benchmarks/llm/requirements.txt - # -r benchmarks/super-slomo/requirements.in + # -r benchmarks/purejaxrl/requirements.in + # -r benchmarks/recursiongfn/requirements.in + # -r benchmarks/rlhf/requirements.in # -r benchmarks/timm/requirements.in # -r benchmarks/torchatari/requirements.in # -r benchmarks/torchvision/requirements.in # -r benchmarks/torchvision_ddp/requirements.in + # -r benchmarks/vjepa/requirements.in # accelerate + # botorch # diffusers # fairscale # lightning + # linear-operator + # pyro-ppl # pytorch-lightning + # timm # torchmetrics # torchvision + # trl # xformers +torch-cluster==1.6.3+pt24cu121 + # via + # -r benchmarks/geo_gnn/requirements.in + # -r benchmarks/recursiongfn/requirements.in +torch-geometric==2.6.0 + # via + # -r benchmarks/geo_gnn/requirements.in + # -r benchmarks/recursiongfn/requirements.in +torch-scatter==2.1.2+pt24cu121 + # via + # -r benchmarks/geo_gnn/requirements.in + # -r benchmarks/recursiongfn/requirements.in +torch-sparse==0.6.18+pt24cu121 + # via + # -r benchmarks/geo_gnn/requirements.in + # -r benchmarks/recursiongfn/requirements.in torchao==0.3.1+cu121 # via torchtune torchcompat==1.1.4 @@ -560,7 +871,7 @@ torchcompat==1.1.4 # -r benchmarks/torchatari/requirements.in # -r benchmarks/torchvision/requirements.in # -r benchmarks/torchvision_ddp/requirements.in -torchmetrics==1.4.1 +torchmetrics==1.4.2 # via # -r benchmarks/dinov2/requirements.in # lightning @@ -573,15 +884,15 @@ torchvision==0.19.0+cu121 # -r benchmarks/dinov2/requirements.in # -r benchmarks/flops/requirements.in # -r benchmarks/lightning/requirements.in - # -r benchmarks/super-slomo/requirements.in # -r benchmarks/timm/requirements.in # -r benchmarks/torchvision/requirements.in # -r benchmarks/torchvision_ddp/requirements.in + # -r benchmarks/vjepa/requirements.in + # timm tqdm==4.66.5 # via # -r benchmarks/diffusion/requirements.in # -r benchmarks/flops/requirements.in - # -r benchmarks/super-slomo/requirements.in # -r benchmarks/torchvision/requirements.in # -r benchmarks/torchvision_ddp/requirements.in # datasets @@ -589,7 +900,9 @@ tqdm==4.66.5 # huggingface-hub # iopath # lightning + # pyro-ppl # pytorch-lightning + # torch-geometric # torchtune # transformers transformers==4.44.2 @@ -599,42 +912,60 @@ transformers==4.44.2 # -r benchmarks/llama/requirements.in # -r benchmarks/llava/requirements.in # -r benchmarks/llm/requirements.in + # -r benchmarks/rlhf/requirements.in + # trl trimesh==4.4.9 # via # brax # mujoco-mjx triton==3.0.0 # via torch +trl==0.10.1 + # via -r benchmarks/rlhf/requirements.in +typeguard==2.13.3 + # via + # jaxtyping + # linear-operator 
types-protobuf==5.27.0.20240907 # via envpool typing-extensions==4.12.2 # via + # astroid + # black # brax # chex # envpool # etils + # flashbax # flax # gymnasium # huggingface-hub # iopath # lightning # lightning-utilities + # multidict + # navix # optree # orbax-checkpoint # pytorch-lightning # reactivex # submitit + # tables # torch # tyro tyro==0.8.10 - # via -r benchmarks/torchatari/requirements.in + # via + # -r benchmarks/torchatari/requirements.in + # navix + # trl tzdata==2024.1 # via pandas -urllib3==2.2.2 +urllib3==2.2.3 # via # blobfile # requests -varname==0.10.0 + # sentry-sdk +varname==0.13.3 # via giving voir==0.2.19 # via @@ -643,16 +974,26 @@ voir==0.2.19 # -r benchmarks/diffusion/requirements.in # -r benchmarks/dinov2/requirements.in # -r benchmarks/flops/requirements.in + # -r benchmarks/geo_gnn/requirements.in # -r benchmarks/huggingface/requirements.in # -r benchmarks/lightning/requirements.in # -r benchmarks/llama/requirements.in # -r benchmarks/llava/requirements.in # -r benchmarks/llm/requirements.in - # -r benchmarks/super-slomo/requirements.in + # -r benchmarks/purejaxrl/requirements.in + # -r benchmarks/recursiongfn/requirements.in + # -r benchmarks/rlhf/requirements.in # -r benchmarks/timm/requirements.in # -r benchmarks/torchatari/requirements.in # -r benchmarks/torchvision/requirements.in # -r benchmarks/torchvision_ddp/requirements.in + # -r benchmarks/vjepa/requirements.in +wandb==0.18.0 + # via + # -r benchmarks/recursiongfn/requirements.in + # navix +webdataset==0.2.100 + # via -r benchmarks/vjepa/requirements.in werkzeug==3.0.4 # via # flask @@ -665,9 +1006,9 @@ xxhash==3.5.0 # via datasets yacs==0.1.8 # via fvcore -yarl==1.11.0 +yarl==1.11.1 # via aiohttp -zipp==3.20.1 +zipp==3.20.2 # via # etils # importlib-metadata diff --git a/benchmarks/brax/requirements.cuda.txt b/benchmarks/brax/requirements.cuda.txt index 5666c0798..aa883171c 100644 --- a/benchmarks/brax/requirements.cuda.txt +++ b/benchmarks/brax/requirements.cuda.txt @@ -7,6 +7,7 @@ --extra-index-url https://pypi.ngc.nvidia.com --extra-index-url https://download.pytorch.org/whl/cu121 --find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html +--find-links https://data.pyg.org/whl/torch-2.4.0+cu121.html --trusted-host pypi.ngc.nvidia.com absl-py==2.1.0 @@ -72,7 +73,7 @@ etils[epath,epy]==1.9.4 # mujoco-mjx # optax # orbax-checkpoint -executing==1.2.0 +executing==2.1.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # varname @@ -99,7 +100,7 @@ fsspec==2024.6.1 # -c .pin/../.pin/constraints-cuda-torch.txt # etils # torch -giving==0.4.2 +giving==0.4.3 # via # -c .pin/../.pin/constraints-cuda-torch.txt # ptera @@ -112,7 +113,7 @@ grpcio==1.66.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt # brax -gym==0.23.1 +gym==0.26.2 # via # -c .pin/../.pin/constraints-cuda-torch.txt # brax @@ -189,7 +190,7 @@ ml-collections==0.1.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt # brax -ml-dtypes==0.4.0 +ml-dtypes==0.5.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # jax @@ -199,7 +200,7 @@ mpmath==1.3.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # sympy -msgpack==1.0.8 +msgpack==1.1.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # flax @@ -323,7 +324,7 @@ optax==0.2.3 # -c .pin/../.pin/constraints-cuda-torch.txt # brax # flax -orbax-checkpoint==0.6.1 +orbax-checkpoint==0.6.3 # via # -c .pin/../.pin/constraints-cuda-torch.txt # brax @@ -340,7 +341,7 @@ pillow==10.4.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # brax -protobuf==5.28.0 
+protobuf==5.28.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt # orbax-checkpoint @@ -376,7 +377,7 @@ reactivex==4.0.4 # via # -c .pin/../.pin/constraints-cuda-torch.txt # giving -rich==13.8.0 +rich==13.8.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt # flax @@ -435,7 +436,7 @@ typing-extensions==4.12.2 # orbax-checkpoint # reactivex # torch -varname==0.10.0 +varname==0.13.3 # via # -c .pin/../.pin/constraints-cuda-torch.txt # giving @@ -452,7 +453,7 @@ xformers==0.0.27.post2 # via # -c .pin/../.pin/constraints-cuda-torch.txt # -r .pin/../constraints/extra/torch.cuda.txt -zipp==3.20.1 +zipp==3.20.2 # via # -c .pin/../.pin/constraints-cuda-torch.txt # etils diff --git a/benchmarks/diffusion/main.py b/benchmarks/diffusion/main.py index 4b060c05d..0bcb67d50 100755 --- a/benchmarks/diffusion/main.py +++ b/benchmarks/diffusion/main.py @@ -229,6 +229,12 @@ def batch_size(x): return observer, bench_monitor def main(): + # --- + import resource + _, hard = resource.getrlimit(resource.RLIMIT_NOFILE) + resource.setrlimit(resource.RLIMIT_NOFILE, (hard, hard)) + # --- + from benchmate.metrics import StopProgram observer, monitor = prepare_voir() diff --git a/benchmarks/diffusion/requirements.cuda.txt b/benchmarks/diffusion/requirements.cuda.txt index ae6aad547..6a062a7a0 100644 --- a/benchmarks/diffusion/requirements.cuda.txt +++ b/benchmarks/diffusion/requirements.cuda.txt @@ -7,6 +7,7 @@ --extra-index-url https://pypi.ngc.nvidia.com --extra-index-url https://download.pytorch.org/whl/cu121 --find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html +--find-links https://data.pyg.org/whl/torch-2.4.0+cu121.html --trusted-host pypi.ngc.nvidia.com accelerate==0.34.2 @@ -59,7 +60,7 @@ codefind==0.1.7 # via # -c .pin/../.pin/constraints-cuda-torch.txt # ptera -datasets==2.21.0 +datasets==3.0.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/diffusion/requirements.in @@ -72,7 +73,7 @@ dill==0.3.8 # via # -c .pin/../.pin/constraints-cuda-torch.txt # datasets # multiprocess -executing==1.2.0 +executing==2.1.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # varname @@ -96,7 +97,7 @@ fsspec[http]==2024.6.1 # datasets # huggingface-hub # torch -giving==0.4.2 +giving==0.4.3 # via # -c .pin/../.pin/constraints-cuda-torch.txt # ptera @@ -105,7 +106,7 @@ hjson==3.1.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # argklass -huggingface-hub==0.24.6 +huggingface-hub==0.24.7 # via # -c .pin/../.pin/constraints-cuda-torch.txt # accelerate @@ -113,12 +114,12 @@ huggingface-hub==0.24.6 # diffusers # tokenizers # transformers -idna==3.8 +idna==3.10 # via # -c .pin/../.pin/constraints-cuda-torch.txt # requests # yarl -importlib-metadata==8.4.0 +importlib-metadata==8.5.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # diffusers @@ -158,7 +159,7 @@ mdurl==0.1.2 # via # -c .pin/../.pin/constraints-cuda-torch.txt # markdown-it-py -ml-dtypes==0.4.0 +ml-dtypes==0.5.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # jax @@ -167,7 +168,7 @@ mpmath==1.3.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # sympy -multidict==6.0.5 +multidict==6.1.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # aiohttp @@ -314,7 +315,7 @@ python-dateutil==2.9.0.post0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # pandas -pytz==2024.1 +pytz==2024.2 # via # -c .pin/../.pin/constraints-cuda-torch.txt # pandas @@ -330,7 +331,7 @@ reactivex==4.0.4 # via # -c .pin/../.pin/constraints-cuda-torch.txt # giving -regex==2024.7.24 +regex==2024.9.11 # via # -c
.pin/../.pin/constraints-cuda-torch.txt # diffusers @@ -342,7 +343,7 @@ requests==2.32.3 # diffusers # huggingface-hub # transformers -rich==13.8.0 +rich==13.8.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt # voir @@ -400,17 +401,18 @@ typing-extensions==4.12.2 # via # -c .pin/../.pin/constraints-cuda-torch.txt # huggingface-hub + # multidict # reactivex # torch tzdata==2024.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt # pandas -urllib3==2.2.2 +urllib3==2.2.3 # via # -c .pin/../.pin/constraints-cuda-torch.txt # requests -varname==0.10.0 +varname==0.13.3 # via # -c .pin/../.pin/constraints-cuda-torch.txt # giving @@ -427,11 +429,11 @@ xxhash==3.5.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # datasets -yarl==1.11.0 +yarl==1.11.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt # aiohttp -zipp==3.20.1 +zipp==3.20.2 # via # -c .pin/../.pin/constraints-cuda-torch.txt # importlib-metadata diff --git a/benchmarks/dinov2/requirements.cuda.txt b/benchmarks/dinov2/requirements.cuda.txt index 2b9a2ad5a..aef36dbf3 100644 --- a/benchmarks/dinov2/requirements.cuda.txt +++ b/benchmarks/dinov2/requirements.cuda.txt @@ -7,6 +7,7 @@ --extra-index-url https://pypi.ngc.nvidia.com --extra-index-url https://download.pytorch.org/whl/cu121 --find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html +--find-links https://data.pyg.org/whl/torch-2.4.0+cu121.html --trusted-host pypi.ngc.nvidia.com antlr4-python3-runtime==4.9.3 @@ -25,7 +26,7 @@ codefind==0.1.7 # via # -c .pin/../.pin/constraints-cuda-torch.txt # ptera -executing==1.2.0 +executing==2.1.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # varname @@ -42,7 +43,7 @@ fvcore==0.1.5.post20221221 # via # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/dinov2/requirements.in -giving==0.4.2 +giving==0.4.3 # via # -c .pin/../.pin/constraints-cuda-torch.txt # ptera @@ -88,7 +89,7 @@ mdurl==0.1.2 # via # -c .pin/../.pin/constraints-cuda-torch.txt # markdown-it-py -ml-dtypes==0.4.0 +ml-dtypes==0.5.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # jax @@ -231,7 +232,7 @@ reactivex==4.0.4 # via # -c .pin/../.pin/constraints-cuda-torch.txt # giving -rich==13.8.0 +rich==13.8.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt # voir @@ -268,7 +269,7 @@ torch==2.4.0+cu121 # torchmetrics # torchvision # xformers -torchmetrics==1.4.1 +torchmetrics==1.4.2 # via # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/dinov2/requirements.in @@ -293,7 +294,7 @@ typing-extensions==4.12.2 # reactivex # submitit # torch -varname==0.10.0 +varname==0.13.3 # via # -c .pin/../.pin/constraints-cuda-torch.txt # giving diff --git a/benchmarks/flops/requirements.cuda.txt b/benchmarks/flops/requirements.cuda.txt index da9d3cc6c..afb7ff130 100644 --- a/benchmarks/flops/requirements.cuda.txt +++ b/benchmarks/flops/requirements.cuda.txt @@ -7,6 +7,7 @@ --extra-index-url https://pypi.ngc.nvidia.com --extra-index-url https://download.pytorch.org/whl/cu121 --find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html +--find-links https://data.pyg.org/whl/torch-2.4.0+cu121.html --trusted-host pypi.ngc.nvidia.com antlr4-python3-runtime==4.9.3 @@ -21,7 +22,7 @@ codefind==0.1.7 # via # -c .pin/../.pin/constraints-cuda-torch.txt # ptera -executing==1.2.0 +executing==2.1.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # varname @@ -34,7 +35,7 @@ fsspec==2024.6.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt # torch -giving==0.4.2 +giving==0.4.3 # via # -c .pin/../.pin/constraints-cuda-torch.txt # ptera 
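Note on the `resource` block added to `benchmarks/diffusion/main.py` above: `resource.getrlimit` returns a `(soft, hard)` pair, and the intent of that hunk is to raise the soft open-file limit up to the hard ceiling, a common workaround for "Too many open files" when many DataLoader workers hold file descriptors. A minimal standalone sketch of the idiom (Unix-only, since the `resource` module is not available on Windows):

```python
import resource

# getrlimit returns (soft, hard); raise the soft limit up to the hard ceiling.
soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
resource.setrlimit(resource.RLIMIT_NOFILE, (hard, hard))
print(resource.getrlimit(resource.RLIMIT_NOFILE))  # soft now equals hard
```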
@@ -75,7 +76,7 @@ mdurl==0.1.2 # via # -c .pin/../.pin/constraints-cuda-torch.txt # markdown-it-py -ml-dtypes==0.4.0 +ml-dtypes==0.5.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # jax @@ -203,7 +204,7 @@ reactivex==4.0.4 # via # -c .pin/../.pin/constraints-cuda-torch.txt # giving -rich==13.8.0 +rich==13.8.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt # voir @@ -248,7 +249,7 @@ typing-extensions==4.12.2 # -c .pin/../.pin/constraints-cuda-torch.txt # reactivex # torch -varname==0.10.0 +varname==0.13.3 # via # -c .pin/../.pin/constraints-cuda-torch.txt # giving diff --git a/benchmarks/geo_gnn/dev.yaml b/benchmarks/geo_gnn/dev.yaml index 7fadaea5f..6f261c895 100644 --- a/benchmarks/geo_gnn/dev.yaml +++ b/benchmarks/geo_gnn/dev.yaml @@ -1,4 +1,4 @@ -dimenet: +pna: inherits: _defaults definition: . install-variant: cuda @@ -6,11 +6,11 @@ dimenet: plan: method: per_gpu argv: - --model: 'DimeNet' - --num-samples: 10000 - --use3d: True + --model: 'PNA' + --num-samples: 100000 + --batch-size: 4096 -pna: +dimenet: inherits: _defaults definition: . install-variant: cuda @@ -18,5 +18,7 @@ pna: plan: method: per_gpu argv: - --model: 'PNA' - --num-samples: 10000 \ No newline at end of file + --model: 'DimeNet' + --num-samples: 10000 + --use3d: True + --batch-size: 512 \ No newline at end of file diff --git a/benchmarks/geo_gnn/main.py b/benchmarks/geo_gnn/main.py index 714707f65..71e1c8827 100644 --- a/benchmarks/geo_gnn/main.py +++ b/benchmarks/geo_gnn/main.py @@ -9,6 +9,7 @@ from pcqm4m_subset import PCQM4Mv2Subset from torch_geometric.datasets import QM9 from torch_geometric.loader import DataLoader +from torch_geometric.nn import global_max_pool from benchmate.observer import BenchObserver @@ -102,26 +103,25 @@ def main(): args = parser().parse_args() def batch_size(x): - shape = x.y.shape - return shape[0] + # assert len(x.batch.unique()) == int(x.batch[-1] - x.batch[0] + 1) + return int(x.batch[-1] - x.batch[0] + 1) observer = BenchObserver(batch_size_fn=batch_size) - # train_dataset = PCQM4Mv2Subset(args.num_samples, args.root) - train_dataset = QM9(args.root) + train_dataset = PCQM4Mv2Subset(args.num_samples, args.root) sample = next(iter(train_dataset)) - info = models[args.model](args, - sample=sample, - degree=lambda: train_degree(train_dataset), + info = models[args.model]( + args, + sample=sample, + degree=lambda: train_degree(train_dataset), ) TRAIN_mean, TRAIN_std = ( mean(train_dataset).item(), std(train_dataset).item(), ) - print("Train mean: {}\tTrain std: {}".format(TRAIN_mean, TRAIN_std)) DataLoaderClass = DataLoader dataloader_kwargs = {} @@ -131,7 +131,7 @@ def batch_size(x): batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers, - **dataloader_kwargs + **dataloader_kwargs, ) device = accelerator.fetch_device(0) @@ -148,33 +148,26 @@ def batch_size(x): lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, args.epochs) num_batches = len(train_loader) - for epoch in range(1, args.epochs + 1): - model.train() + loader = observer.loader(train_loader) - for step, batch in enumerate(observer.iterate(train_loader)): - # QM9 => DataBatch(x=[290, 11], edge_index=[2, 602], edge_attr=[602, 4], y=[16, 19], pos=[290, 3], z=[290], smiles=[16], name=[16], idx=[16], batch=[290], ptr=[17]) - # PCQM4Mv2Subset => DataBatch(x=[229, 9], edge_index=[2, 476], edge_attr=[476, 3], y=[16], pos=[229, 3], smiles=[16], batch=[229], ptr=[17]) + model.train() # No eval ever. 
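Context for the new `batch_size` helper in the `benchmarks/geo_gnn/main.py` hunk above: a PyTorch Geometric `DataBatch` concatenates the nodes of every graph in the batch, and `batch.batch` maps each node to the index of the graph it belongs to, so counting rows of a per-node tensor reports node counts rather than graph counts. The spread of graph indices gives the true number of graphs. A toy sketch (hypothetical two-graph batch; assumes `torch_geometric` is installed):

```python
import torch
from torch_geometric.data import Batch, Data

# Two toy graphs with 3 and 2 nodes; Batch.from_data_list concatenates their
# nodes and records, per node, the index of the graph it came from.
g1 = Data(x=torch.randn(3, 4), edge_index=torch.tensor([[0, 1], [1, 2]]))
g2 = Data(x=torch.randn(2, 4), edge_index=torch.tensor([[0], [1]]))
batch = Batch.from_data_list([g1, g2])

print(batch.batch)                                # tensor([0, 0, 0, 1, 1])
print(int(batch.batch[-1] - batch.batch[0] + 1))  # 2 graphs, not 5 nodes
```

This counts graphs directly from `batch.batch`, independent of how `y` happens to be shaped for a given dataset.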
+ for epoch in range(1, args.epochs + 1): + for step, batch in enumerate(loader): batch = batch.to(device) - + if args.use3d: - - if hasattr(batch, "z"): - z = batch.z - else: - z = batch.batch - - molecule_repr = model(z=z, pos=batch.pos, batch=batch.batch) + molecule_repr = model(z=batch.z, pos=batch.pos, batch=batch.batch) else: - molecule_repr = model(x=batch.x, batch=batch.batch, edge_index=batch.edge_index, batch_size=batch_size(batch)) + molecule_repr = model( + x=batch.x.type(torch.float), + batch=batch.batch, + edge_index=batch.edge_index, + batch_size=batch_size(batch), + ) + molecule_repr = global_max_pool(molecule_repr, batch.batch) pred = molecule_repr.squeeze() - # Dimenet : pred: torch.Size([ 16, 19]) - # PNA : pred: torch.Size([292, 19]) <= (with x=batch.x) WTF !? 292 = batch.x.shape[0] - # batch : torch.Size([ 16, 19]) - # print(molecule_repr.shape) - # print(batch.y.shape) - B = pred.size()[0] y = batch.y.view(B, -1) # normalize @@ -192,7 +185,8 @@ def batch_size(x): lr_scheduler.step() - print("Epoch: {}\nLoss: {}".format(epoch)) + if loader.is_done(): + break if __name__ == "__main__": diff --git a/benchmarks/geo_gnn/pcqm4m_subset.py b/benchmarks/geo_gnn/pcqm4m_subset.py index 615aea2bb..2d6e0e2bd 100644 --- a/benchmarks/geo_gnn/pcqm4m_subset.py +++ b/benchmarks/geo_gnn/pcqm4m_subset.py @@ -35,6 +35,7 @@ def __init__( "smiles": str, "pos": dict(dtype=torch.float32, size=(-1, 3)), "y": float, + "z": dict(dtype=torch.long, size=(-1,)), } self.from_smiles = from_smiles or _from_smiles @@ -49,12 +50,10 @@ def raw_file_names(self): ] def download(self): - print(self.raw_paths) if all(os.path.exists(path) for path in self.raw_paths): return # Download 2d graphs - print(self.raw_dir) super().download() # Download 3D coordinates @@ -78,6 +77,9 @@ def process(self) -> None: data.pos = torch.tensor( extra.GetConformer().GetPositions(), dtype=torch.float ) + data.z = torch.tensor( + [atom.GetAtomicNum() for atom in extra.GetAtoms()], dtype=torch.long + ) data_list.append(data) if ( @@ -104,4 +106,5 @@ def std(self): def serialize(self, data: BaseData) -> Dict[str, Any]: rval = super().serialize(data) rval["pos"] = data.pos + rval["z"] = data.z return rval diff --git a/benchmarks/geo_gnn/prepare.py b/benchmarks/geo_gnn/prepare.py index 2b352f8ce..b3ac374b0 100755 --- a/benchmarks/geo_gnn/prepare.py +++ b/benchmarks/geo_gnn/prepare.py @@ -12,7 +12,7 @@ def parser(): "--num-samples", type=int, help="Number of samples to process in the dataset", - default=10000, + default=100000, ) parser.add_argument( "--root", @@ -26,7 +26,4 @@ def parser(): if __name__ == "__main__": args, _ = parser().parse_known_args() - # TODO: Handle argument for the number of samples - train_dataset = QM9(args.root) - # dataset = PCQM4Mv2Subset(args.num_samples, root=args.root) - + dataset = PCQM4Mv2Subset(args.num_samples, root=args.root) diff --git a/benchmarks/geo_gnn/requirements-pre.cuda.txt b/benchmarks/geo_gnn/requirements-pre.cuda.txt index 396cdd441..0ec4d88dd 100644 --- a/benchmarks/geo_gnn/requirements-pre.cuda.txt +++ b/benchmarks/geo_gnn/requirements-pre.cuda.txt @@ -2,104 +2,161 @@ # This file is autogenerated by pip-compile with Python 3.10 # by the following command: # -# pip-compile --output-file=benchmarks/geo_gnn/requirements-pre.cuda.txt .pin/tmp-constraints-cuda-geo_gnn.txt benchmarks/geo_gnn/requirements-pre.in +# pip-compile --output-file=benchmarks/geo_gnn/requirements-pre.cuda.txt .pin/tmp-constraints-cuda-dimenet.txt benchmarks/geo_gnn/requirements-pre.in # --extra-index-url 
https://pypi.ngc.nvidia.com --extra-index-url https://download.pytorch.org/whl/cu121 ---find-links https://data.pyg.org/whl/torch-2.3.0+cu121.html +--find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html +--find-links https://data.pyg.org/whl/torch-2.4.0+cu121.html --trusted-host pypi.ngc.nvidia.com filelock==3.16.0 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # torch # triton -fsspec==2024.9.0 +fsspec==2024.6.1 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # torch +jax[cuda12]==0.4.31 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # -r .pin/../constraints/extra/torch.cuda.txt +jax-cuda12-pjrt==0.4.31 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # jax-cuda12-plugin +jax-cuda12-plugin[with-cuda]==0.4.31 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # jax +jaxlib==0.4.31 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # jax jinja2==3.1.4 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # torch markupsafe==2.1.5 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # jinja2 +ml-dtypes==0.5.0 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # jax + # jaxlib mpmath==1.3.0 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # sympy networkx==3.3 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # torch +numpy==1.26.4 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # jax + # jaxlib + # ml-dtypes + # opt-einsum + # scipy + # xformers nvidia-cublas-cu12==12.1.3.1 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt + # jax-cuda12-plugin # nvidia-cudnn-cu12 # nvidia-cusolver-cu12 # torch nvidia-cuda-cupti-cu12==12.1.105 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt + # jax-cuda12-plugin # torch +nvidia-cuda-nvcc-cu12==12.6.68 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # jax-cuda12-plugin nvidia-cuda-nvrtc-cu12==12.1.105 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # torch nvidia-cuda-runtime-cu12==12.1.105 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt + # jax-cuda12-plugin # torch -nvidia-cudnn-cu12==8.9.2.26 +nvidia-cudnn-cu12==9.1.0.70 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt + # jax-cuda12-plugin # torch nvidia-cufft-cu12==11.0.2.54 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt + # jax-cuda12-plugin # torch nvidia-curand-cu12==10.3.2.106 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # torch nvidia-cusolver-cu12==11.4.5.107 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt + # jax-cuda12-plugin # torch nvidia-cusparse-cu12==12.1.0.106 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt + # jax-cuda12-plugin # nvidia-cusolver-cu12 # torch nvidia-nccl-cu12==2.20.5 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt + # jax-cuda12-plugin # torch nvidia-nvjitlink-cu12==12.6.68 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt 
+ # -c .pin/../.pin/constraints-cuda-torch.txt + # jax-cuda12-plugin # nvidia-cusolver-cu12 # nvidia-cusparse-cu12 nvidia-nvtx-cu12==12.1.105 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # torch +opt-einsum==3.3.0 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # jax +scipy==1.14.1 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # jax + # jaxlib sympy==1.13.2 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # torch -torch==2.3.1+cu121 +torch==2.4.0+cu121 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt - # -r .pin/../constraints/extra/gnn.cuda.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/geo_gnn/requirements-pre.in -triton==2.3.1 + # xformers +triton==3.0.0 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # torch typing-extensions==4.12.2 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # torch +xformers==0.0.27.post2 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # -r .pin/../constraints/extra/torch.cuda.txt diff --git a/benchmarks/geo_gnn/requirements.cuda.txt b/benchmarks/geo_gnn/requirements.cuda.txt index 5bf4a0707..88e329e6d 100644 --- a/benchmarks/geo_gnn/requirements.cuda.txt +++ b/benchmarks/geo_gnn/requirements.cuda.txt @@ -2,339 +2,385 @@ # This file is autogenerated by pip-compile with Python 3.10 # by the following command: # -# pip-compile --output-file=benchmarks/geo_gnn/requirements.cuda.txt .pin/tmp-constraints-cuda-geo_gnn.txt benchmarks/geo_gnn/requirements-pre.cuda.txt benchmarks/geo_gnn/requirements.in +# pip-compile --output-file=benchmarks/geo_gnn/requirements.cuda.txt .pin/tmp-constraints-cuda-dimenet.txt benchmarks/geo_gnn/requirements-pre.cuda.txt benchmarks/geo_gnn/requirements.in # --extra-index-url https://pypi.ngc.nvidia.com --extra-index-url https://download.pytorch.org/whl/cu121 ---find-links https://data.pyg.org/whl/torch-2.3.0+cu121.html +--find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html +--find-links https://data.pyg.org/whl/torch-2.4.0+cu121.html --trusted-host pypi.ngc.nvidia.com aiohappyeyeballs==2.4.0 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # aiohttp aiohttp==3.10.5 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # torch-geometric aiosignal==1.3.1 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # aiohttp antlr4-python3-runtime==4.9.3 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # omegaconf asttokens==2.4.1 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # giving async-timeout==4.0.3 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # aiohttp attrs==24.2.0 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # aiohttp certifi==2024.8.30 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # requests charset-normalizer==3.3.2 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # requests codefind==0.1.7 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # ptera -executing==1.2.0 
+executing==2.1.0 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # varname filelock==3.16.0 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/geo_gnn/requirements-pre.cuda.txt # torch # triton frozenlist==1.4.1 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # aiohttp # aiosignal -fsspec==2024.9.0 +fsspec==2024.6.1 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/geo_gnn/requirements-pre.cuda.txt # torch # torch-geometric -giving==0.4.2 +giving==0.4.3 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # ptera # voir -idna==3.8 +idna==3.10 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # requests # yarl +jax[cuda12]==0.4.31 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # -r .pin/../constraints/extra/torch.cuda.txt + # -r benchmarks/geo_gnn/requirements-pre.cuda.txt +jax-cuda12-pjrt==0.4.31 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # -r benchmarks/geo_gnn/requirements-pre.cuda.txt + # jax-cuda12-plugin +jax-cuda12-plugin[with-cuda]==0.4.31 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # -r benchmarks/geo_gnn/requirements-pre.cuda.txt + # jax +jaxlib==0.4.31 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # -r benchmarks/geo_gnn/requirements-pre.cuda.txt + # jax jinja2==3.1.4 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/geo_gnn/requirements-pre.cuda.txt # torch # torch-geometric -joblib==1.4.2 - # via - # -c .pin/../.pin/constraints-cuda-gnn.txt - # scikit-learn markdown-it-py==3.0.0 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # rich markupsafe==2.1.5 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/geo_gnn/requirements-pre.cuda.txt # jinja2 mdurl==0.1.2 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # markdown-it-py +ml-dtypes==0.5.0 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # -r benchmarks/geo_gnn/requirements-pre.cuda.txt + # jax + # jaxlib mpmath==1.3.0 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/geo_gnn/requirements-pre.cuda.txt # sympy -multidict==6.0.5 +multidict==6.1.0 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # aiohttp # yarl networkx==3.3 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/geo_gnn/requirements-pre.cuda.txt # torch numpy==1.26.4 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt + # -r benchmarks/geo_gnn/requirements-pre.cuda.txt # -r benchmarks/geo_gnn/requirements.in + # jax + # jaxlib + # ml-dtypes + # opt-einsum # pandas # rdkit - # scikit-learn # scipy # torch-geometric + # xformers nvidia-cublas-cu12==12.1.3.1 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/geo_gnn/requirements-pre.cuda.txt + # jax-cuda12-plugin # nvidia-cudnn-cu12 # nvidia-cusolver-cu12 # torch nvidia-cuda-cupti-cu12==12.1.105 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # 
-c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/geo_gnn/requirements-pre.cuda.txt + # jax-cuda12-plugin # torch +nvidia-cuda-nvcc-cu12==12.6.68 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # -r benchmarks/geo_gnn/requirements-pre.cuda.txt + # jax-cuda12-plugin nvidia-cuda-nvrtc-cu12==12.1.105 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/geo_gnn/requirements-pre.cuda.txt # torch nvidia-cuda-runtime-cu12==12.1.105 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/geo_gnn/requirements-pre.cuda.txt + # jax-cuda12-plugin # torch -nvidia-cudnn-cu12==8.9.2.26 +nvidia-cudnn-cu12==9.1.0.70 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/geo_gnn/requirements-pre.cuda.txt + # jax-cuda12-plugin # torch nvidia-cufft-cu12==11.0.2.54 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/geo_gnn/requirements-pre.cuda.txt + # jax-cuda12-plugin # torch nvidia-curand-cu12==10.3.2.106 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/geo_gnn/requirements-pre.cuda.txt # torch nvidia-cusolver-cu12==11.4.5.107 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/geo_gnn/requirements-pre.cuda.txt + # jax-cuda12-plugin # torch nvidia-cusparse-cu12==12.1.0.106 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/geo_gnn/requirements-pre.cuda.txt + # jax-cuda12-plugin # nvidia-cusolver-cu12 # torch nvidia-ml-py==12.560.30 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # voir nvidia-nccl-cu12==2.20.5 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/geo_gnn/requirements-pre.cuda.txt + # jax-cuda12-plugin # torch nvidia-nvjitlink-cu12==12.6.68 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/geo_gnn/requirements-pre.cuda.txt + # jax-cuda12-plugin # nvidia-cusolver-cu12 # nvidia-cusparse-cu12 nvidia-nvtx-cu12==12.1.105 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/geo_gnn/requirements-pre.cuda.txt # torch omegaconf==2.3.0 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # voir +opt-einsum==3.3.0 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # -r benchmarks/geo_gnn/requirements-pre.cuda.txt + # jax ovld==0.3.9 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # voir pandas==2.2.2 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/geo_gnn/requirements.in pillow==10.4.0 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # rdkit psutil==5.9.8 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # torch-geometric # voir ptera==1.4.1 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # voir pygments==2.18.0 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # rich 
pyparsing==3.1.4 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # torch-geometric python-dateutil==2.9.0.post0 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # pandas -pytz==2024.1 +pytz==2024.2 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # pandas pyyaml==6.0.2 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # omegaconf rdkit==2024.3.5 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/geo_gnn/requirements.in reactivex==4.0.4 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # giving requests==2.32.3 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # torch-geometric -rich==13.8.0 +rich==13.8.1 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # voir -scikit-learn==1.5.1 - # via - # -c .pin/../.pin/constraints-cuda-gnn.txt - # torch-geometric scipy==1.14.1 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt - # scikit-learn + # -c .pin/../.pin/constraints-cuda-torch.txt + # -r benchmarks/geo_gnn/requirements-pre.cuda.txt + # jax + # jaxlib # torch-cluster - # torch-geometric # torch-sparse six==1.16.0 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # asttokens # python-dateutil sympy==1.13.2 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/geo_gnn/requirements-pre.cuda.txt # torch -threadpoolctl==3.5.0 - # via - # -c .pin/../.pin/constraints-cuda-gnn.txt - # scikit-learn -torch==2.3.1+cu121 +torch==2.4.0+cu121 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt - # -r .pin/../constraints/extra/gnn.cuda.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/geo_gnn/requirements-pre.cuda.txt -torch-cluster==1.6.3+pt23cu121 + # xformers +torch-cluster==1.6.3+pt24cu121 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/geo_gnn/requirements.in -torch-geometric==2.5.3 +torch-geometric==2.6.0 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/geo_gnn/requirements.in -torch-scatter==2.1.2+pt23cu121 +torch-scatter==2.1.2+pt24cu121 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/geo_gnn/requirements.in -torch-sparse==0.6.18+pt23cu121 +torch-sparse==0.6.18+pt24cu121 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/geo_gnn/requirements.in tqdm==4.66.5 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # torch-geometric -triton==2.3.1 +triton==3.0.0 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/geo_gnn/requirements-pre.cuda.txt # torch typing-extensions==4.12.2 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/geo_gnn/requirements-pre.cuda.txt + # multidict # reactivex # torch tzdata==2024.1 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # pandas -urllib3==2.2.2 +urllib3==2.2.3 # via - # -c 
.pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # requests -varname==0.10.0 +varname==0.13.3 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # giving voir==0.2.19 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # -c .pin/../constraints/cuda.txt # -r benchmarks/geo_gnn/requirements.in -yarl==1.11.0 +xformers==0.0.27.post2 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # -r .pin/../constraints/extra/torch.cuda.txt + # -r benchmarks/geo_gnn/requirements-pre.cuda.txt +yarl==1.11.1 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # aiohttp diff --git a/benchmarks/huggingface/requirements.cuda.txt b/benchmarks/huggingface/requirements.cuda.txt index 22dd9bd40..d4bcacca7 100644 --- a/benchmarks/huggingface/requirements.cuda.txt +++ b/benchmarks/huggingface/requirements.cuda.txt @@ -7,6 +7,7 @@ --extra-index-url https://pypi.ngc.nvidia.com --extra-index-url https://download.pytorch.org/whl/cu121 --find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html +--find-links https://data.pyg.org/whl/torch-2.4.0+cu121.html --trusted-host pypi.ngc.nvidia.com antlr4-python3-runtime==4.9.3 @@ -29,7 +30,7 @@ codefind==0.1.7 # via # -c .pin/../.pin/constraints-cuda-torch.txt # ptera -executing==1.2.0 +executing==2.1.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # varname @@ -45,17 +46,17 @@ fsspec==2024.6.1 # -c .pin/../.pin/constraints-cuda-torch.txt # huggingface-hub # torch -giving==0.4.2 +giving==0.4.3 # via # -c .pin/../.pin/constraints-cuda-torch.txt # ptera # voir -huggingface-hub==0.24.6 +huggingface-hub==0.24.7 # via # -c .pin/../.pin/constraints-cuda-torch.txt # tokenizers # transformers -idna==3.8 +idna==3.10 # via # -c .pin/../.pin/constraints-cuda-torch.txt # requests @@ -91,7 +92,7 @@ mdurl==0.1.2 # via # -c .pin/../.pin/constraints-cuda-torch.txt # markdown-it-py -ml-dtypes==0.4.0 +ml-dtypes==0.5.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # jax @@ -226,7 +227,7 @@ reactivex==4.0.4 # via # -c .pin/../.pin/constraints-cuda-torch.txt # giving -regex==2024.7.24 +regex==2024.9.11 # via # -c .pin/../.pin/constraints-cuda-torch.txt # transformers @@ -235,7 +236,7 @@ requests==2.32.3 # -c .pin/../.pin/constraints-cuda-torch.txt # huggingface-hub # transformers -rich==13.8.0 +rich==13.8.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt # voir @@ -284,11 +285,11 @@ typing-extensions==4.12.2 # huggingface-hub # reactivex # torch -urllib3==2.2.2 +urllib3==2.2.3 # via # -c .pin/../.pin/constraints-cuda-torch.txt # requests -varname==0.10.0 +varname==0.13.3 # via # -c .pin/../.pin/constraints-cuda-torch.txt # giving diff --git a/benchmarks/huggingface/tunableop_results0.csv b/benchmarks/huggingface/tunableop_results0.csv deleted file mode 100644 index 6a38d561a..000000000 --- a/benchmarks/huggingface/tunableop_results0.csv +++ /dev/null @@ -1,17 +0,0 @@ -Validator,PT_VERSION,2.4.0 -Validator,ROCBLAS_VERSION,4.0.0-88df9726-dirty -Validator,HIPBLASLT_VERSION,0.6.0-592518e7 -Validator,ROCM_VERSION,6.0.0.0-91-08e5094 -Validator,GCN_ARCH_NAME,gfx942:sramecc+:xnack- -GemmTunableOp_float_NT,nt_768_3072_16384,Gemm_Rocblas_69720,0.751226 -GemmTunableOp_float_NT,nt_3072_768_16384,Gemm_Rocblas_69733,0.684042 -GemmTunableOp_float_NT,nt_768_768_16384,Gemm_Hipblaslt_NT_28806,0.264226 -GemmTunableOp_float_NT,nt_768_30522_16384,Gemm_Hipblaslt_NT_27808,5.73919 
-GemmTunableOp_float_NN,nn_768_16384_3072,Gemm_Hipblaslt_NN_33293,0.701076 -GemmTunableOp_float_NN,nn_768_16384_768,Gemm_Hipblaslt_NN_33685,0.209309 -GemmTunableOp_float_NN,nn_3072_16384_768,Gemm_Hipblaslt_NN_33225,0.69655 -GemmTunableOp_float_NN,nn_768_16384_30522,Gemm_Hipblaslt_NN_33924,5.81957 -GemmTunableOp_float_TN,tn_30522_16384_768,Default,6.06459 -GemmTunableOp_float_TN,tn_768_16384_3072,Gemm_Hipblaslt_TN_34830,0.584625 -GemmTunableOp_float_TN,tn_3072_16384_768,Gemm_Rocblas_69037,0.742789 -GemmTunableOp_float_TN,tn_768_16384_768,Gemm_Rocblas_69047,0.211827 diff --git a/benchmarks/lightning/requirements.cuda.txt b/benchmarks/lightning/requirements.cuda.txt index 2cb0780c3..d6823c252 100644 --- a/benchmarks/lightning/requirements.cuda.txt +++ b/benchmarks/lightning/requirements.cuda.txt @@ -7,6 +7,7 @@ --extra-index-url https://pypi.ngc.nvidia.com --extra-index-url https://download.pytorch.org/whl/cu121 --find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html +--find-links https://data.pyg.org/whl/torch-2.4.0+cu121.html --trusted-host pypi.ngc.nvidia.com aiohappyeyeballs==2.4.0 @@ -41,7 +42,7 @@ codefind==0.1.7 # via # -c .pin/../.pin/constraints-cuda-torch.txt # ptera -executing==1.2.0 +executing==2.1.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # varname @@ -61,12 +62,12 @@ fsspec[http]==2024.6.1 # lightning # pytorch-lightning # torch -giving==0.4.2 +giving==0.4.3 # via # -c .pin/../.pin/constraints-cuda-torch.txt # ptera # voir -idna==3.8 +idna==3.10 # via # -c .pin/../.pin/constraints-cuda-torch.txt # yarl @@ -116,7 +117,7 @@ mdurl==0.1.2 # via # -c .pin/../.pin/constraints-cuda-torch.txt # markdown-it-py -ml-dtypes==0.4.0 +ml-dtypes==0.5.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # jax @@ -125,7 +126,7 @@ mpmath==1.3.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # sympy -multidict==6.0.5 +multidict==6.1.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # aiohttp @@ -263,7 +264,7 @@ reactivex==4.0.4 # via # -c .pin/../.pin/constraints-cuda-torch.txt # giving -rich==13.8.0 +rich==13.8.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt # voir @@ -294,7 +295,7 @@ torchcompat==1.1.4 # -c .pin/../.pin/constraints-cuda-torch.txt # -c .pin/../constraints/cuda.txt # -r benchmarks/lightning/requirements.in -torchmetrics==1.4.1 +torchmetrics==1.4.2 # via # -c .pin/../.pin/constraints-cuda-torch.txt # lightning @@ -317,10 +318,11 @@ typing-extensions==4.12.2 # -c .pin/../.pin/constraints-cuda-torch.txt # lightning # lightning-utilities + # multidict # pytorch-lightning # reactivex # torch -varname==0.10.0 +varname==0.13.3 # via # -c .pin/../.pin/constraints-cuda-torch.txt # giving @@ -333,7 +335,7 @@ xformers==0.0.27.post2 # via # -c .pin/../.pin/constraints-cuda-torch.txt # -r .pin/../constraints/extra/torch.cuda.txt -yarl==1.11.0 +yarl==1.11.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt # aiohttp diff --git a/benchmarks/llama/requirements.cuda.txt b/benchmarks/llama/requirements.cuda.txt index a9a5f3e7a..7d972b40f 100644 --- a/benchmarks/llama/requirements.cuda.txt +++ b/benchmarks/llama/requirements.cuda.txt @@ -7,6 +7,7 @@ --extra-index-url https://pypi.ngc.nvidia.com --extra-index-url https://download.pytorch.org/whl/cu121 --find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html +--find-links https://data.pyg.org/whl/torch-2.4.0+cu121.html --trusted-host pypi.ngc.nvidia.com aiohappyeyeballs==2.4.0 @@ -50,7 +51,7 @@ codefind==0.1.7 # via # -c .pin/../.pin/constraints-cuda-torch.txt # ptera 
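Note: benchmarks/huggingface/tunableop_results0.csv, deleted above, is a PyTorch TunableOp artifact. When GEMM autotuning is enabled, PyTorch records the best kernel it finds for each matmul shape in a per-GPU CSV (GPU ordinal 0 gives the "0" suffix). The entries are specific to one machine and driver stack (note the gfx942 / rocBLAS / hipBLASLt validator rows), so the file is a local tuning cache rather than source, and removing it from version control makes sense. A minimal sketch of how such a file is produced, assuming a PyTorch build with TunableOp support; the shapes are illustrative:

    import os

    # TunableOp is typically configured via environment variables that are
    # read when torch initializes, so set them before the import.
    os.environ["PYTORCH_TUNABLEOP_ENABLED"] = "1"   # use tuned GEMM kernels
    os.environ["PYTORCH_TUNABLEOP_TUNING"] = "1"    # allow tuning of new shapes
    os.environ["PYTORCH_TUNABLEOP_FILENAME"] = "tunableop_results.csv"

    import torch

    a = torch.randn(768, 16384, device="cuda")
    b = torch.randn(16384, 3072, device="cuda")
    c = a @ b  # the first matmul of a given shape triggers tuning; the winning
               # kernel is appended to the CSV and reused on later runs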
-datasets==2.21.0 +datasets==3.0.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/llama/requirements.in @@ -59,7 +60,7 @@ dill==0.3.8 # -c .pin/../.pin/constraints-cuda-torch.txt # datasets # multiprocess -executing==1.2.0 +executing==2.1.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # varname @@ -90,18 +91,18 @@ fsspec[http]==2024.6.1 # datasets # huggingface-hub # torch -giving==0.4.2 +giving==0.4.3 # via # -c .pin/../.pin/constraints-cuda-torch.txt # ptera # voir -huggingface-hub==0.24.6 +huggingface-hub==0.24.7 # via # -c .pin/../.pin/constraints-cuda-torch.txt # datasets # tokenizers # transformers -idna==3.8 +idna==3.10 # via # -c .pin/../.pin/constraints-cuda-torch.txt # requests @@ -138,7 +139,7 @@ mdurl==0.1.2 # via # -c .pin/../.pin/constraints-cuda-torch.txt # markdown-it-py -ml-dtypes==0.4.0 +ml-dtypes==0.5.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # jax @@ -147,7 +148,7 @@ mpmath==1.3.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # sympy -multidict==6.0.5 +multidict==6.1.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # aiohttp @@ -285,7 +286,7 @@ python-dateutil==2.9.0.post0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # pandas -pytz==2024.1 +pytz==2024.2 # via # -c .pin/../.pin/constraints-cuda-torch.txt # pandas @@ -300,7 +301,7 @@ reactivex==4.0.4 # via # -c .pin/../.pin/constraints-cuda-torch.txt # giving -regex==2024.7.24 +regex==2024.9.11 # via # -c .pin/../.pin/constraints-cuda-torch.txt # transformers @@ -310,7 +311,7 @@ requests==2.32.3 # datasets # huggingface-hub # transformers -rich==13.8.0 +rich==13.8.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt # voir @@ -369,17 +370,18 @@ typing-extensions==4.12.2 # via # -c .pin/../.pin/constraints-cuda-torch.txt # huggingface-hub + # multidict # reactivex # torch tzdata==2024.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt # pandas -urllib3==2.2.2 +urllib3==2.2.3 # via # -c .pin/../.pin/constraints-cuda-torch.txt # requests -varname==0.10.0 +varname==0.13.3 # via # -c .pin/../.pin/constraints-cuda-torch.txt # giving @@ -396,7 +398,7 @@ xxhash==3.5.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # datasets -yarl==1.11.0 +yarl==1.11.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt # aiohttp diff --git a/benchmarks/llava/main.py b/benchmarks/llava/main.py index 94aec57b6..879baca01 100755 --- a/benchmarks/llava/main.py +++ b/benchmarks/llava/main.py @@ -103,6 +103,11 @@ def batch_size_fn(batch): inputs = processor( text=prompt, images=image, return_tensors="pt", padding=True ) + + labels = inputs["input_ids"].clone() + labels[labels == processor.tokenizer.pad_token_id] = -100 + inputs["labels"] = labels + inputs = { k: v.to( accelerator.device, @@ -111,8 +116,6 @@ def batch_size_fn(batch): for k, v in inputs.items() } - inputs["labels"] = inputs["input_ids"] - outputs = model(**inputs) loss = outputs.loss diff --git a/benchmarks/llava/requirements.cuda.txt b/benchmarks/llava/requirements.cuda.txt index bb2638766..02cc24fbc 100644 --- a/benchmarks/llava/requirements.cuda.txt +++ b/benchmarks/llava/requirements.cuda.txt @@ -7,6 +7,7 @@ --extra-index-url https://pypi.ngc.nvidia.com --extra-index-url https://download.pytorch.org/whl/cu121 --find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html +--find-links https://data.pyg.org/whl/torch-2.4.0+cu121.html --trusted-host pypi.ngc.nvidia.com accelerate==0.34.2 @@ -54,7 +55,7 @@ codefind==0.1.7 # via # -c .pin/../.pin/constraints-cuda-torch.txt # ptera -datasets==2.21.0 +datasets==3.0.0 
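Note: the benchmarks/llava/main.py hunk above is a correctness fix to the loss, not just a reorder. Labels used to be the raw input_ids, so with padded batches the model was also trained to predict pad tokens; the new code clones input_ids and writes -100 into the padded positions, which is the ignore_index Hugging Face models use for cross-entropy. Building labels before the device-transfer comprehension also means they are moved to the accelerator together with the other tensors. The same masking pattern in isolation (tensor values are illustrative):

    import torch
    import torch.nn.functional as F

    pad_token_id = 0
    input_ids = torch.tensor([[11, 12, 13, pad_token_id, pad_token_id]])

    labels = input_ids.clone()
    labels[labels == pad_token_id] = -100  # masked positions contribute no loss

    logits = torch.randn(1, 5, 32)  # stand-in model output: (batch, seq, vocab)
    loss = F.cross_entropy(
        logits.view(-1, logits.size(-1)),  # (batch*seq, vocab)
        labels.view(-1),                   # (batch*seq,)
        ignore_index=-100,
    )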
# via # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/llava/requirements.in @@ -63,7 +64,7 @@ dill==0.3.8 # -c .pin/../.pin/constraints-cuda-torch.txt # datasets # multiprocess -executing==1.2.0 +executing==2.1.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # varname @@ -86,19 +87,19 @@ fsspec[http]==2024.6.1 # datasets # huggingface-hub # torch -giving==0.4.2 +giving==0.4.3 # via # -c .pin/../.pin/constraints-cuda-torch.txt # ptera # voir -huggingface-hub==0.24.6 +huggingface-hub==0.24.7 # via # -c .pin/../.pin/constraints-cuda-torch.txt # accelerate # datasets # tokenizers # transformers -idna==3.8 +idna==3.10 # via # -c .pin/../.pin/constraints-cuda-torch.txt # requests @@ -135,7 +136,7 @@ mdurl==0.1.2 # via # -c .pin/../.pin/constraints-cuda-torch.txt # markdown-it-py -ml-dtypes==0.4.0 +ml-dtypes==0.5.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # jax @@ -144,7 +145,7 @@ mpmath==1.3.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # sympy -multidict==6.0.5 +multidict==6.1.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # aiohttp @@ -289,7 +290,7 @@ python-dateutil==2.9.0.post0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # pandas -pytz==2024.1 +pytz==2024.2 # via # -c .pin/../.pin/constraints-cuda-torch.txt # pandas @@ -305,7 +306,7 @@ reactivex==4.0.4 # via # -c .pin/../.pin/constraints-cuda-torch.txt # giving -regex==2024.7.24 +regex==2024.9.11 # via # -c .pin/../.pin/constraints-cuda-torch.txt # transformers @@ -315,7 +316,7 @@ requests==2.32.3 # datasets # huggingface-hub # transformers -rich==13.8.0 +rich==13.8.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt # voir @@ -366,17 +367,18 @@ typing-extensions==4.12.2 # via # -c .pin/../.pin/constraints-cuda-torch.txt # huggingface-hub + # multidict # reactivex # torch tzdata==2024.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt # pandas -urllib3==2.2.2 +urllib3==2.2.3 # via # -c .pin/../.pin/constraints-cuda-torch.txt # requests -varname==0.10.0 +varname==0.13.3 # via # -c .pin/../.pin/constraints-cuda-torch.txt # giving @@ -393,7 +395,7 @@ xxhash==3.5.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # datasets -yarl==1.11.0 +yarl==1.11.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt # aiohttp diff --git a/benchmarks/llm/requirements.cuda.txt b/benchmarks/llm/requirements.cuda.txt index 976e4eafd..0e1e0010a 100644 --- a/benchmarks/llm/requirements.cuda.txt +++ b/benchmarks/llm/requirements.cuda.txt @@ -7,6 +7,7 @@ --extra-index-url https://pypi.ngc.nvidia.com --extra-index-url https://download.pytorch.org/whl/cu121 --find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html +--find-links https://data.pyg.org/whl/torch-2.4.0+cu121.html --trusted-host pypi.ngc.nvidia.com accelerate==0.34.2 @@ -63,7 +64,7 @@ codefind==0.1.7 # via # -c .pin/../.pin/constraints-cuda-torch.txt # ptera -datasets==2.21.0 +datasets==3.0.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # torchtune @@ -72,7 +73,7 @@ dill==0.3.8 # -c .pin/../.pin/constraints-cuda-torch.txt # datasets # multiprocess -executing==1.2.0 +executing==2.1.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # varname @@ -105,7 +106,7 @@ fsspec[http]==2024.6.1 # datasets # huggingface-hub # torch -giving==0.4.2 +giving==0.4.3 # via # -c .pin/../.pin/constraints-cuda-torch.txt # ptera @@ -114,7 +115,7 @@ hjson==3.1.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # argklass -huggingface-hub==0.24.6 +huggingface-hub==0.24.7 # via # -c .pin/../.pin/constraints-cuda-torch.txt # accelerate @@ -122,7 +123,7 
@@ huggingface-hub==0.24.6 # tokenizers # torchtune # transformers -idna==3.8 +idna==3.10 # via # -c .pin/../.pin/constraints-cuda-torch.txt # requests @@ -167,7 +168,7 @@ mdurl==0.1.2 # via # -c .pin/../.pin/constraints-cuda-torch.txt # markdown-it-py -ml-dtypes==0.4.0 +ml-dtypes==0.5.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # jax @@ -176,7 +177,7 @@ mpmath==1.3.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # sympy -multidict==6.0.5 +multidict==6.1.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # aiohttp @@ -323,7 +324,7 @@ python-dateutil==2.9.0.post0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # pandas -pytz==2024.1 +pytz==2024.2 # via # -c .pin/../.pin/constraints-cuda-torch.txt # pandas @@ -340,7 +341,7 @@ reactivex==4.0.4 # via # -c .pin/../.pin/constraints-cuda-torch.txt # giving -regex==2024.7.24 +regex==2024.9.11 # via # -c .pin/../.pin/constraints-cuda-torch.txt # tiktoken @@ -352,7 +353,7 @@ requests==2.32.3 # huggingface-hub # tiktoken # transformers -rich==13.8.0 +rich==13.8.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt # voir @@ -428,18 +429,19 @@ typing-extensions==4.12.2 # via # -c .pin/../.pin/constraints-cuda-torch.txt # huggingface-hub + # multidict # reactivex # torch tzdata==2024.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt # pandas -urllib3==2.2.2 +urllib3==2.2.3 # via # -c .pin/../.pin/constraints-cuda-torch.txt # blobfile # requests -varname==0.10.0 +varname==0.13.3 # via # -c .pin/../.pin/constraints-cuda-torch.txt # giving @@ -456,7 +458,7 @@ xxhash==3.5.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # datasets -yarl==1.11.0 +yarl==1.11.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt # aiohttp diff --git a/benchmarks/llm/tune b/benchmarks/llm/tune deleted file mode 160000 index a83eeff00..000000000 --- a/benchmarks/llm/tune +++ /dev/null @@ -1 +0,0 @@ -Subproject commit a83eeff0079a73ee04a11e8fc2573ed8f671b231 diff --git a/benchmarks/purejaxrl/dqn.py b/benchmarks/purejaxrl/dqn.py index 16fa55f52..17c839147 100644 --- a/benchmarks/purejaxrl/dqn.py +++ b/benchmarks/purejaxrl/dqn.py @@ -11,8 +11,8 @@ import optax import flax.linen as nn from flax.training.train_state import TrainState -from gymnax.wrappers.purerl import FlattenObservationWrapper, LogWrapper import gymnax +from gymnax.wrappers.purerl import FlattenObservationWrapper, LogWrapper import flashbax as fbx from benchmate.metrics import give_push diff --git a/benchmarks/purejaxrl/requirements.cuda.txt b/benchmarks/purejaxrl/requirements.cuda.txt index aa28e8cfe..a59468762 100644 --- a/benchmarks/purejaxrl/requirements.cuda.txt +++ b/benchmarks/purejaxrl/requirements.cuda.txt @@ -6,10 +6,13 @@ # --extra-index-url https://pypi.ngc.nvidia.com --extra-index-url https://download.pytorch.org/whl/cu121 +--find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html +--find-links https://data.pyg.org/whl/torch-2.4.0+cu121.html --trusted-host pypi.ngc.nvidia.com absl-py==2.1.0 # via + # -c .pin/../.pin/constraints-cuda-torch.txt # brax # chex # distrax @@ -22,27 +25,45 @@ absl-py==2.1.0 # rlax # tensorflow-probability antlr4-python3-runtime==4.9.3 - # via omegaconf + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # omegaconf argklass==1.4.4 - # via -r benchmarks/purejaxrl/requirements.in + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # -r benchmarks/purejaxrl/requirements.in astroid==3.2.4 - # via pylint + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # pylint asttokens==2.4.1 - # via giving + # via + # -c 
.pin/../.pin/constraints-cuda-torch.txt + # giving black==24.8.0 - # via navix + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # navix blinker==1.8.2 - # via flask + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # flask brax==0.10.5 - # via -r benchmarks/purejaxrl/requirements.in + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # -r benchmarks/purejaxrl/requirements.in certifi==2024.8.30 # via + # -c .pin/../.pin/constraints-cuda-torch.txt # requests # sentry-sdk charset-normalizer==3.3.2 - # via requests + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # requests chex==0.1.86 # via + # -c .pin/../.pin/constraints-cuda-torch.txt # distrax # evosax # flashbax @@ -51,75 +72,116 @@ chex==0.1.86 # rlax click==8.1.7 # via + # -c .pin/../.pin/constraints-cuda-torch.txt # black # flask # wandb cloudpickle==3.0.0 # via + # -c .pin/../.pin/constraints-cuda-torch.txt # gym # gymnasium # tensorflow-probability codefind==0.1.7 - # via ptera + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # ptera contextlib2==21.6.0 - # via ml-collections + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # ml-collections contourpy==1.3.0 - # via matplotlib + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # matplotlib cycler==0.12.1 - # via matplotlib + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # matplotlib decorator==5.1.1 - # via tensorflow-probability + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # tensorflow-probability dill==0.3.8 - # via pylint + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # pylint distrax==0.1.5 # via + # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/purejaxrl/requirements.in # rlax dm-env==1.6 # via + # -c .pin/../.pin/constraints-cuda-torch.txt # brax # rlax dm-tree==0.1.8 # via + # -c .pin/../.pin/constraints-cuda-torch.txt # dm-env # tensorflow-probability docker-pycreds==0.4.0 - # via wandb + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # wandb docstring-parser==0.16 - # via tyro + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # tyro dotmap==1.3.30 - # via evosax + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # evosax etils[epath,epy]==1.9.4 # via + # -c .pin/../.pin/constraints-cuda-torch.txt # brax # mujoco # mujoco-mjx # optax # orbax-checkpoint evosax==0.1.6 - # via -r benchmarks/purejaxrl/requirements.in + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # -r benchmarks/purejaxrl/requirements.in exceptiongroup==1.2.2 - # via pytest -executing==1.2.0 - # via varname + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # pytest +executing==2.1.0 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # varname farama-notifications==0.0.4 - # via gymnasium + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # gymnasium filelock==3.16.0 # via + # -c .pin/../.pin/constraints-cuda-torch.txt # torch # triton flake8==7.1.1 - # via navix + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # navix flashbax==0.1.2 - # via -r benchmarks/purejaxrl/requirements.in + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # -r benchmarks/purejaxrl/requirements.in flask==3.0.3 # via + # -c .pin/../.pin/constraints-cuda-torch.txt # brax # flask-cors flask-cors==5.0.0 - # via brax + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # brax flax==0.9.0 # via + # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/purejaxrl/requirements.in # brax # evosax @@ -127,53 +189,90 @@ flax==0.9.0 # gymnax # navix fonttools==4.53.1 - # via matplotlib 
-fsspec==2024.9.0 # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # matplotlib +fsspec==2024.6.1 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt # etils # torch gast==0.6.0 - # via tensorflow-probability + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # tensorflow-probability gitdb==4.0.11 - # via gitpython + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # gitpython gitpython==3.1.43 - # via wandb -giving==0.4.2 # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # wandb +giving==0.4.3 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt # ptera # voir glfw==2.7.0 - # via mujoco + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # mujoco grpcio==1.66.1 - # via brax + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # brax gym==0.26.2 # via + # -c .pin/../.pin/constraints-cuda-torch.txt # brax # gymnax gym-notices==0.0.8 - # via gym + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # gym gymnasium==0.29.1 - # via gymnax + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # gymnax gymnax==0.0.8 - # via -r benchmarks/purejaxrl/requirements.in + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # -c .pin/../constraints/cuda.txt + # -r benchmarks/purejaxrl/requirements.in hjson==3.1.0 - # via argklass + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # argklass humanize==4.10.0 - # via orbax-checkpoint -idna==3.8 - # via requests + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # orbax-checkpoint +idna==3.10 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # requests importlib-resources==6.4.5 # via + # -c .pin/../.pin/constraints-cuda-torch.txt # argklass # etils iniconfig==2.0.0 - # via pytest + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # pytest isort==5.13.2 - # via pylint + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # pylint itsdangerous==2.2.0 - # via flask -jax==0.4.31 # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # flask +jax[cuda12]==0.4.31 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # -r .pin/../constraints/extra/torch.cuda.txt # -r benchmarks/purejaxrl/requirements.in # brax # chex @@ -187,8 +286,17 @@ jax==0.4.31 # optax # orbax-checkpoint # rlax +jax-cuda12-pjrt==0.4.31 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # jax-cuda12-plugin +jax-cuda12-plugin[with-cuda]==0.4.31 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # jax jaxlib==0.4.31 # via + # -c .pin/../.pin/constraints-cuda-torch.txt # brax # chex # distrax @@ -202,60 +310,90 @@ jaxlib==0.4.31 # orbax-checkpoint # rlax jaxopt==0.8.3 - # via brax + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # brax jinja2==3.1.4 # via + # -c .pin/../.pin/constraints-cuda-torch.txt # brax # flask # torch kiwisolver==1.4.7 - # via matplotlib + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # matplotlib markdown-it-py==3.0.0 - # via rich + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # rich markupsafe==2.1.5 # via + # -c .pin/../.pin/constraints-cuda-torch.txt # jinja2 # werkzeug matplotlib==3.9.2 # via + # -c .pin/../.pin/constraints-cuda-torch.txt # evosax # gymnax # seaborn mccabe==0.7.0 # via + # -c .pin/../.pin/constraints-cuda-torch.txt # flake8 # pylint mdurl==0.1.2 - # via markdown-it-py + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # markdown-it-py ml-collections==0.1.1 - # via brax -ml-dtypes==0.4.0 # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # brax +ml-dtypes==0.5.0 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt # jax # 
jaxlib # tensorstore mpmath==1.3.0 - # via sympy -msgpack==1.0.8 # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # sympy +msgpack==1.1.0 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt # flax # orbax-checkpoint mujoco==3.2.2 # via + # -c .pin/../.pin/constraints-cuda-torch.txt # brax # mujoco-mjx mujoco-mjx==3.2.2 - # via brax + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # brax mypy-extensions==1.0.0 - # via black + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # black navix==0.7.0 - # via -r benchmarks/purejaxrl/requirements.in + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # -r benchmarks/purejaxrl/requirements.in nest-asyncio==1.6.0 - # via orbax-checkpoint + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # orbax-checkpoint networkx==3.3 - # via torch -numpy==2.1.1 # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # torch +numpy==1.26.4 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/purejaxrl/requirements.in # brax # chex @@ -284,112 +422,190 @@ numpy==2.1.1 # tensorflow-probability # tensorstore # trimesh + # xformers nvidia-cublas-cu12==12.1.3.1 # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # jax-cuda12-plugin # nvidia-cudnn-cu12 # nvidia-cusolver-cu12 # torch nvidia-cuda-cupti-cu12==12.1.105 - # via torch + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # jax-cuda12-plugin + # torch +nvidia-cuda-nvcc-cu12==12.6.68 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # jax-cuda12-plugin nvidia-cuda-nvrtc-cu12==12.1.105 - # via torch + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # torch nvidia-cuda-runtime-cu12==12.1.105 - # via torch + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # jax-cuda12-plugin + # torch nvidia-cudnn-cu12==9.1.0.70 - # via torch + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # jax-cuda12-plugin + # torch nvidia-cufft-cu12==11.0.2.54 - # via torch + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # jax-cuda12-plugin + # torch nvidia-curand-cu12==10.3.2.106 - # via torch + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # torch nvidia-cusolver-cu12==11.4.5.107 - # via torch + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # jax-cuda12-plugin + # torch nvidia-cusparse-cu12==12.1.0.106 # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # jax-cuda12-plugin # nvidia-cusolver-cu12 # torch nvidia-ml-py==12.560.30 - # via voir + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # voir nvidia-nccl-cu12==2.20.5 - # via torch + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # jax-cuda12-plugin + # torch nvidia-nvjitlink-cu12==12.6.68 # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # jax-cuda12-plugin # nvidia-cusolver-cu12 # nvidia-cusparse-cu12 nvidia-nvtx-cu12==12.1.105 - # via torch + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # torch omegaconf==2.3.0 - # via voir + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # voir opt-einsum==3.3.0 - # via jax + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # jax optax==0.2.3 # via + # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/purejaxrl/requirements.in # brax # flax -orbax-checkpoint==0.6.1 +orbax-checkpoint==0.6.3 # via + # -c .pin/../.pin/constraints-cuda-torch.txt # brax # flax ovld==0.3.9 - # via voir + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # voir packaging==24.1 # via + # -c .pin/../.pin/constraints-cuda-torch.txt # black # matplotlib # pytest # setuptools-scm # tensorboardx pandas==2.2.2 - 
# via seaborn + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # seaborn pathspec==0.12.1 - # via black + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # black pillow==10.4.0 # via + # -c .pin/../.pin/constraints-cuda-torch.txt # brax # matplotlib # navix -platformdirs==4.3.2 +platformdirs==4.3.3 # via + # -c .pin/../.pin/constraints-cuda-torch.txt # black # pylint # wandb pluggy==1.5.0 - # via pytest -protobuf==5.28.0 # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # pytest +protobuf==5.28.1 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt # orbax-checkpoint # tensorboardx # wandb psutil==5.9.8 # via + # -c .pin/../.pin/constraints-cuda-torch.txt # voir # wandb ptera==1.4.1 - # via voir + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # voir pycodestyle==2.12.1 - # via flake8 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # flake8 pyflakes==3.2.0 - # via flake8 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # flake8 pygments==2.18.0 - # via rich + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # rich pylint==3.2.7 - # via navix + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # navix pyopengl==3.1.7 - # via mujoco + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # mujoco pyparsing==3.1.4 - # via matplotlib -pytest==8.3.2 - # via navix + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # matplotlib +pytest==8.3.3 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # navix python-dateutil==2.9.0.post0 # via + # -c .pin/../.pin/constraints-cuda-torch.txt # matplotlib # pandas pytinyrenderer==0.0.14 - # via brax -pytz==2024.1 - # via pandas + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # brax +pytz==2024.2 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # pandas pyyaml==6.0.2 # via + # -c .pin/../.pin/constraints-cuda-torch.txt # evosax # flax # gymnax @@ -398,73 +614,113 @@ pyyaml==6.0.2 # orbax-checkpoint # wandb reactivex==4.0.4 - # via giving + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # giving requests==2.32.3 - # via wandb -rich==13.8.0 # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # wandb +rich==13.8.1 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt # flax # tyro # voir rlax==0.1.6 - # via navix + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # navix scipy==1.14.1 # via + # -c .pin/../.pin/constraints-cuda-torch.txt # brax # jax # jaxlib # jaxopt # mujoco-mjx seaborn==0.13.2 - # via gymnax + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # gymnax sentry-sdk==2.14.0 - # via wandb + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # wandb setproctitle==1.3.3 - # via wandb + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # wandb setuptools-scm==8.1.0 - # via navix + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # navix shtab==1.7.1 - # via tyro + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # tyro six==1.16.0 # via + # -c .pin/../.pin/constraints-cuda-torch.txt # asttokens # docker-pycreds # ml-collections # python-dateutil # tensorflow-probability smmap==5.0.1 - # via gitdb + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # gitdb sympy==1.13.2 - # via torch + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # torch tensorboardx==2.6.2.2 - # via brax + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # brax tensorflow-probability==0.24.0 - # via distrax + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # distrax tensorstore==0.1.65 # via + # -c 
.pin/../.pin/constraints-cuda-torch.txt # flashbax # flax # orbax-checkpoint tomli==2.0.1 # via + # -c .pin/../.pin/constraints-cuda-torch.txt # black # pylint # pytest # setuptools-scm tomlkit==0.13.2 - # via pylint + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # pylint toolz==0.12.1 - # via chex -torch==2.4.1+cu121 - # via -r benchmarks/purejaxrl/requirements.in + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # chex +torch==2.4.0+cu121 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # -r benchmarks/purejaxrl/requirements.in + # xformers trimesh==4.4.9 # via + # -c .pin/../.pin/constraints-cuda-torch.txt # brax # mujoco-mjx triton==3.0.0 - # via torch + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # torch typing-extensions==4.12.2 # via + # -c .pin/../.pin/constraints-cuda-torch.txt # astroid # black # brax @@ -479,25 +735,43 @@ typing-extensions==4.12.2 # torch # tyro tyro==0.8.10 - # via navix + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # navix tzdata==2024.1 - # via pandas -urllib3==2.2.2 # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # pandas +urllib3==2.2.3 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt # requests # sentry-sdk -varname==0.10.0 - # via giving +varname==0.13.3 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # giving voir==0.2.19 # via + # -c .pin/../.pin/constraints-cuda-torch.txt # -c .pin/../constraints/cuda.txt # -r benchmarks/purejaxrl/requirements.in -wandb==0.17.9 - # via navix +wandb==0.18.0 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # navix werkzeug==3.0.4 - # via flask -zipp==3.20.1 - # via etils + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # flask +xformers==0.0.27.post2 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # -r .pin/../constraints/extra/torch.cuda.txt +zipp==3.20.2 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # etils # The following packages are considered to be unsafe in a requirements file: # setuptools diff --git a/benchmarks/recursiongfn/requirements.cuda.txt b/benchmarks/recursiongfn/requirements.cuda.txt index b586dd4c3..89c02624f 100644 --- a/benchmarks/recursiongfn/requirements.cuda.txt +++ b/benchmarks/recursiongfn/requirements.cuda.txt @@ -2,198 +2,223 @@ # This file is autogenerated by pip-compile with Python 3.10 # by the following command: # -# pip-compile --output-file=benchmarks/recursiongfn/requirements.cuda.txt .pin/tmp-constraints-cuda-recursiongfn_gnn.txt benchmarks/recursiongfn/requirements.in +# pip-compile --output-file=benchmarks/recursiongfn/requirements.cuda.txt .pin/tmp-constraints-cuda-recursiongfn.txt benchmarks/recursiongfn/requirements.in # --extra-index-url https://pypi.ngc.nvidia.com --extra-index-url https://download.pytorch.org/whl/cu121 ---find-links https://data.pyg.org/whl/torch-2.3.0+cu121.html +--find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html +--find-links https://data.pyg.org/whl/torch-2.4.0+cu121.html --trusted-host pypi.ngc.nvidia.com absl-py==2.1.0 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # tensorboard aiohappyeyeballs==2.4.0 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # aiohttp aiohttp==3.10.5 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # torch-geometric aiosignal==1.3.1 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # aiohttp 
antlr4-python3-runtime==4.9.3 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # omegaconf asttokens==2.4.1 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # giving async-timeout==4.0.3 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # aiohttp attrs==24.2.0 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # aiohttp blosc2==2.7.1 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # tables botorch==0.11.3 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/recursiongfn/requirements.in certifi==2024.8.30 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # requests # sentry-sdk charset-normalizer==3.3.2 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # requests click==8.1.7 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # wandb codefind==0.1.7 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # ptera cvxopt==1.3.2 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/recursiongfn/requirements.in docker-pycreds==0.4.0 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # wandb -executing==1.2.0 +executing==2.1.0 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # varname filelock==3.16.0 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # torch # triton frozenlist==1.4.1 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # aiohttp # aiosignal -fsspec==2024.9.0 +fsspec==2024.6.1 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # torch # torch-geometric gitdb==4.0.11 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # gitpython gitpython==3.1.43 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/recursiongfn/requirements.in # wandb -giving==0.4.2 +giving==0.4.3 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # ptera # voir gpytorch==1.12 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/recursiongfn/requirements.in # botorch grpcio==1.66.1 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # tensorboard -idna==3.8 +idna==3.10 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # requests # yarl +jax[cuda12]==0.4.31 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # -r .pin/../constraints/extra/torch.cuda.txt +jax-cuda12-pjrt==0.4.31 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # jax-cuda12-plugin +jax-cuda12-plugin[with-cuda]==0.4.31 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # jax +jaxlib==0.4.31 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # jax jaxtyping==0.2.34 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c 
.pin/../.pin/constraints-cuda-torch.txt # linear-operator jinja2==3.1.4 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # torch # torch-geometric joblib==1.4.2 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # scikit-learn linear-operator==0.5.2 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # botorch # gpytorch markdown==3.7 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # tensorboard markdown-it-py==3.0.0 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # rich markupsafe==2.1.5 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # jinja2 # werkzeug mdurl==0.1.2 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # markdown-it-py +ml-dtypes==0.5.0 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # jax + # jaxlib mpmath==1.3.0 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # botorch # gpytorch # sympy -msgpack==1.0.8 +msgpack==1.1.0 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # blosc2 -multidict==6.0.5 +multidict==6.1.0 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # aiohttp # yarl multipledispatch==1.0.0 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # botorch ndindex==1.8 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # blosc2 networkx==3.3 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/recursiongfn/requirements.in # torch numexpr==2.10.1 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # blosc2 # tables numpy==1.26.4 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # blosc2 # botorch + # jax + # jaxlib + # ml-dtypes # numexpr # opt-einsum # pandas @@ -205,291 +230,311 @@ numpy==1.26.4 # tables # tensorboard # torch-geometric + # xformers nvidia-cublas-cu12==12.1.3.1 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt + # jax-cuda12-plugin # nvidia-cudnn-cu12 # nvidia-cusolver-cu12 # torch nvidia-cuda-cupti-cu12==12.1.105 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt + # jax-cuda12-plugin # torch +nvidia-cuda-nvcc-cu12==12.6.68 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # jax-cuda12-plugin nvidia-cuda-nvrtc-cu12==12.1.105 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # torch nvidia-cuda-runtime-cu12==12.1.105 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt + # jax-cuda12-plugin # torch -nvidia-cudnn-cu12==8.9.2.26 +nvidia-cudnn-cu12==9.1.0.70 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt + # jax-cuda12-plugin # torch nvidia-cufft-cu12==11.0.2.54 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt + # jax-cuda12-plugin # torch nvidia-curand-cu12==10.3.2.106 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c 
.pin/../.pin/constraints-cuda-torch.txt # torch nvidia-cusolver-cu12==11.4.5.107 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt + # jax-cuda12-plugin # torch nvidia-cusparse-cu12==12.1.0.106 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt + # jax-cuda12-plugin # nvidia-cusolver-cu12 # torch nvidia-ml-py==12.560.30 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # voir nvidia-nccl-cu12==2.20.5 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt + # jax-cuda12-plugin # torch nvidia-nvjitlink-cu12==12.6.68 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt + # jax-cuda12-plugin # nvidia-cusolver-cu12 # nvidia-cusparse-cu12 nvidia-nvtx-cu12==12.1.105 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # torch omegaconf==2.3.0 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/recursiongfn/requirements.in # voir opt-einsum==3.3.0 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt + # jax # pyro-ppl ovld==0.3.9 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # voir packaging==24.1 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # tables # tensorboard pandas==2.2.2 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/recursiongfn/requirements.in pillow==10.4.0 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # rdkit -platformdirs==4.3.2 +platformdirs==4.3.3 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # wandb -protobuf==5.28.0 +protobuf==5.28.1 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # tensorboard # wandb psutil==5.9.8 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # torch-geometric # voir # wandb ptera==1.4.1 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # voir py-cpuinfo==9.0.0 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # blosc2 # tables pyarrow==17.0.0 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/recursiongfn/requirements.in pygments==2.18.0 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # rich pyparsing==3.1.4 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # torch-geometric pyro-api==0.1.2 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # pyro-ppl pyro-ppl==1.9.1 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/recursiongfn/requirements.in # botorch python-dateutil==2.9.0.post0 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # pandas -pytz==2024.1 +pytz==2024.2 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # pandas pyyaml==6.0.2 # via - # -c 
.pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # omegaconf # wandb rdkit==2024.3.5 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/recursiongfn/requirements.in reactivex==4.0.4 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # giving requests==2.32.3 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # torch-geometric # wandb -rich==13.8.0 +rich==13.8.1 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # voir -scikit-learn==1.5.1 +scikit-learn==1.5.2 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # gpytorch - # torch-geometric scipy==1.14.1 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/recursiongfn/requirements.in # botorch # gpytorch + # jax + # jaxlib # linear-operator # scikit-learn # torch-cluster - # torch-geometric # torch-sparse sentry-sdk==2.14.0 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # wandb setproctitle==1.3.3 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # wandb six==1.16.0 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # asttokens # docker-pycreds # python-dateutil # tensorboard smmap==5.0.1 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # gitdb sympy==1.13.2 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # torch tables==3.10.1 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/recursiongfn/requirements.in tensorboard==2.17.1 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/recursiongfn/requirements.in tensorboard-data-server==0.7.2 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # tensorboard threadpoolctl==3.5.0 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # scikit-learn -torch==2.3.1+cu121 +torch==2.4.0+cu121 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt - # -r .pin/../constraints/extra/gnn.cuda.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/recursiongfn/requirements.in # botorch # linear-operator # pyro-ppl -torch-cluster==1.6.3+pt23cu121 + # xformers +torch-cluster==1.6.3+pt24cu121 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/recursiongfn/requirements.in -torch-geometric==2.5.3 +torch-geometric==2.6.0 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/recursiongfn/requirements.in -torch-scatter==2.1.2+pt23cu121 +torch-scatter==2.1.2+pt24cu121 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/recursiongfn/requirements.in -torch-sparse==0.6.18+pt23cu121 +torch-sparse==0.6.18+pt24cu121 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/recursiongfn/requirements.in tqdm==4.66.5 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c 
.pin/../.pin/constraints-cuda-torch.txt # pyro-ppl # torch-geometric -triton==2.3.1 +triton==3.0.0 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # torch typeguard==2.13.3 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # jaxtyping # linear-operator typing-extensions==4.12.2 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt + # multidict # reactivex # tables # torch tzdata==2024.1 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # pandas -urllib3==2.2.2 +urllib3==2.2.3 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # requests # sentry-sdk -varname==0.10.0 +varname==0.13.3 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # giving voir==0.2.19 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # -c .pin/../constraints/cuda.txt # -r benchmarks/recursiongfn/requirements.in -wandb==0.17.9 +wandb==0.18.0 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/recursiongfn/requirements.in werkzeug==3.0.4 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # tensorboard -yarl==1.11.0 +xformers==0.0.27.post2 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # -r .pin/../constraints/extra/torch.cuda.txt +yarl==1.11.1 # via - # -c .pin/../.pin/constraints-cuda-gnn.txt + # -c .pin/../.pin/constraints-cuda-torch.txt # aiohttp # The following packages are considered to be unsafe in a requirements file: diff --git a/benchmarks/rwkv/README.md b/benchmarks/retired/rwkv/README.md similarity index 100% rename from benchmarks/rwkv/README.md rename to benchmarks/retired/rwkv/README.md diff --git a/benchmarks/rwkv/benchfile.py b/benchmarks/retired/rwkv/benchfile.py similarity index 100% rename from benchmarks/rwkv/benchfile.py rename to benchmarks/retired/rwkv/benchfile.py diff --git a/benchmarks/rwkv/prepare.py b/benchmarks/retired/rwkv/prepare.py similarity index 100% rename from benchmarks/rwkv/prepare.py rename to benchmarks/retired/rwkv/prepare.py diff --git a/benchmarks/rwkv/requirements.cuda.txt b/benchmarks/retired/rwkv/requirements.cuda.txt similarity index 100% rename from benchmarks/rwkv/requirements.cuda.txt rename to benchmarks/retired/rwkv/requirements.cuda.txt diff --git a/benchmarks/rwkv/requirements.hpu.txt b/benchmarks/retired/rwkv/requirements.hpu.txt similarity index 100% rename from benchmarks/rwkv/requirements.hpu.txt rename to benchmarks/retired/rwkv/requirements.hpu.txt diff --git a/benchmarks/rwkv/requirements.in b/benchmarks/retired/rwkv/requirements.in similarity index 100% rename from benchmarks/rwkv/requirements.in rename to benchmarks/retired/rwkv/requirements.in diff --git a/benchmarks/rwkv/requirements.rocm.txt b/benchmarks/retired/rwkv/requirements.rocm.txt similarity index 100% rename from benchmarks/rwkv/requirements.rocm.txt rename to benchmarks/retired/rwkv/requirements.rocm.txt diff --git a/benchmarks/rwkv/requirements.xpu.txt b/benchmarks/retired/rwkv/requirements.xpu.txt similarity index 100% rename from benchmarks/rwkv/requirements.xpu.txt rename to benchmarks/retired/rwkv/requirements.xpu.txt diff --git a/benchmarks/rwkv/rwkv-v4neo/20B_tokenizer.json b/benchmarks/retired/rwkv/rwkv-v4neo/20B_tokenizer.json similarity index 
100% rename from benchmarks/rwkv/rwkv-v4neo/20B_tokenizer.json rename to benchmarks/retired/rwkv/rwkv-v4neo/20B_tokenizer.json diff --git a/benchmarks/rwkv/rwkv-v4neo/LICENSE b/benchmarks/retired/rwkv/rwkv-v4neo/LICENSE similarity index 100% rename from benchmarks/rwkv/rwkv-v4neo/LICENSE rename to benchmarks/retired/rwkv/rwkv-v4neo/LICENSE diff --git a/benchmarks/rwkv/rwkv-v4neo/ORIGIN.md b/benchmarks/retired/rwkv/rwkv-v4neo/ORIGIN.md similarity index 100% rename from benchmarks/rwkv/rwkv-v4neo/ORIGIN.md rename to benchmarks/retired/rwkv/rwkv-v4neo/ORIGIN.md diff --git a/benchmarks/rwkv/rwkv-v4neo/chat.py b/benchmarks/retired/rwkv/rwkv-v4neo/chat.py similarity index 100% rename from benchmarks/rwkv/rwkv-v4neo/chat.py rename to benchmarks/retired/rwkv/rwkv-v4neo/chat.py diff --git a/benchmarks/rwkv/rwkv-v4neo/cuda/wkv_cuda.cu b/benchmarks/retired/rwkv/rwkv-v4neo/cuda/wkv_cuda.cu similarity index 100% rename from benchmarks/rwkv/rwkv-v4neo/cuda/wkv_cuda.cu rename to benchmarks/retired/rwkv/rwkv-v4neo/cuda/wkv_cuda.cu diff --git a/benchmarks/rwkv/rwkv-v4neo/cuda/wkv_cuda_bf16.cu b/benchmarks/retired/rwkv/rwkv-v4neo/cuda/wkv_cuda_bf16.cu similarity index 100% rename from benchmarks/rwkv/rwkv-v4neo/cuda/wkv_cuda_bf16.cu rename to benchmarks/retired/rwkv/rwkv-v4neo/cuda/wkv_cuda_bf16.cu diff --git a/benchmarks/rwkv/rwkv-v4neo/cuda/wkv_op.cpp b/benchmarks/retired/rwkv/rwkv-v4neo/cuda/wkv_op.cpp similarity index 100% rename from benchmarks/rwkv/rwkv-v4neo/cuda/wkv_op.cpp rename to benchmarks/retired/rwkv/rwkv-v4neo/cuda/wkv_op.cpp diff --git a/benchmarks/rwkv/rwkv-v4neo/cuda/wkv_op_bf16.cpp b/benchmarks/retired/rwkv/rwkv-v4neo/cuda/wkv_op_bf16.cpp similarity index 100% rename from benchmarks/rwkv/rwkv-v4neo/cuda/wkv_op_bf16.cpp rename to benchmarks/retired/rwkv/rwkv-v4neo/cuda/wkv_op_bf16.cpp diff --git a/benchmarks/rwkv/rwkv-v4neo/img_demoAE.py b/benchmarks/retired/rwkv/rwkv-v4neo/img_demoAE.py similarity index 100% rename from benchmarks/rwkv/rwkv-v4neo/img_demoAE.py rename to benchmarks/retired/rwkv/rwkv-v4neo/img_demoAE.py diff --git a/benchmarks/rwkv/rwkv-v4neo/run.py b/benchmarks/retired/rwkv/rwkv-v4neo/run.py similarity index 100% rename from benchmarks/rwkv/rwkv-v4neo/run.py rename to benchmarks/retired/rwkv/rwkv-v4neo/run.py diff --git a/benchmarks/rwkv/rwkv-v4neo/src/__init__.py b/benchmarks/retired/rwkv/rwkv-v4neo/src/__init__.py similarity index 100% rename from benchmarks/rwkv/rwkv-v4neo/src/__init__.py rename to benchmarks/retired/rwkv/rwkv-v4neo/src/__init__.py diff --git a/benchmarks/rwkv/rwkv-v4neo/src/binidx.py b/benchmarks/retired/rwkv/rwkv-v4neo/src/binidx.py similarity index 100% rename from benchmarks/rwkv/rwkv-v4neo/src/binidx.py rename to benchmarks/retired/rwkv/rwkv-v4neo/src/binidx.py diff --git a/benchmarks/rwkv/rwkv-v4neo/src/dataset.py b/benchmarks/retired/rwkv/rwkv-v4neo/src/dataset.py similarity index 100% rename from benchmarks/rwkv/rwkv-v4neo/src/dataset.py rename to benchmarks/retired/rwkv/rwkv-v4neo/src/dataset.py diff --git a/benchmarks/rwkv/rwkv-v4neo/src/model.py b/benchmarks/retired/rwkv/rwkv-v4neo/src/model.py similarity index 100% rename from benchmarks/rwkv/rwkv-v4neo/src/model.py rename to benchmarks/retired/rwkv/rwkv-v4neo/src/model.py diff --git a/benchmarks/rwkv/rwkv-v4neo/src/model_img.py b/benchmarks/retired/rwkv/rwkv-v4neo/src/model_img.py similarity index 100% rename from benchmarks/rwkv/rwkv-v4neo/src/model_img.py rename to benchmarks/retired/rwkv/rwkv-v4neo/src/model_img.py diff --git a/benchmarks/rwkv/rwkv-v4neo/src/model_run.py 
b/benchmarks/retired/rwkv/rwkv-v4neo/src/model_run.py similarity index 100% rename from benchmarks/rwkv/rwkv-v4neo/src/model_run.py rename to benchmarks/retired/rwkv/rwkv-v4neo/src/model_run.py diff --git a/benchmarks/rwkv/rwkv-v4neo/src/trainer.py b/benchmarks/retired/rwkv/rwkv-v4neo/src/trainer.py similarity index 100% rename from benchmarks/rwkv/rwkv-v4neo/src/trainer.py rename to benchmarks/retired/rwkv/rwkv-v4neo/src/trainer.py diff --git a/benchmarks/rwkv/rwkv-v4neo/src/utils.py b/benchmarks/retired/rwkv/rwkv-v4neo/src/utils.py similarity index 100% rename from benchmarks/rwkv/rwkv-v4neo/src/utils.py rename to benchmarks/retired/rwkv/rwkv-v4neo/src/utils.py diff --git a/benchmarks/rwkv/rwkv-v4neo/train.py b/benchmarks/retired/rwkv/rwkv-v4neo/train.py similarity index 100% rename from benchmarks/rwkv/rwkv-v4neo/train.py rename to benchmarks/retired/rwkv/rwkv-v4neo/train.py diff --git a/benchmarks/rwkv/rwkv-v4neo/verify.py b/benchmarks/retired/rwkv/rwkv-v4neo/verify.py similarity index 100% rename from benchmarks/rwkv/rwkv-v4neo/verify.py rename to benchmarks/retired/rwkv/rwkv-v4neo/verify.py diff --git a/benchmarks/rwkv/voirfile.py b/benchmarks/retired/rwkv/voirfile.py similarity index 100% rename from benchmarks/rwkv/voirfile.py rename to benchmarks/retired/rwkv/voirfile.py diff --git a/benchmarks/super-slomo/README.md b/benchmarks/retired/super-slomo/README.md similarity index 100% rename from benchmarks/super-slomo/README.md rename to benchmarks/retired/super-slomo/README.md diff --git a/benchmarks/super-slomo/benchfile.py b/benchmarks/retired/super-slomo/benchfile.py similarity index 100% rename from benchmarks/super-slomo/benchfile.py rename to benchmarks/retired/super-slomo/benchfile.py diff --git a/benchmarks/super-slomo/prepare.py b/benchmarks/retired/super-slomo/prepare.py similarity index 100% rename from benchmarks/super-slomo/prepare.py rename to benchmarks/retired/super-slomo/prepare.py diff --git a/benchmarks/super-slomo/requirements.cuda.txt b/benchmarks/retired/super-slomo/requirements.cuda.txt similarity index 99% rename from benchmarks/super-slomo/requirements.cuda.txt rename to benchmarks/retired/super-slomo/requirements.cuda.txt index b2a5dc620..66ce02581 100644 --- a/benchmarks/super-slomo/requirements.cuda.txt +++ b/benchmarks/retired/super-slomo/requirements.cuda.txt @@ -205,7 +205,7 @@ reactivex==4.0.4 # via # -c .pin/../.pin/constraints-cuda-torch.txt # giving -rich==13.8.0 +rich==13.8.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt # voir diff --git a/benchmarks/super-slomo/requirements.hpu.txt b/benchmarks/retired/super-slomo/requirements.hpu.txt similarity index 100% rename from benchmarks/super-slomo/requirements.hpu.txt rename to benchmarks/retired/super-slomo/requirements.hpu.txt diff --git a/benchmarks/super-slomo/requirements.in b/benchmarks/retired/super-slomo/requirements.in similarity index 100% rename from benchmarks/super-slomo/requirements.in rename to benchmarks/retired/super-slomo/requirements.in diff --git a/benchmarks/super-slomo/requirements.rocm.txt b/benchmarks/retired/super-slomo/requirements.rocm.txt similarity index 100% rename from benchmarks/super-slomo/requirements.rocm.txt rename to benchmarks/retired/super-slomo/requirements.rocm.txt diff --git a/benchmarks/super-slomo/requirements.xpu.txt b/benchmarks/retired/super-slomo/requirements.xpu.txt similarity index 100% rename from benchmarks/super-slomo/requirements.xpu.txt rename to benchmarks/retired/super-slomo/requirements.xpu.txt diff --git 
a/benchmarks/super-slomo/slomo/LICENSE b/benchmarks/retired/super-slomo/slomo/LICENSE similarity index 100% rename from benchmarks/super-slomo/slomo/LICENSE rename to benchmarks/retired/super-slomo/slomo/LICENSE diff --git a/benchmarks/super-slomo/slomo/ORIGIN.md b/benchmarks/retired/super-slomo/slomo/ORIGIN.md similarity index 100% rename from benchmarks/super-slomo/slomo/ORIGIN.md rename to benchmarks/retired/super-slomo/slomo/ORIGIN.md diff --git a/benchmarks/super-slomo/slomo/README.md b/benchmarks/retired/super-slomo/slomo/README.md similarity index 100% rename from benchmarks/super-slomo/slomo/README.md rename to benchmarks/retired/super-slomo/slomo/README.md diff --git a/benchmarks/super-slomo/slomo/data/adobe240fps/test_list.txt b/benchmarks/retired/super-slomo/slomo/data/adobe240fps/test_list.txt similarity index 100% rename from benchmarks/super-slomo/slomo/data/adobe240fps/test_list.txt rename to benchmarks/retired/super-slomo/slomo/data/adobe240fps/test_list.txt diff --git a/benchmarks/super-slomo/slomo/data/adobe240fps/train_list.txt b/benchmarks/retired/super-slomo/slomo/data/adobe240fps/train_list.txt similarity index 100% rename from benchmarks/super-slomo/slomo/data/adobe240fps/train_list.txt rename to benchmarks/retired/super-slomo/slomo/data/adobe240fps/train_list.txt diff --git a/benchmarks/super-slomo/slomo/data/create_dataset.py b/benchmarks/retired/super-slomo/slomo/data/create_dataset.py similarity index 100% rename from benchmarks/super-slomo/slomo/data/create_dataset.py rename to benchmarks/retired/super-slomo/slomo/data/create_dataset.py diff --git a/benchmarks/super-slomo/slomo/dataloader.py b/benchmarks/retired/super-slomo/slomo/dataloader.py similarity index 100% rename from benchmarks/super-slomo/slomo/dataloader.py rename to benchmarks/retired/super-slomo/slomo/dataloader.py diff --git a/benchmarks/super-slomo/slomo/eval.py b/benchmarks/retired/super-slomo/slomo/eval.py similarity index 100% rename from benchmarks/super-slomo/slomo/eval.py rename to benchmarks/retired/super-slomo/slomo/eval.py diff --git a/benchmarks/super-slomo/slomo/model.py b/benchmarks/retired/super-slomo/slomo/model.py similarity index 100% rename from benchmarks/super-slomo/slomo/model.py rename to benchmarks/retired/super-slomo/slomo/model.py diff --git a/benchmarks/super-slomo/slomo/synth.py b/benchmarks/retired/super-slomo/slomo/synth.py similarity index 100% rename from benchmarks/super-slomo/slomo/synth.py rename to benchmarks/retired/super-slomo/slomo/synth.py diff --git a/benchmarks/super-slomo/slomo/train.py b/benchmarks/retired/super-slomo/slomo/train.py similarity index 100% rename from benchmarks/super-slomo/slomo/train.py rename to benchmarks/retired/super-slomo/slomo/train.py diff --git a/benchmarks/super-slomo/voirfile.py b/benchmarks/retired/super-slomo/voirfile.py similarity index 100% rename from benchmarks/super-slomo/voirfile.py rename to benchmarks/retired/super-slomo/voirfile.py diff --git a/benchmarks/rlhf/main.py b/benchmarks/rlhf/main.py index 5ddf459e4..0be12d282 100755 --- a/benchmarks/rlhf/main.py +++ b/benchmarks/rlhf/main.py @@ -17,8 +17,9 @@ class PPOv2TrainerIntrumented(PPOv2Trainer): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) + def __init__(self, config: PPOv2Config, *args, **kwargs): + config.report_to = [] + super().__init__(config, *args, **kwargs) def batch_size_fn(batch): x, y = batch['input_ids'].shape @@ -45,6 +46,7 @@ def save_model(self, *args, **kwargs): def main(): + parser = HfArgumentParser((PPOv2Config, 
ModelConfig)) config, model_config = parser.parse_args_into_dataclasses() # remove output_dir if exists diff --git a/benchmarks/rlhf/requirements.cuda.txt b/benchmarks/rlhf/requirements.cuda.txt index 764afb978..12a24c6c4 100644 --- a/benchmarks/rlhf/requirements.cuda.txt +++ b/benchmarks/rlhf/requirements.cuda.txt @@ -2,53 +2,81 @@ # This file is autogenerated by pip-compile with Python 3.10 # by the following command: # -# pip-compile --output-file=benchmarks/rlhf/requirements.cuda.txt .pin/tmp-constraints-cuda-rlhf.txt benchmarks/rlhf/requirements.in +# pip-compile --output-file=benchmarks/rlhf/requirements.cuda.txt .pin/tmp-constraints-cuda-rlhf-gpus.txt benchmarks/rlhf/requirements.in # --extra-index-url https://pypi.ngc.nvidia.com --extra-index-url https://download.pytorch.org/whl/cu121 --find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html +--find-links https://data.pyg.org/whl/torch-2.4.0+cu121.html --trusted-host pypi.ngc.nvidia.com accelerate==0.34.2 # via + # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/rlhf/requirements.in # trl aiohappyeyeballs==2.4.0 - # via aiohttp + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # aiohttp aiohttp==3.10.5 # via + # -c .pin/../.pin/constraints-cuda-torch.txt # datasets # fsspec aiosignal==1.3.1 - # via aiohttp + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # aiohttp antlr4-python3-runtime==4.9.3 - # via omegaconf + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # omegaconf asttokens==2.4.1 - # via giving + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # giving async-timeout==4.0.3 - # via aiohttp + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # aiohttp attrs==24.2.0 - # via aiohttp + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # aiohttp certifi==2024.8.30 - # via requests + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # requests charset-normalizer==3.3.2 - # via requests + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # requests codefind==0.1.7 - # via ptera -datasets==2.21.0 # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # ptera +datasets==3.0.0 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/rlhf/requirements.in # trl dill==0.3.8 # via + # -c .pin/../.pin/constraints-cuda-torch.txt # datasets # multiprocess docstring-parser==0.16 - # via tyro -executing==1.2.0 - # via varname + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # tyro +executing==2.1.0 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # varname filelock==3.16.0 # via + # -c .pin/../.pin/constraints-cuda-torch.txt # datasets # huggingface-hub # torch @@ -56,59 +84,89 @@ filelock==3.16.0 # triton frozenlist==1.4.1 # via + # -c .pin/../.pin/constraints-cuda-torch.txt # aiohttp # aiosignal fsspec[http]==2024.6.1 # via + # -c .pin/../.pin/constraints-cuda-torch.txt # datasets # huggingface-hub # torch -giving==0.4.2 +giving==0.4.3 # via + # -c .pin/../.pin/constraints-cuda-torch.txt # ptera # voir -huggingface-hub==0.24.6 +huggingface-hub==0.24.7 # via + # -c .pin/../.pin/constraints-cuda-torch.txt # accelerate # datasets # tokenizers # transformers -idna==3.8 +idna==3.10 # via + # -c .pin/../.pin/constraints-cuda-torch.txt # requests # yarl jax[cuda12]==0.4.31 - # via -r .pin/../constraints/extra/torch.cuda.txt + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # -r .pin/../constraints/extra/torch.cuda.txt jax-cuda12-pjrt==0.4.31 - # via jax-cuda12-plugin + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # 
jax-cuda12-plugin jax-cuda12-plugin[with-cuda]==0.4.31 - # via jax + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # jax jaxlib==0.4.31 - # via jax + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # jax jinja2==3.1.4 - # via torch + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # torch markdown-it-py==3.0.0 - # via rich + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # rich markupsafe==2.1.5 - # via jinja2 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # jinja2 mdurl==0.1.2 - # via markdown-it-py -ml-dtypes==0.4.0 # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # markdown-it-py +ml-dtypes==0.5.0 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt # jax # jaxlib mpmath==1.3.0 - # via sympy + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # sympy multidict==6.1.0 # via + # -c .pin/../.pin/constraints-cuda-torch.txt # aiohttp # yarl multiprocess==0.70.16 - # via datasets + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # datasets networkx==3.3 - # via torch -numpy==2.1.1 # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # torch +numpy==1.26.4 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt # accelerate # datasets # jax @@ -123,161 +181,239 @@ numpy==2.1.1 # xformers nvidia-cublas-cu12==12.1.3.1 # via + # -c .pin/../.pin/constraints-cuda-torch.txt # jax-cuda12-plugin # nvidia-cudnn-cu12 # nvidia-cusolver-cu12 # torch nvidia-cuda-cupti-cu12==12.1.105 # via + # -c .pin/../.pin/constraints-cuda-torch.txt # jax-cuda12-plugin # torch nvidia-cuda-nvcc-cu12==12.6.68 - # via jax-cuda12-plugin + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # jax-cuda12-plugin nvidia-cuda-nvrtc-cu12==12.1.105 - # via torch + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # torch nvidia-cuda-runtime-cu12==12.1.105 # via + # -c .pin/../.pin/constraints-cuda-torch.txt # jax-cuda12-plugin # torch nvidia-cudnn-cu12==9.1.0.70 # via + # -c .pin/../.pin/constraints-cuda-torch.txt # jax-cuda12-plugin # torch nvidia-cufft-cu12==11.0.2.54 # via + # -c .pin/../.pin/constraints-cuda-torch.txt # jax-cuda12-plugin # torch nvidia-curand-cu12==10.3.2.106 - # via torch + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # torch nvidia-cusolver-cu12==11.4.5.107 # via + # -c .pin/../.pin/constraints-cuda-torch.txt # jax-cuda12-plugin # torch nvidia-cusparse-cu12==12.1.0.106 # via + # -c .pin/../.pin/constraints-cuda-torch.txt # jax-cuda12-plugin # nvidia-cusolver-cu12 # torch nvidia-ml-py==12.560.30 - # via voir + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # voir nvidia-nccl-cu12==2.20.5 # via + # -c .pin/../.pin/constraints-cuda-torch.txt # jax-cuda12-plugin # torch nvidia-nvjitlink-cu12==12.6.68 # via + # -c .pin/../.pin/constraints-cuda-torch.txt # jax-cuda12-plugin # nvidia-cusolver-cu12 # nvidia-cusparse-cu12 nvidia-nvtx-cu12==12.1.105 - # via torch + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # torch omegaconf==2.3.0 - # via voir + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # voir opt-einsum==3.3.0 - # via jax + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # jax ovld==0.3.9 - # via voir + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # voir packaging==24.1 # via + # -c .pin/../.pin/constraints-cuda-torch.txt # accelerate # datasets # huggingface-hub # transformers pandas==2.2.2 - # via datasets + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # datasets psutil==5.9.8 # via + # -c .pin/../.pin/constraints-cuda-torch.txt # accelerate # voir ptera==1.4.1 - # via voir + # 
via + # -c .pin/../.pin/constraints-cuda-torch.txt + # voir pyarrow==17.0.0 - # via datasets + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # datasets pygments==2.18.0 - # via rich + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # rich python-dateutil==2.9.0.post0 - # via pandas -pytz==2024.1 - # via pandas + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # pandas +pytz==2024.2 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # pandas pyyaml==6.0.2 # via + # -c .pin/../.pin/constraints-cuda-torch.txt # accelerate # datasets # huggingface-hub # omegaconf # transformers reactivex==4.0.4 - # via giving -regex==2024.7.24 - # via transformers + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # giving +regex==2024.9.11 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # transformers requests==2.32.3 # via + # -c .pin/../.pin/constraints-cuda-torch.txt # datasets # huggingface-hub # transformers rich==13.8.1 # via + # -c .pin/../.pin/constraints-cuda-torch.txt # tyro # voir safetensors==0.4.5 # via + # -c .pin/../.pin/constraints-cuda-torch.txt # accelerate # transformers scipy==1.14.1 # via + # -c .pin/../.pin/constraints-cuda-torch.txt # jax # jaxlib shtab==1.7.1 - # via tyro + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # tyro six==1.16.0 # via + # -c .pin/../.pin/constraints-cuda-torch.txt # asttokens # python-dateutil sympy==1.13.2 - # via torch + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # torch tokenizers==0.19.1 - # via transformers + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # transformers torch==2.4.0+cu121 # via + # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/rlhf/requirements.in # accelerate # trl # xformers tqdm==4.66.5 # via + # -c .pin/../.pin/constraints-cuda-torch.txt # datasets # huggingface-hub # transformers transformers==4.44.2 # via + # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/rlhf/requirements.in # trl triton==3.0.0 - # via torch + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # torch trl==0.10.1 - # via -r benchmarks/rlhf/requirements.in + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # -r benchmarks/rlhf/requirements.in typing-extensions==4.12.2 # via + # -c .pin/../.pin/constraints-cuda-torch.txt # huggingface-hub # multidict # reactivex # torch # tyro tyro==0.8.10 - # via trl + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # trl tzdata==2024.1 - # via pandas -urllib3==2.2.2 - # via requests -varname==0.10.0 - # via giving + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # pandas +urllib3==2.2.3 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # requests +varname==0.13.3 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # giving voir==0.2.19 # via + # -c .pin/../.pin/constraints-cuda-torch.txt # -c .pin/../constraints/cuda.txt # -r benchmarks/rlhf/requirements.in xformers==0.0.27.post2 - # via -r .pin/../constraints/extra/torch.cuda.txt + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # -r .pin/../constraints/extra/torch.cuda.txt xxhash==3.5.0 - # via datasets + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # datasets yarl==1.11.1 - # via aiohttp + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # aiohttp diff --git a/benchmarks/timm/requirements.cuda.txt b/benchmarks/timm/requirements.cuda.txt index 98586aca3..4554f91ec 100644 --- a/benchmarks/timm/requirements.cuda.txt +++ b/benchmarks/timm/requirements.cuda.txt @@ -7,6 +7,7 @@ --extra-index-url https://pypi.ngc.nvidia.com 
--extra-index-url https://download.pytorch.org/whl/cu121 --find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html +--find-links https://data.pyg.org/whl/torch-2.4.0+cu121.html --trusted-host pypi.ngc.nvidia.com antlr4-python3-runtime==4.9.3 @@ -29,7 +30,7 @@ codefind==0.1.7 # via # -c .pin/../.pin/constraints-cuda-torch.txt # ptera -executing==1.2.0 +executing==2.1.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # varname @@ -44,16 +45,16 @@ fsspec==2024.6.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt # huggingface-hub # torch -giving==0.4.2 +giving==0.4.3 # via # -c .pin/../.pin/constraints-cuda-torch.txt # ptera # voir -huggingface-hub==0.24.6 +huggingface-hub==0.24.7 # via # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/timm/requirements.in -idna==3.8 +idna==3.10 # via # -c .pin/../.pin/constraints-cuda-torch.txt # requests @@ -89,7 +90,7 @@ mdurl==0.1.2 # via # -c .pin/../.pin/constraints-cuda-torch.txt # markdown-it-py -ml-dtypes==0.4.0 +ml-dtypes==0.5.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # jax @@ -227,7 +228,7 @@ requests==2.32.3 # via # -c .pin/../.pin/constraints-cuda-torch.txt # huggingface-hub -rich==13.8.0 +rich==13.8.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt # voir @@ -272,11 +273,11 @@ typing-extensions==4.12.2 # huggingface-hub # reactivex # torch -urllib3==2.2.2 +urllib3==2.2.3 # via # -c .pin/../.pin/constraints-cuda-torch.txt # requests -varname==0.10.0 +varname==0.13.3 # via # -c .pin/../.pin/constraints-cuda-torch.txt # giving diff --git a/benchmarks/torchatari/main.py b/benchmarks/torchatari/main.py index 62c9b3a07..bf5b7ef65 100755 --- a/benchmarks/torchatari/main.py +++ b/benchmarks/torchatari/main.py @@ -97,7 +97,12 @@ def reset(self, **kwargs): return observations def step(self, action): - observations, rewards, dones, infos = super().step(action) + # gym 0.26+ step() returns (obs, rewards, terminated, truncated, infos) + data = super().step(action) + + # FIXME: make sure this is valid + observations, rewards, terminated, truncated, infos = data + self.episode_returns += infos["reward"] self.episode_lengths += 1 self.returned_episode_returns[:] = self.episode_returns @@ -109,7 +114,7 @@ def step(self, action): return ( observations, rewards, - dones, + terminated, infos, ) @@ -211,7 +216,10 @@ def main(): # TRY NOT TO MODIFY: start the game global_step = 0 start_time = time.time() - next_obs = torch.Tensor(envs.reset()).to(device) + state, _ = envs.reset() + + # gym 0.26+ reset() returns (obs, info) + next_obs = torch.Tensor(state).to(device) next_done = torch.zeros(args.num_envs).to(device) iterations = range(1, args.num_iterations + 1) diff --git a/benchmarks/torchatari/requirements.cuda.txt b/benchmarks/torchatari/requirements.cuda.txt index b1a6d380b..2b0aa99d6 100644 --- a/benchmarks/torchatari/requirements.cuda.txt +++ b/benchmarks/torchatari/requirements.cuda.txt @@ -7,6 +7,7 @@ --extra-index-url https://pypi.ngc.nvidia.com --extra-index-url https://download.pytorch.org/whl/cu121 --find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html +--find-links https://data.pyg.org/whl/torch-2.4.0+cu121.html --trusted-host pypi.ngc.nvidia.com absl-py==2.1.0 @@ -55,7 +56,7 @@ envpool==0.8.4 # via # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/torchatari/requirements.in -executing==1.2.0 +executing==2.1.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # varname @@ -72,7 +73,7 @@ fsspec==2024.6.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt # torch -giving==0.4.2 +giving==0.4.3 # via # -c 
.pin/../.pin/constraints-cuda-torch.txt # ptera @@ -81,7 +82,7 @@ grpcio==1.66.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt # tensorboard -gym==0.23.1 +gym==0.26.2 # via # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/torchatari/requirements.in @@ -136,7 +137,7 @@ mdurl==0.1.2 # via # -c .pin/../.pin/constraints-cuda-torch.txt # markdown-it-py -ml-dtypes==0.4.0 +ml-dtypes==0.5.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # jax @@ -254,7 +255,7 @@ packaging==24.1 # -c .pin/../.pin/constraints-cuda-torch.txt # envpool # tensorboard -protobuf==5.28.0 +protobuf==5.28.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt # tensorboard @@ -278,7 +279,7 @@ reactivex==4.0.4 # via # -c .pin/../.pin/constraints-cuda-torch.txt # giving -rich==13.8.0 +rich==13.8.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt # tyro @@ -340,7 +341,7 @@ tyro==0.8.10 # via # -c .pin/../.pin/constraints-cuda-torch.txt # -r benchmarks/torchatari/requirements.in -varname==0.10.0 +varname==0.13.3 # via # -c .pin/../.pin/constraints-cuda-torch.txt # giving diff --git a/benchmarks/torchatari/requirements.in b/benchmarks/torchatari/requirements.in index c264f5563..59ca0358f 100644 --- a/benchmarks/torchatari/requirements.in +++ b/benchmarks/torchatari/requirements.in @@ -1,5 +1,5 @@ envpool -gym==0.23.1 +gym>=0.23.1 numpy torch tyro diff --git a/benchmarks/torchvision/requirements.cuda.txt b/benchmarks/torchvision/requirements.cuda.txt index dc0a16404..6b1a837f0 100644 --- a/benchmarks/torchvision/requirements.cuda.txt +++ b/benchmarks/torchvision/requirements.cuda.txt @@ -7,6 +7,7 @@ --extra-index-url https://pypi.ngc.nvidia.com --extra-index-url https://download.pytorch.org/whl/cu121 --find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html +--find-links https://data.pyg.org/whl/torch-2.4.0+cu121.html --trusted-host pypi.ngc.nvidia.com antlr4-python3-runtime==4.9.3 @@ -21,7 +22,7 @@ codefind==0.1.7 # via # -c .pin/../.pin/constraints-cuda-torch.txt # ptera -executing==1.2.0 +executing==2.1.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # varname @@ -34,7 +35,7 @@ fsspec==2024.6.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt # torch -giving==0.4.2 +giving==0.4.3 # via # -c .pin/../.pin/constraints-cuda-torch.txt # ptera @@ -75,7 +76,7 @@ mdurl==0.1.2 # via # -c .pin/../.pin/constraints-cuda-torch.txt # markdown-it-py -ml-dtypes==0.4.0 +ml-dtypes==0.5.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # jax @@ -203,7 +204,7 @@ reactivex==4.0.4 # via # -c .pin/../.pin/constraints-cuda-torch.txt # giving -rich==13.8.0 +rich==13.8.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt # voir @@ -248,7 +249,7 @@ typing-extensions==4.12.2 # -c .pin/../.pin/constraints-cuda-torch.txt # reactivex # torch -varname==0.10.0 +varname==0.13.3 # via # -c .pin/../.pin/constraints-cuda-torch.txt # giving diff --git a/benchmarks/torchvision_ddp/requirements.cuda.txt b/benchmarks/torchvision_ddp/requirements.cuda.txt index f2ec62699..28c6198b2 100644 --- a/benchmarks/torchvision_ddp/requirements.cuda.txt +++ b/benchmarks/torchvision_ddp/requirements.cuda.txt @@ -7,6 +7,7 @@ --extra-index-url https://pypi.ngc.nvidia.com --extra-index-url https://download.pytorch.org/whl/cu121 --find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html +--find-links https://data.pyg.org/whl/torch-2.4.0+cu121.html --trusted-host pypi.ngc.nvidia.com antlr4-python3-runtime==4.9.3 @@ -21,7 +22,7 @@ codefind==0.1.7 # via # -c .pin/../.pin/constraints-cuda-torch.txt # ptera 
-executing==1.2.0 +executing==2.1.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # varname @@ -34,7 +35,7 @@ fsspec==2024.6.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt # torch -giving==0.4.2 +giving==0.4.3 # via # -c .pin/../.pin/constraints-cuda-torch.txt # ptera @@ -75,7 +76,7 @@ mdurl==0.1.2 # via # -c .pin/../.pin/constraints-cuda-torch.txt # markdown-it-py -ml-dtypes==0.4.0 +ml-dtypes==0.5.0 # via # -c .pin/../.pin/constraints-cuda-torch.txt # jax @@ -203,7 +204,7 @@ reactivex==4.0.4 # via # -c .pin/../.pin/constraints-cuda-torch.txt # giving -rich==13.8.0 +rich==13.8.1 # via # -c .pin/../.pin/constraints-cuda-torch.txt # voir @@ -248,7 +249,7 @@ typing-extensions==4.12.2 # -c .pin/../.pin/constraints-cuda-torch.txt # reactivex # torch -varname==0.10.0 +varname==0.13.3 # via # -c .pin/../.pin/constraints-cuda-torch.txt # giving diff --git a/benchmarks/vjepa/Makefile b/benchmarks/vjepa/Makefile new file mode 100644 index 000000000..b701efd5e --- /dev/null +++ b/benchmarks/vjepa/Makefile @@ -0,0 +1,31 @@ +# Use global base if possible +ifndef MILABENCH_BASE + MILABENCH_BASE="base" +endif + +export MILABENCH_BASE + +BENCH_NAME=vjepa +MILABENCH_CONFIG=dev.yaml +MILABENCH_ARGS=--config $(MILABENCH_CONFIG) --base $(MILABENCH_BASE) + +all: install prepare single gpus nodes + +install: + milabench install $(MILABENCH_ARGS) --force + +prepare: + milabench prepare $(MILABENCH_ARGS) + +tests: install prepare + milabench run $(MILABENCH_ARGS) + +single: + milabench run $(MILABENCH_ARGS) --select $(BENCH_NAME)-single + +gpus: + milabench run $(MILABENCH_ARGS) --select $(BENCH_NAME)-gpus + +nodes: + milabench run $(MILABENCH_ARGS) --select $(BENCH_NAME)-nodes diff --git a/benchmarks/vjepa/README.md b/benchmarks/vjepa/README.md new file mode 100644 index 000000000..686d7a80e --- /dev/null +++ b/benchmarks/vjepa/README.md @@ -0,0 +1,4 @@ + +# Vjepa + +Benchmark based on Meta AI's V-JEPA (Video Joint-Embedding Predictive Architecture): self-supervised pretraining of a ViT-Huge video encoder/predictor on masked clips. `benchfile.py` clones the upstream code from https://github.com/facebookresearch/jepa and `prepare.py` generates a fake video dataset, so no download is required. diff --git a/benchmarks/vjepa/benchfile.py b/benchmarks/vjepa/benchfile.py new file mode 100644 index 000000000..d25b47b53 --- /dev/null +++ b/benchmarks/vjepa/benchfile.py @@ -0,0 +1,46 @@ +from milabench.pack import Package + + + +BRANCH = "3081b0ad7b9651373ccef40c1d46b62f46cb7146" +URL = "https://github.com/facebookresearch/jepa.git" + + +class Vjepa(Package): + # Requirements file installed by install(). It can be empty or absent. + base_requirements = "requirements.in" + + # The preparation script called by prepare(). It must be executable, + # but it can be any type of script. It can be empty or absent. + prepare_script = "prepare.py" + + # The main script called by run(). It must be a Python file. It has to + # be present. + main_script = "main.py" + + # You can remove the functions below if you don't need to modify them. + + def make_env(self): + # Return a dict of environment variables for prepare_script and + # main_script. 
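+        # For illustration only (hypothetical override, not used by this
+        # benchmark): extra variables could be merged into the inherited dict:
+        #     env = super().make_env()
+        #     env["OMP_NUM_THREADS"] = "8"
+        #     return env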
+ return super().make_env() + + async def install(self): + vjepa = self.dirs.code / "jepa" + if not vjepa.exists(): + vjepa.clone_subtree(URL, BRANCH) + + await super().install() # super() call installs the requirements + + async def prepare(self): + await super().prepare() # super() call executes prepare_script + + def build_run_plan(self): + from milabench.commands import TorchrunAllNodes + + # self.config is not the right config for this + plan = super().build_run_plan() + + return TorchrunAllNodes(plan).use_stdout() + +__pack__ = Vjepa diff --git a/benchmarks/vjepa/config/vith16.yaml b/benchmarks/vjepa/config/vith16.yaml new file mode 100644 index 000000000..d1d5461a5 --- /dev/null +++ b/benchmarks/vjepa/config/vith16.yaml @@ -0,0 +1,88 @@ +app: vjepa +nodes: 16 +tasks_per_node: 8 +data: + dataset_type: VideoDataset + datasets: + - /your_path_to_kinetics710_csv_file_index.csv + decode_one_clip: true + batch_size: 24 + num_clips: 1 + num_frames: 16 + tubelet_size: 2 + sampling_rate: 4 + crop_size: 224 + patch_size: 16 + pin_mem: true + num_workers: 12 + filter_short_videos: false + clip_duration: null +data_aug: + auto_augment: false + motion_shift: false + random_resize_aspect_ratio: + - 0.75 + - 1.35 + random_resize_scale: + - 0.3 + - 1.0 + reprob: 0.0 +logging: + folder: /your_absolute_file_path_for_saving_logs_and_checkpoints/ + write_tag: jepa +loss: + loss_exp: 1.0 + reg_coeff: 0.0 +mask: + - aspect_ratio: + - 0.75 + - 1.5 + num_blocks: 8 + spatial_scale: + - 0.15 + - 0.15 + temporal_scale: + - 1.0 + - 1.0 + max_temporal_keep: 1.0 + max_keep: null + - aspect_ratio: + - 0.75 + - 1.5 + num_blocks: 2 + spatial_scale: + - 0.7 + - 0.7 + temporal_scale: + - 1.0 + - 1.0 + max_temporal_keep: 1.0 + max_keep: null +meta: + load_checkpoint: false + read_checkpoint: null + seed: 234 + eval_freq: 100 + use_sdpa: true + dtype: bfloat16 +model: + model_name: vit_huge + pred_depth: 12 + pred_embed_dim: 384 + uniform_power: true + use_mask_tokens: true + zero_init_mask_tokens: true +optimization: + ipe: 300 + ipe_scale: 1.25 + clip_grad: 10.0 + weight_decay: 0.04 + final_weight_decay: 0.4 + epochs: 300 + warmup: 40 + start_lr: 0.0002 + lr: 0.000625 + final_lr: 1.0e-06 + ema: + - 0.998 + - 1.0 \ No newline at end of file diff --git a/benchmarks/vjepa/dev.yaml b/benchmarks/vjepa/dev.yaml new file mode 100644 index 000000000..7c374e3f8 --- /dev/null +++ b/benchmarks/vjepa/dev.yaml @@ -0,0 +1,32 @@ + + +_vjepa: + inherits: _defaults + definition: . 
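+  # Local dev config: the three vjepa-* entries below inherit this block,
+  # and the argv flags are forwarded to main.py. The canonical copy of
+  # this configuration lives in config/base.yaml.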
+ install-variant: unpinned + install_group: torch + + argv: + --dataset: "{milabench_data}/FakeVideo/video_metainfo.csv" + --output: "{milabench_extra}" + +vjepa-single: + inherits: _vjepa + plan: + method: per_gpu + +vjepa-gpus: + inherits: _vjepa + plan: + method: njobs + n: 1 + +vjepa-nodes: + inherits: _vjepa + plan: + method: njobs + n: 1 + + num_machines: 2 + requires_capabilities: + - "len(nodes) >= ${num_machines}" \ No newline at end of file diff --git a/benchmarks/vjepa/main.py b/benchmarks/vjepa/main.py new file mode 100644 index 000000000..74ca606f7 --- /dev/null +++ b/benchmarks/vjepa/main.py @@ -0,0 +1,656 @@ +#!/usr/bin/env python + +import os +import copy +import time +import sys + +current_dir = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(os.path.join(current_dir, 'jepa')) + + +import numpy as np +import torch +import torch.multiprocessing as mp +import torch.nn.functional as F +from torch.nn.parallel import DistributedDataParallel +import torchcompat.core as acc + +from src.datasets.data_manager import init_data +from src.masks.random_tube import MaskCollator as TubeMaskCollator +from src.masks.multiblock3d import MaskCollator as MB3DMaskCollator +from src.masks.utils import apply_masks +from src.utils.distributed import init_distributed, AllReduce +from src.utils.logging import ( + CSVLogger, + gpu_timer, + get_logger, + grad_logger, + adamw_logger, + AverageMeter) +from src.utils.tensors import repeat_interleave_batch + +from app.vjepa.utils import ( + load_checkpoint, + init_video_model, + init_opt, +) +from app.vjepa.transforms import make_transforms + + +# -- +log_timings = True +log_freq = 10 +checkpoint_freq = 1 +# -- + +_GLOBAL_SEED = 0 +np.random.seed(_GLOBAL_SEED) +torch.manual_seed(_GLOBAL_SEED) +torch.backends.cudnn.benchmark = True + + +logger = get_logger(__name__) + + +def _main(args, resume_preempt=False): + # ----------------------------------------------------------------------- # + # PASSED IN PARAMS FROM CONFIG FILE + # ----------------------------------------------------------------------- # + + # -- META + cfgs_meta = args.get('meta') + load_model = cfgs_meta.get('load_checkpoint') or resume_preempt + r_file = cfgs_meta.get('read_checkpoint', None) + seed = cfgs_meta.get('seed', _GLOBAL_SEED) + save_every_freq = cfgs_meta.get('save_every_freq', -1) + skip_batches = cfgs_meta.get('skip_batches', -1) + use_sdpa = cfgs_meta.get('use_sdpa', False) + which_dtype = cfgs_meta.get('dtype') + logger.info(f'{which_dtype=}') + if which_dtype.lower() == 'bfloat16': + dtype = torch.bfloat16 + mixed_precision = True + elif which_dtype.lower() == 'float16': + dtype = torch.float16 + mixed_precision = True + else: + dtype = torch.float32 + mixed_precision = False + + # -- MASK + cfgs_mask = args.get('mask') + + # -- MODEL + cfgs_model = args.get('model') + model_name = cfgs_model.get('model_name') + pred_depth = cfgs_model.get('pred_depth') + pred_embed_dim = cfgs_model.get('pred_embed_dim') + uniform_power = cfgs_model.get('uniform_power', True) + use_mask_tokens = cfgs_model.get('use_mask_tokens', True) + zero_init_mask_tokens = cfgs_model.get('zero_init_mask_tokens', True) + + # -- DATA + cfgs_data = args.get('data') + dataset_type = cfgs_data.get('dataset_type', 'videodataset') + mask_type = cfgs_data.get('mask_type', 'multiblock3d') + dataset_paths = cfgs_data.get('datasets', []) + datasets_weights = cfgs_data.get('datasets_weights', None) + if datasets_weights is not None: + assert len(datasets_weights) == len(dataset_paths), 'Must have one 
sampling weight specified for each dataset' + batch_size = cfgs_data.get('batch_size') + num_clips = cfgs_data.get('num_clips') + num_frames = cfgs_data.get('num_frames') + tubelet_size = cfgs_data.get('tubelet_size') + sampling_rate = cfgs_data.get('sampling_rate') + duration = cfgs_data.get('clip_duration', None) + crop_size = cfgs_data.get('crop_size', 224) + patch_size = cfgs_data.get('patch_size') + pin_mem = cfgs_data.get('pin_mem', False) + num_workers = cfgs_data.get('num_workers', 1) + filter_short_videos = cfgs_data.get('filter_short_videos', False) + decode_one_clip = cfgs_data.get('decode_one_clip', True) + log_resource_util_data = cfgs_data.get('log_resource_utilization', False) + + # -- DATA AUGS + cfgs_data_aug = args.get('data_aug') + ar_range = cfgs_data_aug.get('random_resize_aspect_ratio', [3/4, 4/3]) + rr_scale = cfgs_data_aug.get('random_resize_scale', [0.3, 1.0]) + motion_shift = cfgs_data_aug.get('motion_shift', False) + reprob = cfgs_data_aug.get('reprob', 0.) + use_aa = cfgs_data_aug.get('auto_augment', False) + + # -- LOSS + cfgs_loss = args.get('loss') + loss_exp = cfgs_loss.get('loss_exp') + reg_coeff = cfgs_loss.get('reg_coeff') + + # -- OPTIMIZATION + cfgs_opt = args.get('optimization') + ipe = cfgs_opt.get('ipe', None) + ipe_scale = cfgs_opt.get('ipe_scale', 1.0) + clip_grad = cfgs_opt.get('clip_grad', None) + wd = float(cfgs_opt.get('weight_decay')) + final_wd = float(cfgs_opt.get('final_weight_decay')) + num_epochs = cfgs_opt.get('epochs') + warmup = cfgs_opt.get('warmup') + start_lr = cfgs_opt.get('start_lr') + lr = cfgs_opt.get('lr') + final_lr = cfgs_opt.get('final_lr') + ema = cfgs_opt.get('ema') + betas = cfgs_opt.get('betas', (0.9, 0.999)) + eps = cfgs_opt.get('eps', 1.e-8) + + # -- LOGGING + cfgs_logging = args.get('logging') + folder = cfgs_logging.get('folder') + tag = cfgs_logging.get('write_tag') + + # ----------------------------------------------------------------------- # + # ----------------------------------------------------------------------- # + + np.random.seed(seed) + torch.manual_seed(seed) + torch.backends.cudnn.benchmark = True + try: + mp.set_start_method('spawn') + except Exception: + pass + + # -- init torch distributed backend + world_size, rank = init_distributed() + logger.info(f'Initialized (rank/world-size) {rank}/{world_size}') + + device = acc.fetch_device(int(os.getenv("LOCAL_RANK", 0))) + acc.set_device(device) + + # -- log/checkpointing paths + log_file = os.path.join(folder, f'{tag}_r{rank}.csv') + latest_file = f'{tag}-latest.pth.tar' + latest_path = os.path.join(folder, latest_file) + load_path = None + if load_model: + load_path = os.path.join(folder, r_file) if r_file is not None else latest_path + if not os.path.exists(load_path): + load_path = None + load_model = False + + # -- make csv_logger + csv_logger = CSVLogger( + log_file, + ('%d', 'epoch'), + ('%d', 'itr'), + ('%.5f', 'loss'), + ('%.5f', 'loss-jepa'), + ('%.5f', 'reg-loss'), + ('%.5f', 'enc-grad-norm'), + ('%.5f', 'pred-grad-norm'), + ('%d', 'gpu-time(ms)'), + ('%d', 'wall-time(ms)'), + ) + + # -- init model + encoder, predictor = init_video_model( + uniform_power=uniform_power, + use_mask_tokens=use_mask_tokens, + num_mask_tokens=len(cfgs_mask), + zero_init_mask_tokens=zero_init_mask_tokens, + device=device, + patch_size=patch_size, + num_frames=num_frames, + tubelet_size=tubelet_size, + model_name=model_name, + crop_size=crop_size, + pred_depth=pred_depth, + pred_embed_dim=pred_embed_dim, + use_sdpa=use_sdpa, + ) + target_encoder = 
copy.deepcopy(encoder) + + # -- make data transforms + if mask_type == 'multiblock3d': + logger.info('Initializing basic multi-block mask') + mask_collator = MB3DMaskCollator( + crop_size=crop_size, + num_frames=num_frames, + patch_size=patch_size, + tubelet_size=tubelet_size, + cfgs_mask=cfgs_mask) + else: + logger.info('Initializing random tube mask') + mask_collator = TubeMaskCollator( + crop_size=crop_size, + num_frames=num_frames, + patch_size=patch_size, + tubelet_size=tubelet_size, + cfgs_mask=cfgs_mask) + transform = make_transforms( + random_horizontal_flip=True, + random_resize_aspect_ratio=ar_range, + random_resize_scale=rr_scale, + reprob=reprob, + auto_augment=use_aa, + motion_shift=motion_shift, + crop_size=crop_size) + + # -- init data-loaders/samplers + (unsupervised_loader, + unsupervised_sampler) = init_data( + data=dataset_type, + root_path=dataset_paths, + batch_size=batch_size, + training=True, + clip_len=num_frames, + frame_sample_rate=sampling_rate, + filter_short_videos=filter_short_videos, + decode_one_clip=decode_one_clip, + duration=duration, + num_clips=num_clips, + transform=transform, + datasets_weights=datasets_weights, + collator=mask_collator, + num_workers=num_workers, + world_size=world_size, + pin_mem=pin_mem, + rank=rank, + log_dir=folder if log_resource_util_data else None) + try: + _dlen = len(unsupervised_loader) + except Exception: # Different interface for webdataset + _dlen = unsupervised_loader.num_batches + if ipe is None: + ipe = _dlen + logger.info(f'iterations per epoch/dataset length: {ipe}/{_dlen}') + + # Wrap the loader so milabench can meter batch throughput + def get_batch_size(batch): + # Tuple[[[Tensor]], [Tensor], [Tensor]] + udata, _, _ = batch + + # torch.Size([24, 3, 16, 224, 224]) + return udata[0][0].shape[0] + + from benchmate.observer import BenchObserver + observer = BenchObserver( + earlystop=65, + batch_size_fn=get_batch_size, + raise_stop_program=True, + stdout=True, + ) + unsupervised_loader = observer.iterate(unsupervised_loader) + + # -- init optimizer and scheduler + optimizer, scaler, scheduler, wd_scheduler = init_opt( + encoder=encoder, + predictor=predictor, + wd=wd, + final_wd=final_wd, + start_lr=start_lr, + ref_lr=lr, + final_lr=final_lr, + iterations_per_epoch=ipe, + warmup=warmup, + num_epochs=num_epochs, + ipe_scale=ipe_scale, + mixed_precision=mixed_precision, + betas=betas, + eps=eps) + + if os.getenv("RANK", "-1") != "-1": + encoder = DistributedDataParallel(encoder, static_graph=True) + predictor = DistributedDataParallel(predictor, static_graph=True) + target_encoder = DistributedDataParallel(target_encoder) + + for p in target_encoder.parameters(): + p.requires_grad = False + + # -- momentum schedule + momentum_scheduler = (ema[0] + i*(ema[1]-ema[0])/(ipe*num_epochs*ipe_scale) + for i in range(int(ipe*num_epochs*ipe_scale)+1)) + + start_epoch = 0 + # -- load training checkpoint + if load_model or os.path.exists(latest_path): + ( + encoder, + predictor, + target_encoder, + optimizer, + scaler, + start_epoch, + ) = load_checkpoint( + r_path=load_path, + encoder=encoder, + predictor=predictor, + target_encoder=target_encoder, + opt=optimizer, + scaler=scaler) + for _ in range(start_epoch * ipe): + scheduler.step() + wd_scheduler.step() + next(momentum_scheduler) + mask_collator.step() + + def save_checkpoint(epoch, path): + if rank != 0: + return + save_dict = { + 'encoder': encoder.state_dict(), + 'predictor': predictor.state_dict(), + 'opt': optimizer.state_dict(), + 'scaler': None if scaler is None else scaler.state_dict(), + 'target_encoder': 
target_encoder.state_dict(), + 'epoch': epoch, + 'loss': loss_meter.avg, + 'batch_size': batch_size, + 'world_size': world_size, + 'lr': lr, + } + try: + torch.save(save_dict, path) + except Exception as e: + logger.info(f'Encountered exception when saving checkpoint: {e}') + + logger.info('Initializing loader...') + loader = iter(unsupervised_loader) + + if skip_batches > 0: + logger.info(f'Skip {skip_batches} batches') + unsupervised_sampler.set_epoch(start_epoch) + for itr in range(skip_batches): + if itr % 10 == 0: + logger.info(f'Skip {itr}/{skip_batches} batches') + try: + udata = next(loader) + except Exception: + loader = iter(unsupervised_loader) + udata = next(loader) + + next_count = 0 + + # -- TRAINING LOOP + for epoch in range(start_epoch, num_epochs): + logger.info('Epoch %d' % (epoch + 1)) + + # -- update distributed-data-loader epoch + unsupervised_sampler.set_epoch(epoch) + + loss_meter = AverageMeter() + input_var_meter = AverageMeter() + input_var_min_meter = AverageMeter() + jepa_loss_meter = AverageMeter() + reg_loss_meter = AverageMeter() + mask_meters = [AverageMeter() for _ in range(len(cfgs_mask))] + gpu_time_meter = AverageMeter() + wall_time_meter = AverageMeter() + + for itr in range(ipe): + itr_start_time = time.time() + + try: + udata, masks_enc, masks_pred = next(loader) + next_count += 1 + except StopIteration: + logger.info('Exhausted data loaders after %d. Refreshing...', next_count) + next_count = 0 + loader = iter(unsupervised_loader) + udata, masks_enc, masks_pred = next(loader) + assert len(masks_enc) == len(masks_pred), \ + 'Currently require num encoder masks = num predictor masks' + + def load_clips(): + # -- unsupervised video clips + # Put each clip on the GPU and concatenate along batch + # dimension + clips = torch.cat([u.to(device, non_blocking=True) for u in udata[0]], dim=0) + + # Put each mask-enc/mask-pred pair on the GPU and reuse the + # same mask pair for each clip + _masks_enc, _masks_pred = [], [] + for _me, _mp in zip(masks_enc, masks_pred): + _me = _me.to(device, non_blocking=True) + _mp = _mp.to(device, non_blocking=True) + _me = repeat_interleave_batch(_me, batch_size, repeat=num_clips) + _mp = repeat_interleave_batch(_mp, batch_size, repeat=num_clips) + _masks_enc.append(_me) + _masks_pred.append(_mp) + + return (clips, _masks_enc, _masks_pred) + clips, masks_enc, masks_pred = load_clips() + + for _i, m in enumerate(mask_meters): + m.update(masks_enc[_i][0].size(-1)) + + def train_step(): + _new_lr = scheduler.step() + _new_wd = wd_scheduler.step() + # -- + + def forward_target(c): + """ + Returns list of tensors of shape [B, N, D], one for each + mask-pred. + """ + with torch.no_grad(): + h = target_encoder(c) + h = F.layer_norm(h, (h.size(-1),)) # normalize over feature-dim [B, N, D] + # -- create targets (masked regions of h) + h = apply_masks(h, masks_pred, concat=False) + return h + + def forward_context(c, h): + """ + Returns list of tensors of shape [B, N, D], one for each + mask-pred. + """ + z = encoder(c, masks_enc) + z = predictor(z, h, masks_enc, masks_pred) + return z + + def loss_fn(z, h): + loss = 0. + # Compute loss and accumulate for each mask-enc/mask-pred pair + for zi, hi in zip(z, h): + loss += torch.mean(torch.abs(zi - hi)**loss_exp) / loss_exp + loss /= len(masks_pred) + return loss + + def reg_fn(z): + return sum([torch.sqrt(zi.var(dim=1) + 0.0001) for zi in z]) / len(z) + + # Step 1. Forward + loss_jepa, loss_reg = 0., 0. 
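+                # Overview of the update below (descriptive comment only):
+                # Step 1 embeds clips with the frozen EMA target encoder (h)
+                # and with the trainable encoder + predictor (z), then
+                # averages mean(|z - h| ** loss_exp) / loss_exp over the
+                # mask pairs. Step 2 backpropagates (through the GradScaler
+                # under mixed precision) and Step 3 tracks the online encoder
+                # with an exponential moving average:
+                #     param_k <- m * param_k + (1 - m) * param_q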
+ with acc.amp.autocast(dtype=dtype, enabled=mixed_precision): + h = forward_target(clips) + z = forward_context(clips, h) + loss_jepa = loss_fn(z, h) # jepa prediction loss + pstd_z = reg_fn(z) # predictor variance across patches + loss_reg += torch.mean(F.relu(1.-pstd_z)) + loss = loss_jepa + reg_coeff * loss_reg + + # Step 2. Backward & step + _enc_norm, _pred_norm = 0., 0. + if mixed_precision: + scaler.scale(loss).backward() + scaler.unscale_(optimizer) + else: + loss.backward() + if (epoch > warmup) and (clip_grad is not None): + _enc_norm = torch.nn.utils.clip_grad_norm_(encoder.parameters(), clip_grad) + _pred_norm = torch.nn.utils.clip_grad_norm_(predictor.parameters(), clip_grad) + if mixed_precision: + scaler.step(optimizer) + scaler.update() + else: + optimizer.step() + grad_stats = grad_logger(encoder.named_parameters()) + grad_stats.global_norm = float(_enc_norm) + grad_stats_pred = grad_logger(predictor.named_parameters()) + grad_stats_pred.global_norm = float(_pred_norm) + optimizer.zero_grad() + optim_stats = adamw_logger(optimizer) + + # Step 3. momentum update of target encoder + m = next(momentum_scheduler) + with torch.no_grad(): + for param_q, param_k in zip(encoder.parameters(), target_encoder.parameters()): + param_k.data.mul_(m).add_((1.-m) * param_q.detach().data) + + return ( + float(loss), + float(loss_jepa), + float(loss_reg), + _new_lr, + _new_wd, + grad_stats, + grad_stats_pred, + optim_stats, + ) + (loss, loss_jepa, loss_reg, _new_lr, _new_wd, grad_stats, grad_stats_pred, optim_stats,), gpu_etime_ms = gpu_timer(train_step) + iter_elapsed_time_ms = (time.time() - itr_start_time) * 1000. + loss_meter.update(loss) + input_var = float(AllReduce.apply(clips.view(clips.shape[0], -1).var(dim=1).mean(dim=0))) + input_var_min = float(AllReduce.apply(torch.min(clips.view(clips.shape[0], -1).var(dim=1)))) + input_var_meter.update(input_var) + input_var_min_meter.update(input_var_min) + jepa_loss_meter.update(loss_jepa) + reg_loss_meter.update(loss_reg) + gpu_time_meter.update(gpu_etime_ms) + wall_time_meter.update(iter_elapsed_time_ms) + + observer.record_loss(loss) + + # -- Logging + def log_stats(): + csv_logger.log( + epoch + 1, + itr, + loss, + loss_jepa, + loss_reg, + grad_stats.global_norm, + grad_stats_pred.global_norm, + gpu_etime_ms, + iter_elapsed_time_ms) + if (itr % log_freq == 0) or np.isnan(loss) or np.isinf(loss): + logger.info( + '[%d, %5d] loss: %.3f | p%.3f r%.3f | ' + 'input_var: %.3f %.3f | ' + 'masks: %s ' + '[wd: %.2e] [lr: %.2e] ' + '[mem: %.2e] ' + '[gpu: %.1f ms]' + '[wall: %.1f ms]' + % (epoch + 1, itr, + loss_meter.avg, + jepa_loss_meter.avg, + reg_loss_meter.avg, + input_var_meter.avg, + input_var_min_meter.avg, + '[' + ', '.join(['%.1f' % m.avg for m in mask_meters]) + ']', + _new_wd, + _new_lr, + acc.max_memory_allocated() / 1024.0**2, + gpu_time_meter.avg, + wall_time_meter.avg)) + + if optim_stats is not None: + logger.info( + '[%d, %5d] first moment: %.2e [%.2e %.2e] second moment: %.2e [%.2e %.2e]' + % (epoch + 1, itr, + optim_stats.get('exp_avg').avg, + optim_stats.get('exp_avg').min, + optim_stats.get('exp_avg').max, + optim_stats.get('exp_avg_sq').avg, + optim_stats.get('exp_avg_sq').min, + optim_stats.get('exp_avg_sq').max)) + + if grad_stats is not None: + logger.info( + '[%d, %5d] enc_grad_stats: f/l[%.2e %.2e] mn/mx(%.2e, %.2e) %.2e' + % (epoch + 1, itr, + grad_stats.first_layer, + grad_stats.last_layer, + grad_stats.min, + grad_stats.max, + grad_stats.global_norm)) + + if grad_stats_pred is not None: + logger.info( + '[%d, %5d] 
pred_grad_stats: f/l[%.2e %.2e] mn/mx(%.2e, %.2e) %.2e' + % (epoch + 1, itr, + grad_stats_pred.first_layer, + grad_stats_pred.last_layer, + grad_stats_pred.min, + grad_stats_pred.max, + grad_stats_pred.global_norm)) + log_stats() + assert not np.isnan(loss), 'loss is nan' + + # -- Save Checkpoint + logger.info('avg. loss %.3f' % loss_meter.avg) + # -- Save Last + if epoch % checkpoint_freq == 0 or epoch == (num_epochs - 1): + save_checkpoint(epoch + 1, latest_path) + if save_every_freq > 0 and epoch % save_every_freq == 0: + save_every_file = f'{tag}-e{epoch}.pth.tar' + save_every_path = os.path.join(folder, save_every_file) + save_checkpoint(epoch + 1, save_every_path) + + + +def main(): + from argparse import ArgumentParser + import torchcompat.core as acc + from benchmate.monitor import bench_monitor + from voir.phase import StopProgram + import yaml + + parser = ArgumentParser() + parser.add_argument("--dataset", help="path to the csv that lists all videos", type=str) + parser.add_argument("--output", help="path to an output directory", type=str) + parser.add_argument("--batch_size", type=int, default=24) + parser.add_argument("--num_frames", type=int, default=16) + parser.add_argument("--num_workers", type=int, default=12) + args = parser.parse_args() + + # relying on environment variables is annoying in multinode setups + # mlbench = json.loads(os.environ["MILABENCH_CONFIG"]) + + configfile = os.path.join(os.path.dirname(__file__), 'config', 'vith16.yaml') + params = None + with open(configfile, 'r') as y_file: + params = yaml.load(y_file, Loader=yaml.FullLoader) + logger.info('loaded params...') + + params["data"]["datasets"] = [args.dataset] + params["data"]["batch_size"] = args.batch_size + params["data"]["num_frames"] = args.num_frames + params["data"]["num_workers"] = args.num_workers + + params["logging"]["folder"] = args.output + + gpu_per_nodes = int(os.getenv("LOCAL_WORLD_SIZE", 1)) + total_gpu = int(os.getenv("WORLD_SIZE", 1)) + nnodes = total_gpu // gpu_per_nodes + + params["nodes"] = nnodes + params["tasks_per_node"] = gpu_per_nodes + + if os.getenv("RANK", "-1") != "-1": + acc.init_process_group() + + try: + with bench_monitor(): + _main(params) + except StopProgram: + pass + + finally: + if os.getenv("RANK", "-1") != "-1": + acc.destroy_process_group() + + sys.exit(0) + +if __name__ == "__main__": + main() diff --git a/benchmarks/vjepa/prepare.py b/benchmarks/vjepa/prepare.py new file mode 100755 index 000000000..28abf23fb --- /dev/null +++ b/benchmarks/vjepa/prepare.py @@ -0,0 +1,70 @@ +#!/usr/bin/env python + +import os +import cv2 +import numpy as np + +def generate_random_video(output_file, width=640, height=480, num_frames=300, fps=30): + """ + Generates a .mp4 video file with random content. 
+ + :param output_file: Path and name of the output video file + :param width: Width of the video (in pixels) + :param height: Height of the video (in pixels) + :param num_frames: Number of frames in the video + :param fps: Frames per second (frame rate) of the video + """ + fourcc = cv2.VideoWriter_fourcc(*'mp4v') # Use MP4 encoding + video_writer = cv2.VideoWriter(output_file, fourcc, fps, (width, height)) + + for _ in range(num_frames): + frame = np.random.randint(0, 256, (height, width, 3), dtype=np.uint8) + video_writer.write(frame) + + video_writer.release() + + +if __name__ == "__main__": + import sys + import csv + import os + import tqdm + import multiprocessing + + sys.path.append(os.path.dirname(__file__) + "/jepa/") + data_directory = os.environ["MILABENCH_DIR_DATA"] + dest = os.path.join(data_directory, "FakeVideo") + os.makedirs(dest, exist_ok=True) + + csv_file = os.path.join(dest, "video_metainfo.csv") + + num_videos = 1000 # Change this to generate more or fewer videos + num_frames = 300 + + # Make the generation faster for the CI + if overrides := os.getenv("MILABENCH_TESTING_PREPARE"): + num1, num2 = overrides.split(",") + num_videos = int(num1) + num_frames = int(num2) + + def gen_video(i): + output_file = os.path.join(dest, f"{i + 1}.mp4") + if not os.path.exists(output_file): + generate_random_video(output_file=output_file, width=640, height=480, num_frames=num_frames, fps=30) + + n_worker = min(multiprocessing.cpu_count(), 16) + + with multiprocessing.Pool(n_worker) as pool: + for _ in tqdm.tqdm(pool.imap_unordered(gen_video, range(num_videos)), total=num_videos): + pass + + with open(csv_file, mode='w', newline='') as index_file: + # The index CSV is space-delimited: <video path> <label> + writer = csv.writer(index_file, delimiter=" ") + for video in tqdm.tqdm(os.listdir(dest)): + if video.endswith(".mp4"): + writer.writerow([os.path.join(dest, video), 0]) + + print(f"Generated {num_videos} videos and created {csv_file}") 
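+
+    # Usage sketch (assumed workflow): the CI can shrink the dataset before
+    # `milabench prepare --config dev.yaml --base base` runs this script, e.g.
+    #     MILABENCH_TESTING_PREPARE=10,30 milabench prepare --config dev.yaml --base base
+    # generates 10 videos of 30 frames instead of 1000 videos of 300 frames.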
diff --git a/benchmarks/vjepa/requirements.cuda.txt b/benchmarks/vjepa/requirements.cuda.txt new file mode 100644 index 000000000..c6e6ebb0e --- /dev/null +++ b/benchmarks/vjepa/requirements.cuda.txt @@ -0,0 +1,356 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile --output-file=benchmarks/vjepa/requirements.cuda.txt .pin/tmp-constraints-cuda-vjepa-gpus.txt benchmarks/vjepa/requirements.in +# +--extra-index-url https://pypi.ngc.nvidia.com +--extra-index-url https://download.pytorch.org/whl/cu121 +--find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html +--find-links https://data.pyg.org/whl/torch-2.4.0+cu121.html +--trusted-host pypi.ngc.nvidia.com + +antlr4-python3-runtime==4.9.3 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # omegaconf +asttokens==2.4.1 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # giving +beartype==0.18.5 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # -r benchmarks/vjepa/requirements.in +braceexpand==0.1.7 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # -r benchmarks/vjepa/requirements.in + # webdataset +certifi==2024.8.30 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # requests +charset-normalizer==3.3.2 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # requests +cloudpickle==3.0.0 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # submitit +codefind==0.1.7 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # ptera +decord==0.6.0 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # -r benchmarks/vjepa/requirements.in +einops==0.8.0 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # -r benchmarks/vjepa/requirements.in +executing==2.1.0 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # varname +filelock==3.16.0 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # huggingface-hub + # torch + # triton +fsspec==2024.6.1 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # huggingface-hub + # torch +giving==0.4.3 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # ptera + # voir +huggingface-hub==0.24.7 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # timm +idna==3.10 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # requests +jax[cuda12]==0.4.31 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # -r .pin/../constraints/extra/torch.cuda.txt +jax-cuda12-pjrt==0.4.31 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # jax-cuda12-plugin +jax-cuda12-plugin[with-cuda]==0.4.31 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # jax +jaxlib==0.4.31 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # jax +jinja2==3.1.4 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # torch +markdown-it-py==3.0.0 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # rich +markupsafe==2.1.5 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # jinja2 +mdurl==0.1.2 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # markdown-it-py +ml-dtypes==0.5.0 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # jax + # jaxlib +mpmath==1.3.0 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # sympy +networkx==3.3 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # torch +numpy==1.26.4 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # -r benchmarks/vjepa/requirements.in + # decord + # jax + # jaxlib + # ml-dtypes + # opencv-python + # opt-einsum + # pandas + # scipy + # torchvision + # webdataset + # 
xformers +nvidia-cublas-cu12==12.1.3.1 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # jax-cuda12-plugin + # nvidia-cudnn-cu12 + # nvidia-cusolver-cu12 + # torch +nvidia-cuda-cupti-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # jax-cuda12-plugin + # torch +nvidia-cuda-nvcc-cu12==12.6.68 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # jax-cuda12-plugin +nvidia-cuda-nvrtc-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # torch +nvidia-cuda-runtime-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # jax-cuda12-plugin + # torch +nvidia-cudnn-cu12==9.1.0.70 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # jax-cuda12-plugin + # torch +nvidia-cufft-cu12==11.0.2.54 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # jax-cuda12-plugin + # torch +nvidia-curand-cu12==10.3.2.106 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # torch +nvidia-cusolver-cu12==11.4.5.107 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # jax-cuda12-plugin + # torch +nvidia-cusparse-cu12==12.1.0.106 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # jax-cuda12-plugin + # nvidia-cusolver-cu12 + # torch +nvidia-ml-py==12.560.30 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # voir +nvidia-nccl-cu12==2.20.5 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # jax-cuda12-plugin + # torch +nvidia-nvjitlink-cu12==12.6.68 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # jax-cuda12-plugin + # nvidia-cusolver-cu12 + # nvidia-cusparse-cu12 +nvidia-nvtx-cu12==12.1.105 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # torch +omegaconf==2.3.0 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # voir +opencv-python==4.10.0.84 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # -r benchmarks/vjepa/requirements.in +opt-einsum==3.3.0 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # jax +ovld==0.3.9 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # voir +packaging==24.1 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # huggingface-hub +pandas==2.2.2 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # -r benchmarks/vjepa/requirements.in +pillow==10.4.0 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # torchvision +psutil==5.9.8 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # voir +ptera==1.4.1 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # voir +pygments==2.18.0 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # rich +python-dateutil==2.9.0.post0 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # pandas +pytz==2024.2 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # pandas +pyyaml==6.0.2 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # -r benchmarks/vjepa/requirements.in + # huggingface-hub + # omegaconf + # timm + # webdataset +reactivex==4.0.4 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # giving +requests==2.32.3 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # huggingface-hub +rich==13.8.1 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # voir +safetensors==0.4.5 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # timm +scipy==1.14.1 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # jax + # jaxlib +six==1.16.0 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # asttokens + # python-dateutil +submitit==1.5.1 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # -r benchmarks/vjepa/requirements.in 
+sympy==1.13.2 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # torch +timm==1.0.9 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # -r benchmarks/vjepa/requirements.in +torch==2.4.0+cu121 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # -r benchmarks/vjepa/requirements.in + # timm + # torchvision + # xformers +torchvision==0.19.0+cu121 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # -r benchmarks/vjepa/requirements.in + # timm +tqdm==4.66.5 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # huggingface-hub +triton==3.0.0 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # torch +typing-extensions==4.12.2 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # huggingface-hub + # reactivex + # submitit + # torch +tzdata==2024.1 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # pandas +urllib3==2.2.3 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # requests +varname==0.13.3 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # giving +voir==0.2.19 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # -c .pin/../constraints/cuda.txt + # -r benchmarks/vjepa/requirements.in +webdataset==0.2.100 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # -r benchmarks/vjepa/requirements.in +xformers==0.0.27.post2 + # via + # -c .pin/../.pin/constraints-cuda-torch.txt + # -r .pin/../constraints/extra/torch.cuda.txt diff --git a/benchmarks/vjepa/requirements.in b/benchmarks/vjepa/requirements.in new file mode 100644 index 000000000..248a1c98b --- /dev/null +++ b/benchmarks/vjepa/requirements.in @@ -0,0 +1,14 @@ +voir>=0.2.19,<0.3 +torch>=2 +torchvision +pyyaml +numpy +opencv-python +submitit +braceexpand +webdataset +timm +decord +pandas +einops +beartype \ No newline at end of file diff --git a/benchmarks/vjepa/voirfile.py b/benchmarks/vjepa/voirfile.py new file mode 100644 index 000000000..d93f886cd --- /dev/null +++ b/benchmarks/vjepa/voirfile.py @@ -0,0 +1,38 @@ +from dataclasses import dataclass + +from voir import configurable +from voir.instruments import dash, early_stop, log, rate +from benchmate.monitor import monitor_monogpu + +@dataclass +class Config: + """voir configuration""" + + # Whether to display the dash or not + dash: bool = False + + # How often to log the rates + interval: str = "1s" + + # Number of rates to skip before logging + skip: int = 5 + + # Number of rates to log before stopping + stop: int = 20 + + # Number of seconds between each gpu poll + gpu_poll: int = 3 + + +@configurable +def instrument_main(ov, options: Config): + yield ov.phases.init + + if options.dash: + ov.require(dash) + + ov.require( + log("value", "progress", "rate", "units", "loss", "gpudata", context="task"), + early_stop(n=options.stop, key="rate", task="train"), + monitor_monogpu(poll_interval=options.gpu_poll), + ) diff --git a/benchmate/benchmate/monitor.py b/benchmate/benchmate/monitor.py index dd8202ba1..5d2624201 100644 --- a/benchmate/benchmate/monitor.py +++ b/benchmate/benchmate/monitor.py @@ -44,9 +44,13 @@ def _smuggle_monitor(poll_interval=10, worker_init=None, **monitors): data_file = SmuggleWriter(sys.stdout) def mblog(data): nonlocal data_file - if data_file is not None: - print(json.dumps(data), file=data_file) + if data_file is not None: + try: + print(json.dumps(data), file=data_file) + except ValueError: + print("Bench is likely ending; ignoring ValueError") + def get(): t = time.time() entries = [] diff --git a/config/base.yaml b/config/base.yaml index 89f113deb..28a72afb7 100644 --- 
a/config/base.yaml +++ b/config/base.yaml @@ -1,4 +1,5 @@ _defaults: + enabled: false max_duration: 600 voir: options: @@ -32,6 +33,8 @@ _torchvision_ddp: definition: ../benchmarks/torchvision_ddp group: torchvision install_group: torch + tags: + - multigpu plan: method: njobs n: 1 @@ -52,6 +55,8 @@ _flops: tags: - diagnostic - flops + - monogpu + - nobatch argv: --number: 10 @@ -67,6 +72,8 @@ llama: - nlp - llm - inference + - monogpu + - nobatch voir: options: @@ -192,6 +199,7 @@ resnet50: - classification - convnet - resnet + - monogpu argv: --model: resnet50 @@ -207,6 +215,7 @@ resnet50-noio: - convnet - resnet - noio + - monogpu argv: --model: resnet50 @@ -220,6 +229,7 @@ resnet152-ddp-gpus: - classification - convnet - resnet + - multigpu argv: --model: resnet152 @@ -234,6 +244,7 @@ _convnext_large-base: - classification - convnet - precision-showcase + - monogpu argv: --model: convnext_large --batch-size: 128 @@ -269,6 +280,7 @@ regnet_y_128gf: - convnet - resnet - lstm + - monogpu argv: --model: regnet_y_128gf --batch-size: 64 @@ -282,6 +294,7 @@ _bert-base: - huggingface - precision-showcase - noio + - monogpu argv: --model: "Bert" --batch-size: 32 @@ -317,6 +330,7 @@ t5: - transformer - huggingface - noio + - monogpu argv: --model: "T5" --batch-size: 16 @@ -329,6 +343,7 @@ reformer: - transformer - huggingface - noio + - monogpu argv: --model: "Reformer" --batch-size: 64 @@ -339,6 +354,7 @@ whisper: - audio - huggingface - noio + - monogpu argv: --model: "Whisper" --batch-size: 64 @@ -349,34 +365,19 @@ focalnet: - vision - classification - convnet + - monogpu plan: method: per_gpu argv: --model: focalnet_base_lrf -super-slomo: - inherits: _defaults - tags: - - vision - - video-interpolation - - unet - - convnet - definition: ../benchmarks/super-slomo - group: super-slomo - install_group: torch - plan: - method: per_gpu - argv: - --train_batch_size: 64 - --dataset_root: "{milabench_data}/FakeImageNet" - --loader: pytorch - --num_workers: "auto({n_worker}, 8)" - brax: inherits: _defaults tags: - rl - jax + - multigpu + - gym definition: ../benchmarks/brax group: brax install_group: torch @@ -408,11 +409,15 @@ _diffusion: diffusion-single: inherits: _diffusion num_machines: 1 + tags: + - monogpu plan: method: per_gpu diffusion-gpus: inherits: _diffusion + tags: + - multigpu plan: method: njobs n: 1 @@ -445,12 +450,16 @@ _lightning: lightning: inherits: _lightning + tags: + - monogpu num_machines: 1 plan: method: per_gpu lightning-gpus: inherits: _lightning + tags: + - multigpu num_machines: 1 plan: method: njobs @@ -476,7 +485,8 @@ dinov2-giant-single: inherits: _dinov2 plan: method: per_gpu - + tags: + - monogpu argv: --config-file: "{benchmark_folder}/src/dinov2/configs/train/vitg14.yaml" # THOSE NEED TO BE LAST @@ -487,6 +497,8 @@ dinov2-giant-single: dinov2-giant-gpus: inherits: _dinov2 + tags: + - multigpu plan: method: njobs n: 1 @@ -499,7 +511,6 @@ dinov2-giant-gpus: train.num_workers=10: true dinov2-giant-nodes: - enabled: false plan: method: njobs n: 1 @@ -535,6 +546,8 @@ _llm: llm-lora-single: inherits: _llm + tags: + - monogpu plan: method: per_gpu argv: @@ -556,7 +569,8 @@ llm-lora-ddp-gpus: plan: method: njobs n: 1 - + tags: + - multigpu argv: "{milabench_code}/recipes/lora_finetune_distributed.py": true --config: "{milabench_code}/configs/llama3_8B_lora_single_device.yaml" @@ -600,6 +614,8 @@ llm-lora-ddp-nodes: llm-lora-mp-gpus: inherits: _llm + tags: + - multigpu plan: method: njobs n: 1 @@ -620,6 +636,8 @@ llm-lora-mp-gpus: llm-full-mp-gpus: inherits: _llm + tags: + 
- multigpu plan: method: njobs n: 1 @@ -667,6 +685,11 @@ llm-full-mp-nodes: _purejaxrl: inherits: _defaults + install_group: torch + tags: + - monogpu + - gym + - rl definition: ../benchmarks/purejaxrl plan: method: per_gpu @@ -694,17 +717,17 @@ ppo: _geo_gnn: inherits: _defaults tags: + - monogpu - graph - # FIXME: torch cluster is laging behind pytorch - # we are forced to use torch==2.3 instead of torch==2.4 - install_group: gnn - group: geo_gnn + install_group: torch definition: ../benchmarks/geo_gnn plan: method: per_gpu dimenet: inherits: _geo_gnn + tags: + - monogpu argv: --model: 'DimeNet' --num-samples: 10000 @@ -713,10 +736,10 @@ dimenet: recursiongfn: inherits: _defaults definition: ../benchmarks/recursiongfn - install_group: gnn - group: recursiongfn_gnn + install_group: torch tags: - graph + - monogpu plan: method: per_gpu @@ -735,6 +758,8 @@ torchatari: method: per_gpu tags: - rl + - monogpu + - gym argv: --num-minibatches: 16 --update-epochs: 4 @@ -743,34 +768,36 @@ torchatari: --total-timesteps: 1000000 --env-id: Breakout-v5 - -llava-single: +_llava: inherits: _defaults definition: ../benchmarks/llava install_group: torch plan: method: per_gpu - tags: - llm + - monogpu argv: --batch_size: 1 --num_workers: 4 --gradient_accumulation_steps: 1 -llava-gpus: - # This OOM - enabled: false +llava-single: + inherits: _llava + plan: + method: per_gpu + argv: + --batch_size: 1 + --num_workers: 4 + --gradient_accumulation_steps: 1 - inherits: _defaults - definition: ../benchmarks/llava - install_group: torch +llava-gpus: + inherits: _llava + tags: + - multigpu plan: method: njobs n: 1 - - tags: - - llm argv: --batch_size: 1 --num_workers: 4 @@ -780,11 +807,12 @@ llava-gpus: _rlhf: inherits: _defaults definition: ../benchmarks/rlhf - install-variant: unpinned install_group: torch plan: method: per_gpu tags: + - monogpu + - rl - rlhf - llm argv: @@ -802,6 +830,35 @@ rlhf-single: rlhf-gpus: inherits: _rlhf + tags: + - multigpu + plan: + method: njobs + n: 1 + +_vjepa: + inherits: _defaults + install_group: torch + definition: ../benchmarks/vjepa + tags: + - video + argv: + --batch_size: 24 + --num_workers: "auto({n_worker}, 12)" + --dataset: "{milabench_data}/FakeVideo/video_metainfo.csv" + --output: "{milabench_extra}" + +vjepa-single: + inherits: _vjepa + tags: + - monogpu + plan: + method: per_gpu + +vjepa-gpus: + inherits: _vjepa + tags: + - multigpu plan: method: njobs n: 1 diff --git a/config/fast.yaml b/config/fast.yaml new file mode 100644 index 000000000..44932e5bc --- /dev/null +++ b/config/fast.yaml @@ -0,0 +1,33 @@ +# +# Configuration for fast testing, single node +# +# - Flops: check that the GPUs are performing according to spec +# - Multi GPU: checks that GPUs can talk to each other +# + +include: + - base.yaml + +fp16: + enabled: true + weight: 1.0 + +bf16: + enabled: true + weight: 1.0 + +tf32: + enabled: true + weight: 1.0 + +fp32: + enabled: true + weight: 1.0 + +lightning-gpus: + enabled: True + weight: 1.0 + +llm-lora-ddp-nodes: + enabled: True + weight: 1.0 \ No newline at end of file diff --git a/config/high.yaml b/config/high.yaml new file mode 100644 index 000000000..4eb43816c --- /dev/null +++ b/config/high.yaml @@ -0,0 +1,9 @@ +# +# Configuration for high fidelity +# +# - Prefer original dataset over generated/fake datasets +# - Prefer pretrained model when possible +# + +include: + - standard.yaml \ No newline at end of file diff --git a/config/medium.yaml b/config/medium.yaml new file mode 100644 index 000000000..d52f4e47f --- /dev/null +++ 
b/config/medium.yaml @@ -0,0 +1,9 @@ +# +# Configuration for medium fidelity +# +# - Prefer generated/fake datasets for big datasets +# - Prefer pretrained model when weights are small +# + +include: + - standard.yaml \ No newline at end of file diff --git a/config/retired.yaml b/config/retired.yaml index a73abf0b2..11ef99299 100644 --- a/config/retired.yaml +++ b/config/retired.yaml @@ -378,3 +378,24 @@ rwkv: dlrm: enabled: true weight: 1.0 + +super-slomo: + enabled: false + weight: 1.0 + inherits: _defaults + tags: + - vision + - video-interpolation + - unet + - convnet + - monogpu + definition: ../benchmarks/super-slomo + group: super-slomo + install_group: torch + plan: + method: per_gpu + argv: + --train_batch_size: 64 + --dataset_root: "{milabench_data}/FakeImageNet" + --loader: pytorch + --num_workers: "auto({n_worker}, 8)" \ No newline at end of file diff --git a/config/scaling.yaml b/config/scaling.yaml index 0a9907e5a..09f3f9ae5 100644 --- a/config/scaling.yaml +++ b/config/scaling.yaml @@ -442,3 +442,36 @@ whisper: 128: 71634.375 MiB 144: 80412.75 MiB optimized: 128 + + +llava-single: + arg: --batch_size + optimized: 1 + +llava-gpus: + arg: --batch_size + optimized: 1 + +rlhf-single: + arg: --per_device_train_batch_size + optimized: 64 + +rlhf-gpus: + arg: --per_device_train_batch_size + optimized: 64 + +vjepa-single: + arg: --batch_size + optimized: 24 + +vjepa-gpus: + arg: --batch_size + optimized: 24 + +ppo: + arg: --num_minibatches + optimized: 32 + +dqn: + arg: --buffer_batch_size + optimized: 128 \ No newline at end of file diff --git a/config/standard.yaml b/config/standard.yaml index 9c3d2424b..588e35e9a 100644 --- a/config/standard.yaml +++ b/config/standard.yaml @@ -69,10 +69,6 @@ focalnet: enabled: true weight: 2.0 -super-slomo: - enabled: true - weight: 1.0 - fp16: enabled: true weight: 0.0 @@ -93,20 +89,42 @@ brax: enabled: true weight: 1.0 +# Diffusion +diffusion-single: + enabled: true + weight: 1.0 + diffusion-gpus: enabled: True weight: 1.0 +diffusion-nodes: + enabled: true + weight: 1.0 + +# lightning +lightning: + enabled: true + weight: 1.0 + lightning-gpus: enabled: True weight: 1.0 +# dinov2 +dinov2-giant-single: + enabled: True + weight: 1.0 + dinov2-giant-gpus: enabled: True weight: 1.0 -# LLM +dinov2-giant-nodes: + enabled: false + weight: 1.0 +# LLM llm-lora-single: enabled: True weight: 1.0 @@ -131,6 +149,60 @@ llm-full-mp-nodes: enabled: True weight: 1.0 +resnet152-ddp-gpus: + enabled: true + weight: 0.0 + +# purejaxrl +dqn: + enabled: true + weight: 1.0 + +ppo: + enabled: true + weight: 1.0 + +# Geo +dimenet: + enabled: true + weight: 1.0 + +recursiongfn: + enabled: true + weight: 1.0 + +# torchatari +torchatari: + enabled: True + weight: 1.0 + +# llava +llava-single: + enabled: true + weight: 1.0 + +llava-gpus: # This OOM + enabled: false + weight: 1.0 + +# rlhf +rlhf-single: + enabled: True + weight: 1.0 + +rlhf-gpus: + enabled: true + weight: 1.0 + +# vjepa +vjepa-single: + enabled: true + weight: 1.0 + +vjepa-gpus: + enabled: true + weight: 1.0 + ################## # Disabled tests # ################## diff --git a/constraints/cuda.txt b/constraints/cuda.txt index 90219e078..eb6bbcedf 100644 --- a/constraints/cuda.txt +++ b/constraints/cuda.txt @@ -3,4 +3,5 @@ # # voir >= 0.2.19 -torchcompat >= 1.0.0 \ No newline at end of file +torchcompat >= 1.0.0 +gymnax >= 0.0.8 diff --git a/constraints/extra/gnn.cuda.txt b/constraints/extra/gnn.cuda.txt deleted file mode 100644 index e5decec56..000000000 --- a/constraints/extra/gnn.cuda.txt 
+++ /dev/null @@ -1,4 +0,0 @@ ---find-links https://data.pyg.org/whl/torch-2.3.0+cu121.html - -torch>=2.3.0,<2.4.0 - diff --git a/constraints/extra/gnn.hpu.txt b/constraints/extra/gnn.hpu.txt deleted file mode 100644 index e69de29bb..000000000 diff --git a/constraints/extra/gnn.rocm.txt b/constraints/extra/gnn.rocm.txt deleted file mode 100644 index e69de29bb..000000000 diff --git a/constraints/extra/gnn.xpu.txt b/constraints/extra/gnn.xpu.txt deleted file mode 100644 index e69de29bb..000000000 diff --git a/constraints/extra/torch.cuda.txt b/constraints/extra/torch.cuda.txt index 09e3393a2..942277d5c 100644 --- a/constraints/extra/torch.cuda.txt +++ b/constraints/extra/torch.cuda.txt @@ -7,4 +7,8 @@ jax[cuda12] # --extra-index-url https://download.pytorch.org/whl/cu121 # --find-links https://download.pytorch.org/whl/xformers/ -xformers==0.0.27.post2 \ No newline at end of file +xformers==0.0.27.post2 + + +# Torch geometric +--find-links https://data.pyg.org/whl/torch-2.4.0+cu121.html diff --git a/constraints/hpu.txt b/constraints/hpu.txt index 6313b8786..23a110bd2 100644 --- a/constraints/hpu.txt +++ b/constraints/hpu.txt @@ -5,4 +5,4 @@ # voir >= 0.2.19 torchcompat >= 1.0.0 - +gymnax >= 0.0.8 \ No newline at end of file diff --git a/constraints/rocm.txt b/constraints/rocm.txt index 559a3f68d..b86ce00d3 100644 --- a/constraints/rocm.txt +++ b/constraints/rocm.txt @@ -3,4 +3,5 @@ # # voir >= 0.2.19 -torchcompat >= 1.0.0 \ No newline at end of file +torchcompat >= 1.0.0 +gymnax >= 0.0.8 diff --git a/constraints/xpu.txt b/constraints/xpu.txt index 5aa7739a2..2fd966c1e 100644 --- a/constraints/xpu.txt +++ b/constraints/xpu.txt @@ -15,4 +15,5 @@ intel-extension-for-openxla # # voir >= 0.2.19 -torchcompat >= 1.0.0 \ No newline at end of file +torchcompat >= 1.0.0 +gymnax >= 0.0.8 diff --git a/milabench/_version.py b/milabench/_version.py index 1d8d51c47..4b49d0506 100644 --- a/milabench/_version.py +++ b/milabench/_version.py @@ -1,5 +1,5 @@ """This file is generated, do not modify""" -__tag__ = "v0.1.0-83-ge5505ee0" -__commit__ = "e5505ee0c6e0fe547af149b4ca87d0d7538cdd58" -__date__ = "2024-09-05 17:19:59 -0400" +__tag__ = "v0.1.0-113-g9a5dfe3e" +__commit__ = "9a5dfe3ef36e6baab6584faa3fa939e63ba2aed5" +__date__ = "2024-09-16 09:08:28 -0400" diff --git a/milabench/commands/__init__.py b/milabench/commands/__init__.py index 3de44337a..e97ac4e58 100644 --- a/milabench/commands/__init__.py +++ b/milabench/commands/__init__.py @@ -674,8 +674,12 @@ def make_base_executor(cls, executor, *args, **kwargs): main_host = node_address(main) # add them as option so we could tweak them if necessary main_port = option("torchrun.port", int, default=29400) - backend = option("torchrun.backend", str, default="c10d") + backend = option("torchrun.backend", str, default="static") + filters = option("torchrun.local_ranks_filter", str, default="0") + if backend == "c10d": + print("Warning: c10d can select the wrong node for RANK=0") + main_addr = f"{main_host}:{main_port}" config = executor.pack.config @@ -685,12 +689,27 @@ def make_base_executor(cls, executor, *args, **kwargs): f"--nnodes={len(nodes)}", f"--rdzv-backend={backend}", f"--rdzv-endpoint={main_addr}", - # f"--master-addr={main_host}", - # f"--master-port={main_port}", + f"--master-addr={main_host}", + f"--master-port={main_port}", + f"--local-ranks-filter={filters}", *args, **kwargs ) + def make_new_node_executor(self, rank, node, base): + """Make a new environment and create a new executor for the node""" + executor: TorchrunAllGPU = 
super().make_new_node_executor(rank, node, base) + + # Specify the node rank so rank 0 is consistently on the local node + new_args = list(executor.wrapper_argv) + [ + f"--node-rank={rank}", + f"--local-addr={node['ip']}", + f"--rdzv-conf=rank={rank}", + ] + executor.wrapper_argv = new_args + + return executor + def __init__(self, executor: Command, *args, **kwargs) -> None: base_exec = TorchrunAllNodes.make_base_executor( TorchrunAllGPU, diff --git a/milabench/pack.py b/milabench/pack.py index cbe3b2d92..1cdde0939 100644 --- a/milabench/pack.py +++ b/milabench/pack.py @@ -349,6 +349,7 @@ def make_env(self): # building an image, but it is overall nicer for development to use # the default cache). env["XDG_CACHE_HOME"] = str(self.dirs.cache) + return env def full_env(self, env={}): diff --git a/milabench/report.py b/milabench/report.py index 7b6ccb5ed..aebcaf093 100644 --- a/milabench/report.py +++ b/milabench/report.py @@ -304,7 +304,9 @@ def print_meta(out, meta): if k == "accelerators": gpus = v["gpus"] n = len(gpus) - _, gpu = gpus.popitem() + gpu = {} + if n > 0: + _, gpu = gpus.popitem() stats = { "n": n, "product": gpu.get("product", "NA"), @@ -325,7 +327,9 @@ def short_meta(out, meta): if k == "accelerators": gpus = v["gpus"] n = len(gpus) - _, gpu = gpus.popitem() + gpu = {} + if n > 0: + _, gpu = gpus.popitem() stats["product"] = gpu.get("product", "NA") stats["n_gpu"] = n stats["memory"] = str(gpu.get("memory", {}).get("total", 0)) @@ -486,21 +490,32 @@ def pandas_to_string(df, formatters=_formatters): columns = df.columns.tolist() - sep = " | " - lines = [] + # Compute column size col_size = defaultdict(int) - for index, row in df.iterrows(): - line = [f"{index:<30}"] + col_size["bench"] = max(col_size["bench"], len(index)) for col, val in zip(columns, row): fmt = formatters.get(col) - if fmt is not None: val = fmt(val) col_size[col] = max(col_size[col], len(val)) + + # Generate report + sep = " | " + lines = [] + for index, row in df.iterrows(): + size = col_size["bench"] + line = [f"{index:<{size}}"] + + for col, val in zip(columns, row): + fmt = formatters.get(col) + if fmt is not None: + val = fmt(val) else: val = str(val) + size = col_size[col] + val = f"{val:>{size}}" line.append(val) lines.append(sep.join(line)) @@ -509,7 +524,8 @@ def fmtcol(col): size = col_size[col] return f"{col:>{size}}" - header = sep.join([f"{'bench':<30}"] + [fmtcol(col) for col in columns]) + size = col_size["bench"] + header = sep.join([f"{'bench':<{size}}"] + [fmtcol(col) for col in columns]) return "\n".join([header] + lines) diff --git a/milabench/summary.py b/milabench/summary.py index de9ced414..de3583f28 100644 --- a/milabench/summary.py +++ b/milabench/summary.py @@ -236,7 +236,8 @@ def _summarize(group, query=tuple([])) -> Summary: "name": config["name"], "group": config["group"], "n": len(agg["success"]), - "ngpu": sum(agg["ngpu"]) / len(agg["ngpu"]), + # In case of failure it is possible ngpu is 0 or 1 + "ngpu": max(agg["ngpu"]), "successes": sum(agg["success"]), "failures": sum(not x for x in agg["success"]), "train_rate": _metrics(agg["train_rate"]), diff --git a/milabench/system.py b/milabench/system.py index d29f4cd27..c237baf2c 100644 --- a/milabench/system.py +++ b/milabench/system.py @@ -1,11 +1,11 @@ import contextvars +import ipaddress import os import socket -from dataclasses import dataclass, field -import sys import subprocess +import sys from contextlib import contextmanager -import ipaddress +from dataclasses import dataclass, field import psutil import yaml @@ -193,11 
+193,11 @@ class Torchrun: @dataclass class Options: - sizer: SizerOptions = SizerOptions() - cpu: CPUOptions = CPUOptions() - dataset: DatasetConfig = DatasetConfig() - dirs: Dirs = Dirs() - torchrun: Torchrun = Torchrun() + sizer: SizerOptions = field(default_factory=SizerOptions) + cpu: CPUOptions = field(default_factory=CPUOptions) + dataset: DatasetConfig = field(default_factory=DatasetConfig) + dirs: Dirs = field(default_factory=Dirs) + torchrun: Torchrun = field(default_factory=Torchrun) @dataclass @@ -231,18 +231,19 @@ def default_device(): @dataclass class SystemConfig: """This is meant to be an exhaustive list of all the environment overrides""" + arch: str = defaultfield("gpu.arch", str, default_device()) sshkey: str = defaultfield("ssh", str, "~/.ssh/id_rsa") docker_image: str = None nodes: list[Nodes] = field(default_factory=list) - gpu: GPUConfig = GPUConfig() - options: Options = Options() + gpu: GPUConfig = field(default_factory=GPUConfig) + options: Options = field(default_factory=Options) base: str = defaultfield("base", str, None) config: str = defaultfield("config", str, None) dash: bool = defaultfield("dash", bool, 1) noterm: bool = defaultfield("noterm", bool, 0) - github: Github = Github() + github: Github = field(default_factory=Github) def check_node_config(nodes): diff --git a/milabench/utils.py b/milabench/utils.py index 8495d117e..a046d3868 100644 --- a/milabench/utils.py +++ b/milabench/utils.py @@ -239,7 +239,8 @@ def select_nodes(nodes, n): else: ranked.append(node) - return ranked[: max(1, min(n, len(ranked)))] + selected = ranked[: max(1, min(n, len(ranked)))] + return selected def enumerate_rank(nodes): diff --git a/scripts/article/run_cuda.sh b/scripts/article/run_cuda.sh index df0cf2f92..b7b31eed3 100644 --- a/scripts/article/run_cuda.sh +++ b/scripts/article/run_cuda.sh @@ -8,6 +8,8 @@ export MILABENCH_BASE="$MILABENCH_WORDIR/results" export MILABENCH_VENV="$MILABENCH_WORDIR/env" export BENCHMARK_VENV="$MILABENCH_WORDIR/results/venv/torch" +export MILABENCH_SIZER_SAVE="$MILABENCH_WORDIR/scaling.yaml" + if [ -z "${MILABENCH_PREPARE}" ]; then export MILABENCH_PREPARE=0 @@ -40,13 +42,14 @@ install_prepare() { pip install -e $MILABENCH_SOURCE + milabench slurm_system > $MILABENCH_WORDIR/system.yaml # # Install milabench's benchmarks in their venv # - pip install torch - milabench pin --variant cuda --from-scratch $ARGS + # pip install torch + # milabench pin --variant cuda --from-scratch $ARGS milabench install --system $MILABENCH_WORDIR/system.yaml $ARGS which pip @@ -81,9 +84,10 @@ if [ "$MILABENCH_PREPARE" -eq 0 ]; then . 
$MILABENCH_WORDIR/env/bin/activate - # milabench pin --variant cuda --from-scratch + + # pip install torch + # milabench pin --variant cuda --from-scratch $ARGS # milabench install --system $MILABENCH_WORDIR/system.yaml --force $ARGS - # milabench prepare --system $MILABENCH_WORDIR/system.yaml $ARGS # # Run the benchmarks diff --git a/tests/test_command_reg/test_command_reg_one_node.txt index 05a286f8a..3a511bb65 100644 --- a/tests/test_command_reg/test_command_reg_one_node.txt +++ b/tests/test_command_reg/test_command_reg_one_node.txt @@ -16,7 +16,7 @@ export MILABENCH_DIR_RUNS=$BASE/runs export MILABENCH_DIR_EXTRA=$BASE/extra/llm export MILABENCH_DIR_CACHE=$BASE/cache export OMP_NUM_THREADS=0 -export MILABENCH_CONFIG='{"system": {"arch": "cuda", "sshkey": null, "nodes": [{"ip": "127.0.0.1", "main": true, "name": "0", "sshport": 22, "user": "username", "hostname": "127.0.0.1"}], "self": {"ip": "127.0.0.1", "main": true, "name": "0", "sshport": 22, "user": "username", "hostname": "127.0.0.1"}}, "dirs": {"base": "$BASE", "venv": "$BASE/venv/torch", "data": "$BASE/data", "runs": "$BASE/runs", "extra": "$BASE/extra/llm", "cache": "$BASE/cache"}, "group": "llm", "install_group": "torch", "install_variant": "cuda", "run_name": "dev", "enabled": true, "capabilities": {"nodes": 1}, "max_duration": 800, "voir": {"options": {"stop": 30, "interval": "1s"}}, "validation": {"usage": {"gpu_load_threshold": 0.5, "gpu_mem_threshold": 0.5}}, "config_base": "$SRC/milabench/config", "config_file": "$SRC/milabench/config/standard.yaml", "definition": "$SRC/milabench/benchmarks/llama", "tags": ["inference", "llm", "nlp"], "plan": {"method": "per_gpu"}, "weight": 1.0, "name": "llama", "tag": ["llama"]}' +export MILABENCH_CONFIG='{"system": {"arch": "cuda", "sshkey": null, "nodes": [{"ip": "127.0.0.1", "main": true, "name": "0", "sshport": 22, "user": "username", "hostname": "127.0.0.1"}], "self": {"ip": "127.0.0.1", "main": true, "name": "0", "sshport": 22, "user": "username", "hostname": "127.0.0.1"}}, "dirs": {"base": "$BASE", "venv": "$BASE/venv/torch", "data": "$BASE/data", "runs": "$BASE/runs", "extra": "$BASE/extra/llm", "cache": "$BASE/cache"}, "group": "llm", "install_group": "torch", "install_variant": "cuda", "run_name": "dev", "enabled": true, "capabilities": {"nodes": 1}, "max_duration": 800, "voir": {"options": {"stop": 30, "interval": "1s"}}, "validation": {"usage": {"gpu_load_threshold": 0.5, "gpu_mem_threshold": 0.5}}, "config_base": "$SRC/milabench/config", "config_file": "$SRC/milabench/config/standard.yaml", "definition": "$SRC/milabench/benchmarks/llama", "tags": ["inference", "llm", "monogpu", "nlp", "nobatch"], "plan": {"method": "per_gpu"}, "weight": 1.0, "name": "llama", "tag": ["llama"]}' echo "---" echo "llama" @@ -326,21 +326,6 @@ time ( wait ) -echo "---" -echo "super-slomo" -echo "===========" -time ( - CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/super-slomo/slomo/train.py --train_batch_size 64 --dataset_root $BASE/data/FakeImageNet --loader pytorch --num_workers 8 & - CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/super-slomo/slomo/train.py --train_batch_size 64 --dataset_root $BASE/data/FakeImageNet --loader pytorch --num_workers 8 & - CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/super-slomo/slomo/train.py --train_batch_size 64 --dataset_root $BASE/data/FakeImageNet --loader pytorch --num_workers 8 & - CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/super-slomo/slomo/train.py 
--train_batch_size 64 --dataset_root $BASE/data/FakeImageNet --loader pytorch --num_workers 8 & - CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/super-slomo/slomo/train.py --train_batch_size 64 --dataset_root $BASE/data/FakeImageNet --loader pytorch --num_workers 8 & - CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/super-slomo/slomo/train.py --train_batch_size 64 --dataset_root $BASE/data/FakeImageNet --loader pytorch --num_workers 8 & - CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/super-slomo/slomo/train.py --train_batch_size 64 --dataset_root $BASE/data/FakeImageNet --loader pytorch --num_workers 8 & - CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/super-slomo/slomo/train.py --train_batch_size 64 --dataset_root $BASE/data/FakeImageNet --loader pytorch --num_workers 8 & - wait -) - echo "---" echo "brax" echo "====" @@ -399,7 +384,7 @@ echo "---" echo "lightning-gpus" echo "==============" time ( - $BASE/venv/torch/bin/benchrun --nnodes=1 --rdzv-backend=c10d --rdzv-endpoint=127.0.0.1:29400 --nproc-per-node=8 --no-python -- python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & + $BASE/venv/torch/bin/benchrun --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 --no-python -- python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & wait ) @@ -422,7 +407,7 @@ echo "---" echo "dinov2-giant-gpus" echo "=================" time ( - $BASE/venv/torch/bin/benchrun --nnodes=1 --rdzv-backend=c10d --rdzv-endpoint=127.0.0.1:29400 --nproc-per-node=8 --no-python -- python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-gpus/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 & + $BASE/venv/torch/bin/benchrun --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 --no-python -- python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-gpus/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 & wait ) @@ -445,7 +430,7 @@ echo "---" echo "llm-lora-ddp-gpus" echo "=================" time ( - $BASE/venv/torch/bin/tune run --nnodes=1 --rdzv-backend=c10d --rdzv-endpoint=127.0.0.1:29400 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/lora_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-ddp-gpus/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-ddp-gpus/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 & + $BASE/venv/torch/bin/tune run --nnodes=1 --rdzv-backend=static 
--rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/lora_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-ddp-gpus/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-ddp-gpus/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 & wait ) @@ -453,7 +438,7 @@ echo "---" echo "llm-lora-ddp-nodes" echo "==================" time ( - $BASE/venv/torch/bin/tune run --nnodes=1 --rdzv-backend=c10d --rdzv-endpoint=127.0.0.1:29400 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/lora_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-ddp-nodes/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-ddp-nodes/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 & + $BASE/venv/torch/bin/tune run --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/lora_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-ddp-nodes/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-ddp-nodes/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 & wait ) @@ -461,7 +446,7 @@ echo "---" echo "llm-lora-mp-gpus" echo "================" time ( - $BASE/venv/torch/bin/tune run --nnodes=1 --rdzv-backend=c10d --rdzv-endpoint=127.0.0.1:29400 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/lora_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_70B_lora.yaml epochs=1 output_dir=$BASE/extra/llm-lora-mp-gpus/output tokenizer.path=$BASE/data/llama3_70B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_70B checkpointer.output_dir=$BASE/data/llama3_70B/ safetensors=true metric_logger.log_dir=$BASE/extra/llm-lora-mp-gpus/metrics repo_id="meta-llama/Meta-Llama-3.1-70B" batch_size=8 gradient_accumulation_steps=1 & + $BASE/venv/torch/bin/tune run --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/lora_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_70B_lora.yaml epochs=1 output_dir=$BASE/extra/llm-lora-mp-gpus/output tokenizer.path=$BASE/data/llama3_70B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_70B checkpointer.output_dir=$BASE/data/llama3_70B/ safetensors=true metric_logger.log_dir=$BASE/extra/llm-lora-mp-gpus/metrics repo_id="meta-llama/Meta-Llama-3.1-70B" batch_size=8 gradient_accumulation_steps=1 & wait ) @@ -469,7 +454,7 @@ echo "---" echo "llm-full-mp-gpus" echo 
"================" time ( - $BASE/venv/torch/bin/tune run --nnodes=1 --rdzv-backend=c10d --rdzv-endpoint=127.0.0.1:29400 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/full_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_70B_full.yaml epochs=1 output_dir=$BASE/extra/llm-full-mp-gpus/output tokenizer.path=$BASE/data/llama3_70B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_70B checkpointer.output_dir=$BASE/data/llama3_70B/ metric_logger.log_dir=$BASE/extra/llm-full-mp-gpus/metrics repo_id="meta-llama/Meta-Llama-3.1-70B" safetensors=true batch_size=2 gradient_accumulation_steps=1 & + $BASE/venv/torch/bin/tune run --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/full_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_70B_full.yaml epochs=1 output_dir=$BASE/extra/llm-full-mp-gpus/output tokenizer.path=$BASE/data/llama3_70B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_70B checkpointer.output_dir=$BASE/data/llama3_70B/ metric_logger.log_dir=$BASE/extra/llm-full-mp-gpus/metrics repo_id="meta-llama/Meta-Llama-3.1-70B" safetensors=true batch_size=2 gradient_accumulation_steps=1 & wait ) @@ -477,7 +462,7 @@ echo "---" echo "llm-full-mp-nodes" echo "=================" time ( - $BASE/venv/torch/bin/tune run --nnodes=1 --rdzv-backend=c10d --rdzv-endpoint=127.0.0.1:29400 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/full_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_70B_full.yaml epochs=1 output_dir=$BASE/extra/llm-full-mp-nodes/output tokenizer.path=$BASE/data/llama3_70B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_70B checkpointer.output_dir=$BASE/data/llama3_70B/ metric_logger.log_dir=$BASE/extra/llm-full-mp-nodes/metrics repo_id="meta-llama/Meta-Llama-3.1-70B" safetensors=true batch_size=2 gradient_accumulation_steps=1 & + $BASE/venv/torch/bin/tune run --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/full_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_70B_full.yaml epochs=1 output_dir=$BASE/extra/llm-full-mp-nodes/output tokenizer.path=$BASE/data/llama3_70B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_70B checkpointer.output_dir=$BASE/data/llama3_70B/ metric_logger.log_dir=$BASE/extra/llm-full-mp-nodes/metrics repo_id="meta-llama/Meta-Llama-3.1-70B" safetensors=true batch_size=2 gradient_accumulation_steps=1 & wait ) @@ -571,21 +556,6 @@ time ( wait ) -echo "---" -echo "rlhf_" -echo "=====" -time ( - CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf_/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 & - CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf_/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 & - CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf_/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no 
--log_level critical --bf16 & - CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf_/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 & - CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf_/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 & - CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf_/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 & - CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf_/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 & - CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf_/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 & - wait -) - echo "---" echo "rlhf-single" echo "===========" @@ -609,3 +579,26 @@ time ( wait ) +echo "---" +echo "vjepa-single" +echo "============" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single & + wait +) + +echo "---" +echo "vjepa-gpus" +echo "==========" +time ( + $BASE/venv/torch/bin/benchrun --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 --no-python -- python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-gpus & + wait +) + diff --git a/tests/test_command_reg/test_command_reg_two_nodes.txt b/tests/test_command_reg/test_command_reg_two_nodes.txt index c84460dea..3004505de 100644 --- 
a/tests/test_command_reg/test_command_reg_two_nodes.txt +++ b/tests/test_command_reg/test_command_reg_two_nodes.txt @@ -16,7 +16,7 @@ export MILABENCH_DIR_RUNS=$BASE/runs export MILABENCH_DIR_EXTRA=$BASE/extra/llm export MILABENCH_DIR_CACHE=$BASE/cache export OMP_NUM_THREADS=0 -export MILABENCH_CONFIG='{"system": {"arch": "cuda", "sshkey": null, "nodes": [{"ip": "127.0.0.1", "main": true, "name": "0", "sshport": 22, "user": "username", "hostname": "127.0.0.1"}, {"ip": "192.168.0.11", "main": false, "name": "1", "sshport": 22, "user": "username", "hostname": "192.168.0.11"}], "self": {"ip": "127.0.0.1", "main": true, "name": "0", "sshport": 22, "user": "username", "hostname": "127.0.0.1"}}, "dirs": {"base": "$BASE", "venv": "$BASE/venv/torch", "data": "$BASE/data", "runs": "$BASE/runs", "extra": "$BASE/extra/llm", "cache": "$BASE/cache"}, "group": "llm", "install_group": "torch", "install_variant": "cuda", "run_name": "dev", "enabled": true, "capabilities": {"nodes": 1}, "max_duration": 800, "voir": {"options": {"stop": 30, "interval": "1s"}}, "validation": {"usage": {"gpu_load_threshold": 0.5, "gpu_mem_threshold": 0.5}}, "config_base": "$SRC/milabench/config", "config_file": "$SRC/milabench/config/standard.yaml", "definition": "$SRC/milabench/benchmarks/llama", "tags": ["inference", "llm", "nlp"], "plan": {"method": "per_gpu"}, "weight": 1.0, "name": "llama", "tag": ["llama"]}' +export MILABENCH_CONFIG='{"system": {"arch": "cuda", "sshkey": null, "nodes": [{"ip": "127.0.0.1", "main": true, "name": "0", "sshport": 22, "user": "username", "hostname": "127.0.0.1"}, {"ip": "192.168.0.11", "main": false, "name": "1", "sshport": 22, "user": "username", "hostname": "192.168.0.11"}], "self": {"ip": "127.0.0.1", "main": true, "name": "0", "sshport": 22, "user": "username", "hostname": "127.0.0.1"}}, "dirs": {"base": "$BASE", "venv": "$BASE/venv/torch", "data": "$BASE/data", "runs": "$BASE/runs", "extra": "$BASE/extra/llm", "cache": "$BASE/cache"}, "group": "llm", "install_group": "torch", "install_variant": "cuda", "run_name": "dev", "enabled": true, "capabilities": {"nodes": 1}, "max_duration": 800, "voir": {"options": {"stop": 30, "interval": "1s"}}, "validation": {"usage": {"gpu_load_threshold": 0.5, "gpu_mem_threshold": 0.5}}, "config_base": "$SRC/milabench/config", "config_file": "$SRC/milabench/config/standard.yaml", "definition": "$SRC/milabench/benchmarks/llama", "tags": ["inference", "llm", "monogpu", "nlp", "nobatch"], "plan": {"method": "per_gpu"}, "weight": 1.0, "name": "llama", "tag": ["llama"]}' echo "---" echo "llama" @@ -326,21 +326,6 @@ time ( wait ) -echo "---" -echo "super-slomo" -echo "===========" -time ( - CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/super-slomo/slomo/train.py --train_batch_size 64 --dataset_root $BASE/data/FakeImageNet --loader pytorch --num_workers 8 & - CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/super-slomo/slomo/train.py --train_batch_size 64 --dataset_root $BASE/data/FakeImageNet --loader pytorch --num_workers 8 & - CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/super-slomo/slomo/train.py --train_batch_size 64 --dataset_root $BASE/data/FakeImageNet --loader pytorch --num_workers 8 & - CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/super-slomo/slomo/train.py --train_batch_size 64 --dataset_root $BASE/data/FakeImageNet --loader pytorch --num_workers 8 & - CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/super-slomo/slomo/train.py --train_batch_size 64 --dataset_root $BASE/data/FakeImageNet --loader pytorch 
--num_workers 8 & - CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/super-slomo/slomo/train.py --train_batch_size 64 --dataset_root $BASE/data/FakeImageNet --loader pytorch --num_workers 8 & - CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/super-slomo/slomo/train.py --train_batch_size 64 --dataset_root $BASE/data/FakeImageNet --loader pytorch --num_workers 8 & - CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/super-slomo/slomo/train.py --train_batch_size 64 --dataset_root $BASE/data/FakeImageNet --loader pytorch --num_workers 8 & - wait -) - echo "---" echo "brax" echo "====" @@ -400,7 +385,7 @@ echo "---" echo "lightning-gpus" echo "==============" time ( - $BASE/venv/torch/bin/benchrun --nnodes=1 --rdzv-backend=c10d --rdzv-endpoint=127.0.0.1:29400 --nproc-per-node=8 --no-python -- python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & + $BASE/venv/torch/bin/benchrun --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 --no-python -- python $SRC/milabench/benchmarks/lightning/main.py --epochs 10 --num-workers 8 --loader pytorch --data $BASE/data/FakeImageNet --model resnet152 --batch-size 256 & wait ) @@ -423,7 +408,7 @@ echo "---" echo "dinov2-giant-gpus" echo "=================" time ( - $BASE/venv/torch/bin/benchrun --nnodes=1 --rdzv-backend=c10d --rdzv-endpoint=127.0.0.1:29400 --nproc-per-node=8 --no-python -- python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-gpus/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 & + $BASE/venv/torch/bin/benchrun --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 --no-python -- python $SRC/milabench/benchmarks/dinov2/main.py --output-dir $BASE/extra/dinov2-giant-gpus/output --no-resume --config-file $SRC/milabench/benchmarks/dinov2/src/dinov2/configs/train/vitg14.yaml train.dataset_path=ImageNet:split=TRAIN:root=$BASE/data/FakeImageNet:extra=$BASE/data/FakeImageNet train.batch_size_per_gpu=32 train.saveckp_freq=100 train.num_workers=10 & wait ) @@ -446,7 +431,7 @@ echo "---" echo "llm-lora-ddp-gpus" echo "=================" time ( - $BASE/venv/torch/bin/tune run --nnodes=1 --rdzv-backend=c10d --rdzv-endpoint=127.0.0.1:29400 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/lora_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-ddp-gpus/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-ddp-gpus/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 & + $BASE/venv/torch/bin/tune run --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/lora_finetune_distributed.py --config 
$SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-ddp-gpus/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-ddp-gpus/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 & wait ) @@ -454,8 +439,8 @@ echo "---" echo "llm-lora-ddp-nodes" echo "==================" time ( - $BASE/venv/torch/bin/tune run --nnodes=2 --rdzv-backend=c10d --rdzv-endpoint=127.0.0.1:29400 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/lora_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-ddp-nodes/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-ddp-nodes/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 & - ssh -oCheckHostIP=no -oStrictHostKeyChecking=no -oPasswordAuthentication=no -oPasswordAuthentication=no -p 22 username@192.168.0.11 $BASE/venv/torch/bin/tune run --nnodes=2 --rdzv-backend=c10d --rdzv-endpoint=127.0.0.1:29400 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/lora_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-ddp-nodes/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-ddp-nodes/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 & + $BASE/venv/torch/bin/tune run --nnodes=2 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --node-rank=0 --local-addr=127.0.0.1 --rdzv-conf=rank=0 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/lora_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-ddp-nodes/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-ddp-nodes/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 gradient_accumulation_steps=8 & + ssh -oCheckHostIP=no -oStrictHostKeyChecking=no -oPasswordAuthentication=no -oPasswordAuthentication=no -p 22 username@192.168.0.11 $BASE/venv/torch/bin/tune run --nnodes=2 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --node-rank=1 --local-addr=192.168.0.11 --rdzv-conf=rank=1 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/lora_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_8B_lora_single_device.yaml epochs=1 output_dir=$BASE/extra/llm-lora-ddp-nodes/output tokenizer.path=$BASE/data/llama3_8B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_8B/original checkpointer.output_dir=$BASE/data/llama3_8B/ metric_logger.log_dir=$BASE/extra/llm-lora-ddp-nodes/metrics repo_id="meta-llama/Meta-Llama-3.1-8B" batch_size=8 
gradient_accumulation_steps=8 & wait ) @@ -463,7 +448,7 @@ echo "---" echo "llm-lora-mp-gpus" echo "================" time ( - $BASE/venv/torch/bin/tune run --nnodes=1 --rdzv-backend=c10d --rdzv-endpoint=127.0.0.1:29400 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/lora_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_70B_lora.yaml epochs=1 output_dir=$BASE/extra/llm-lora-mp-gpus/output tokenizer.path=$BASE/data/llama3_70B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_70B checkpointer.output_dir=$BASE/data/llama3_70B/ safetensors=true metric_logger.log_dir=$BASE/extra/llm-lora-mp-gpus/metrics repo_id="meta-llama/Meta-Llama-3.1-70B" batch_size=8 gradient_accumulation_steps=1 & + $BASE/venv/torch/bin/tune run --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/lora_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_70B_lora.yaml epochs=1 output_dir=$BASE/extra/llm-lora-mp-gpus/output tokenizer.path=$BASE/data/llama3_70B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_70B checkpointer.output_dir=$BASE/data/llama3_70B/ safetensors=true metric_logger.log_dir=$BASE/extra/llm-lora-mp-gpus/metrics repo_id="meta-llama/Meta-Llama-3.1-70B" batch_size=8 gradient_accumulation_steps=1 & wait ) @@ -471,7 +456,7 @@ echo "---" echo "llm-full-mp-gpus" echo "================" time ( - $BASE/venv/torch/bin/tune run --nnodes=1 --rdzv-backend=c10d --rdzv-endpoint=127.0.0.1:29400 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/full_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_70B_full.yaml epochs=1 output_dir=$BASE/extra/llm-full-mp-gpus/output tokenizer.path=$BASE/data/llama3_70B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_70B checkpointer.output_dir=$BASE/data/llama3_70B/ metric_logger.log_dir=$BASE/extra/llm-full-mp-gpus/metrics repo_id="meta-llama/Meta-Llama-3.1-70B" safetensors=true batch_size=2 gradient_accumulation_steps=1 & + $BASE/venv/torch/bin/tune run --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/full_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_70B_full.yaml epochs=1 output_dir=$BASE/extra/llm-full-mp-gpus/output tokenizer.path=$BASE/data/llama3_70B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_70B checkpointer.output_dir=$BASE/data/llama3_70B/ metric_logger.log_dir=$BASE/extra/llm-full-mp-gpus/metrics repo_id="meta-llama/Meta-Llama-3.1-70B" safetensors=true batch_size=2 gradient_accumulation_steps=1 & wait ) @@ -479,8 +464,8 @@ echo "---" echo "llm-full-mp-nodes" echo "=================" time ( - $BASE/venv/torch/bin/tune run --nnodes=2 --rdzv-backend=c10d --rdzv-endpoint=127.0.0.1:29400 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/full_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_70B_full.yaml epochs=1 output_dir=$BASE/extra/llm-full-mp-nodes/output tokenizer.path=$BASE/data/llama3_70B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_70B checkpointer.output_dir=$BASE/data/llama3_70B/ metric_logger.log_dir=$BASE/extra/llm-full-mp-nodes/metrics repo_id="meta-llama/Meta-Llama-3.1-70B" safetensors=true 
batch_size=2 gradient_accumulation_steps=1 & - ssh -oCheckHostIP=no -oStrictHostKeyChecking=no -oPasswordAuthentication=no -oPasswordAuthentication=no -p 22 username@192.168.0.11 $BASE/venv/torch/bin/tune run --nnodes=2 --rdzv-backend=c10d --rdzv-endpoint=127.0.0.1:29400 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/full_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_70B_full.yaml epochs=1 output_dir=$BASE/extra/llm-full-mp-nodes/output tokenizer.path=$BASE/data/llama3_70B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_70B checkpointer.output_dir=$BASE/data/llama3_70B/ metric_logger.log_dir=$BASE/extra/llm-full-mp-nodes/metrics repo_id="meta-llama/Meta-Llama-3.1-70B" safetensors=true batch_size=2 gradient_accumulation_steps=1 & + $BASE/venv/torch/bin/tune run --nnodes=2 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --node-rank=0 --local-addr=127.0.0.1 --rdzv-conf=rank=0 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/full_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_70B_full.yaml epochs=1 output_dir=$BASE/extra/llm-full-mp-nodes/output tokenizer.path=$BASE/data/llama3_70B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_70B checkpointer.output_dir=$BASE/data/llama3_70B/ metric_logger.log_dir=$BASE/extra/llm-full-mp-nodes/metrics repo_id="meta-llama/Meta-Llama-3.1-70B" safetensors=true batch_size=2 gradient_accumulation_steps=1 & + ssh -oCheckHostIP=no -oStrictHostKeyChecking=no -oPasswordAuthentication=no -oPasswordAuthentication=no -p 22 username@192.168.0.11 $BASE/venv/torch/bin/tune run --nnodes=2 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --node-rank=1 --local-addr=192.168.0.11 --rdzv-conf=rank=1 --nproc-per-node=8 -- $SRC/milabench/benchmarks/llm/recipes/full_finetune_distributed.py --config $SRC/milabench/benchmarks/llm/configs/llama3_70B_full.yaml epochs=1 output_dir=$BASE/extra/llm-full-mp-nodes/output tokenizer.path=$BASE/data/llama3_70B/original/tokenizer.model checkpointer.checkpoint_dir=$BASE/data/llama3_70B checkpointer.output_dir=$BASE/data/llama3_70B/ metric_logger.log_dir=$BASE/extra/llm-full-mp-nodes/metrics repo_id="meta-llama/Meta-Llama-3.1-70B" safetensors=true batch_size=2 gradient_accumulation_steps=1 & wait ) @@ -574,21 +559,6 @@ time ( wait ) -echo "---" -echo "rlhf_" -echo "=====" -time ( - CUDA_VISIBLE_DEVICES=0 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf_/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 & - CUDA_VISIBLE_DEVICES=1 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf_/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 & - CUDA_VISIBLE_DEVICES=2 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf_/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 & - CUDA_VISIBLE_DEVICES=3 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf_/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 & - CUDA_VISIBLE_DEVICES=4 $SRC/milabench/benchmarks/rlhf/main.py 
--output_dir $BASE/extra/rlhf_/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 & - CUDA_VISIBLE_DEVICES=5 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf_/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 & - CUDA_VISIBLE_DEVICES=6 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf_/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 & - CUDA_VISIBLE_DEVICES=7 $SRC/milabench/benchmarks/rlhf/main.py --output_dir $BASE/extra/rlhf_/output --model_name_or_path EleutherAI/pythia-1b-deduped --per_device_train_batch_size 64 --logging_strategy no --log_level critical --bf16 & - wait -) - echo "---" echo "rlhf-single" echo "===========" @@ -612,3 +582,26 @@ time ( wait ) +echo "---" +echo "vjepa-single" +echo "============" +time ( + CUDA_VISIBLE_DEVICES=0 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single & + CUDA_VISIBLE_DEVICES=1 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single & + CUDA_VISIBLE_DEVICES=2 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single & + CUDA_VISIBLE_DEVICES=3 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single & + CUDA_VISIBLE_DEVICES=4 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single & + CUDA_VISIBLE_DEVICES=5 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single & + CUDA_VISIBLE_DEVICES=6 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single & + CUDA_VISIBLE_DEVICES=7 python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-single & + wait +) + +echo "---" +echo "vjepa-gpus" +echo "==========" +time ( + $BASE/venv/torch/bin/benchrun --nnodes=1 --rdzv-backend=static --rdzv-endpoint=127.0.0.1:29400 --master-addr=127.0.0.1 --master-port=29400 --local-ranks-filter=0 --nproc-per-node=8 --no-python -- python $SRC/milabench/benchmarks/vjepa/main.py --batch_size 24 --num_workers 12 --dataset $BASE/data/FakeVideo/video_metainfo.csv --output $BASE/extra/vjepa-gpus & + wait +) + diff --git a/tests/test_mock.py b/tests/test_mock.py index 8e3944568..2e41a8388 100644 --- a/tests/test_mock.py +++ b/tests/test_mock.py @@ -14,7 +14,12 @@ "llm-lora-ddp-nodes", "llm-lora-mp-gpus", "llm-full-mp-gpus", - "llm-full-mp-nodes" + "llm-full-mp-nodes", +} + + +OVERSIZED_INSTALL_BENCHMARKS = { + } def run_cli(*args, expected_code=0, msg=None): @@ -88,6 +93,9 @@ def test_milabench(monkeypatch, bench, module_tmp_dir, standard_config): monkeypatch.setenv("MILABENCH_GPU_ARCH", "cuda") + if bench in OVERSIZED_INSTALL_BENCHMARKS: + 
return + with filecount_inc(module_tmp_dir, "install"): run_cli("install", *args, "--select", bench) @@ -111,6 +119,12 @@ def test_milabench(monkeypatch, bench, module_tmp_dir, standard_config): run_cli("run", *args, "--no-report", "--select", bench, "--run-name", str(bench)) + import shutil + import tempfile + shutil.rmtree(tempfile.gettempdir(), ignore_errors=True) + # shutil.rmtree(module_tmp_dir) + + ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")) def cleanpath(out, tmppath): import subprocess diff --git a/tests/test_summary/test_report.txt b/tests/test_summary/test_report.txt index fe0105469..b9f6ce02a 100644 --- a/tests/test_summary/test_report.txt +++ b/tests/test_summary/test_report.txt @@ -5,8 +5,8 @@ Benchmark results Breakdown --------- -bench | fail | n | ngpu | perf | sem% | std% | peak_memory | score | weight -benchio | 0 | 4 | 0 | 7979.82 | 2.9% | 17.2% | nan | 7979.82 | 2.00 +bench | fail | n | ngpu | perf | sem% | std% | peak_memory | score | weight +benchio | 0 | 4 | 0 | 7979.82 | 2.9% | 17.2% | nan | 7979.82 | 2.00 Scores ------ diff --git a/tests/test_summary/test_report_folder_does_average.txt b/tests/test_summary/test_report_folder_does_average.txt index 50a4accd0..9fda7a9c2 100644 --- a/tests/test_summary/test_report_folder_does_average.txt +++ b/tests/test_summary/test_report_folder_does_average.txt @@ -5,8 +5,8 @@ Benchmark results Breakdown --------- -bench | fail | n | ngpu | perf | sem% | std% | peak_memory | score | weight -benchio | 0 | 6 | 0 | 7878.45 | 2.5% | 18.0% | 24456 | 7878.45 | 2.00 +bench | fail | n | ngpu | perf | sem% | std% | peak_memory | score | weight +benchio | 0 | 6 | 0 | 7878.45 | 2.5% | 18.0% | 24456 | 7878.45 | 2.00 Scores ------
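Why the milabench/system.py hunk above swaps bare dataclass defaults for field(default_factory=...): on Python 3.11+ a non-frozen dataclass instance is unhashable, so using one as a field default raises ValueError at class-definition time, and on older versions that accepted it the single default object was silently shared by every instance of the outer class. A minimal runnable sketch with toy classes (not milabench's real fields):

from dataclasses import dataclass, field

@dataclass
class Sizer:
    # per-instance list; a bare `multiple: list = []` default would be rejected
    multiple: list = field(default_factory=list)

# The pattern the patch removes (raises ValueError on Python 3.11+; on
# versions that accept it, one Sizer object is shared by every Options):
#     sizer: Sizer = Sizer()

@dataclass
class Options:
    sizer: Sizer = field(default_factory=Sizer)  # fresh Sizer per Options()

a, b = Options(), Options()
a.sizer.multiple.append(8)
assert b.sizer.multiple == []  # no state shared between instances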