-
Notifications
You must be signed in to change notification settings - Fork 16
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[fine_tuning] toolbox: fine_tuning_ray_fine_tuning_job: new toolbox c…
…ommand
- Loading branch information
Showing
32 changed files
with
3,245 additions
and
0 deletions.
There are no files selected for viewing
147 changes: 147 additions & 0 deletions
147
docs/toolbox.generated/Fine_Tuning.ray_fine_tuning_job.rst
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,147 @@ | ||
:orphan: | ||
|
||
.. | ||
_Auto-generated file, do not edit manually ... | ||
_Toolbox generate command: repo generate_toolbox_rst_documentation | ||
_ Source component: Fine_Tuning.ray_fine_tuning_job | ||
|
||
fine_tuning ray_fine_tuning_job | ||
=============================== | ||
|
||
Run a simple Ray fine-tuning Job. | ||
|
||
|
||
|
||
|
||
Parameters | ||
---------- | ||
|
||
|
||
``name`` | ||
|
||
* The name of the fine-tuning job to create | ||
|
||
|
||
``namespace`` | ||
|
||
* The name of the namespace where the scheduler load will be generated | ||
|
||
|
||
``pvc_name`` | ||
|
||
* The name of the PVC where the model and dataset are stored | ||
|
||
|
||
``model_name`` | ||
|
||
* The name of the model to use inside the /model directory of the PVC | ||
|
||
|
||
``ft_scripts_dir`` | ||
|
||
* Directory where the fine-tuning scripts are stored | ||
|
||
|
||
``dataset_name`` | ||
|
||
* The name of the dataset to use inside the /dataset directory of the PVC | ||
|
||
|
||
``dataset_replication`` | ||
|
||
* Number of replications of the dataset to use, to artificially extend or reduce the fine-tuning effort | ||
|
||
* default value: ``1`` | ||
|
||
|
||
``dataset_transform`` | ||
|
||
* Name of the transformation to apply to the dataset | ||
|
||
|
||
``dataset_prefer_cache`` | ||
|
||
* If True, and the dataset has to be transformed/duplicated, save and/or load it from the PVC | ||
|
||
* default value: ``True`` | ||
|
||
|
||
``dataset_prepare_cache_only`` | ||
|
||
* If True, only prepare the dataset cache file and do not run the fine-tuning. | ||
|
||
|
||
``dataset_response_template`` | ||
|
||
* The delimiter marking the beginning of the response in the dataset samples | ||
|
||
* default value: ``\n### Label:`` | ||
|
||
|
||
``container_image`` | ||
|
||
* The image to use for the fine-tuning container | ||
|
||
* default value: ``quay.io/rhoai/ray:2.35.0-py39-cu121-torch24-fa26`` | ||
|
||
|
||
``ray_version`` | ||
|
||
* The version identifier passed to the RayCluster object | ||
|
||
* default value: ``2.35.0`` | ||
|
||
|
||
``gpu`` | ||
|
||
* The number of GPUs to request for the fine-tuning job | ||
|
||
* default value: ``1`` | ||
|
||
|
||
``memory`` | ||
|
||
* The number of RAM gigs to request for the fine-tuning job (in Gigs) | ||
|
||
* default value: ``10`` | ||
|
||
|
||
``cpu`` | ||
|
||
* The number of CPU cores to request for the fine-tuning job (in cores) | ||
|
||
* default value: ``1`` | ||
|
||
|
||
``request_equals_limits`` | ||
|
||
* If True, sets the 'limits' of the job with the same value as the request. | ||
|
||
|
||
``prepare_only`` | ||
|
||
* If True, only prepare the environment but do not run the fine-tuning job. | ||
|
||
|
||
``delete_other`` | ||
|
||
* If True, delete the other PyTorchJobs before running | ||
|
||
|
||
``worker_replicas`` | ||
|
||
* Number of worker replicas to deploy | ||
|
||
* default value: ``2`` | ||
|
||
|
||
``hyper_parameters`` | ||
|
||
* Dictionary of hyper-parameters to pass to sft-trainer | ||
|
||
|
||
``sleep_forever`` | ||
|
||
* If true, sleeps forever instead of running the fine-tuning command. | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
78 changes: 78 additions & 0 deletions
78
projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/defaults/main/config.yml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
# Auto-generated file, do not edit manually ... | ||
# Toolbox generate command: repo generate_ansible_default_settings | ||
# Source component: Fine_Tuning.ray_fine_tuning_job | ||
|
||
# Parameters | ||
# the name of the fine-tuning job to create | ||
# Mandatory value | ||
fine_tuning_ray_fine_tuning_job_name: | ||
|
||
# the name of the namespace where the scheduler load will be generated | ||
# Mandatory value | ||
fine_tuning_ray_fine_tuning_job_namespace: | ||
|
||
# the name of the PVC where the model and dataset are stored | ||
# Mandatory value | ||
fine_tuning_ray_fine_tuning_job_pvc_name: | ||
|
||
# the name of the model to use inside the /model directory of the PVC | ||
# Mandatory value | ||
fine_tuning_ray_fine_tuning_job_model_name: | ||
|
||
# directory where the fine-tuning scripts are stored | ||
# Mandatory value | ||
fine_tuning_ray_fine_tuning_job_ft_scripts_dir: | ||
|
||
# the name of the dataset to use inside the /dataset directory of the PVC | ||
# Mandatory value | ||
fine_tuning_ray_fine_tuning_job_dataset_name: | ||
|
||
# number of replications of the dataset to use, to artificially extend or reduce the fine-tuning effort | ||
fine_tuning_ray_fine_tuning_job_dataset_replication: 1 | ||
|
||
# name of the transformation to apply to the dataset | ||
fine_tuning_ray_fine_tuning_job_dataset_transform: null | ||
|
||
# if True, and the dataset has to be transformed/duplicated, save and/or load it from the PVC | ||
fine_tuning_ray_fine_tuning_job_dataset_prefer_cache: true | ||
|
||
# if True, only prepare the dataset cache file and do not run the fine-tuning. | ||
fine_tuning_ray_fine_tuning_job_dataset_prepare_cache_only: false | ||
|
||
# the delimiter marking the beginning of the response in the dataset samples | ||
fine_tuning_ray_fine_tuning_job_dataset_response_template: ' | ||
### Label:' | ||
|
||
# the image to use for the fine-tuning container | ||
fine_tuning_ray_fine_tuning_job_container_image: quay.io/rhoai/ray:2.35.0-py39-cu121-torch24-fa26 | ||
|
||
# the version identifier passed to the RayCluster object | ||
fine_tuning_ray_fine_tuning_job_ray_version: 2.35.0 | ||
|
||
# the number of GPUs to request for the fine-tuning job | ||
fine_tuning_ray_fine_tuning_job_gpu: 1 | ||
|
||
# the number of RAM gigs to request for the fine-tuning job (in Gigs) | ||
fine_tuning_ray_fine_tuning_job_memory: 10 | ||
|
||
# the number of CPU cores to request for the fine-tuning job (in cores) | ||
fine_tuning_ray_fine_tuning_job_cpu: 1 | ||
|
||
# if True, sets the 'limits' of the job with the same value as the request. | ||
fine_tuning_ray_fine_tuning_job_request_equals_limits: false | ||
|
||
# if True, only prepare the environment but do not run the fine-tuning job. | ||
fine_tuning_ray_fine_tuning_job_prepare_only: false | ||
|
||
# if True, delete the other PyTorchJobs before running | ||
fine_tuning_ray_fine_tuning_job_delete_other: false | ||
|
||
# number of worker replicas to deploy | ||
fine_tuning_ray_fine_tuning_job_worker_replicas: 2 | ||
|
||
# dictionary of hyper-parameters to pass to sft-trainer | ||
fine_tuning_ray_fine_tuning_job_hyper_parameters: {} | ||
|
||
# if true, sleeps forever instead of running the fine-tuning command. | ||
fine_tuning_ray_fine_tuning_job_sleep_forever: false |
36 changes: 36 additions & 0 deletions
36
...ts/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/files/entrypoint/convert_alpaca.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
import sys
import pathlib

import datasets

# Prompt templates wrapping each Alpaca sample before fine-tuning.
# "prompt_input" is used when the sample carries a non-empty "input"
# field, "prompt_no_input" otherwise.  The trailing "### Label:" marker
# matches the default dataset_response_template delimiter used by the
# fine-tuning job, so the trainer can locate the expected response.
PROMPT_DICT = {
    "prompt_input": (
        "Below is an instruction that describes a task, paired with an input that provides further context. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Label:"
    ),
    "prompt_no_input": (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n### Label:"
    ),
}

# Command-line arguments: source Alpaca-format JSON file and destination
# path for the converted dataset.  No validation — raises IndexError if
# either argument is missing.
src = pathlib.Path(sys.argv[1])
dest = pathlib.Path(sys.argv[2])
|
||
def format_alpaca_fn(example):
    """Render one Alpaca sample into a single SFTTrainer "output" string.

    Selects the with-input or no-input template from PROMPT_DICT depending
    on whether the sample has a non-empty "input" field, fills it from the
    sample's fields, and appends the sample's "output" (the expected
    answer) after the "### Label:" marker.
    """
    has_input = example.get("input", "") != ""
    template = PROMPT_DICT["prompt_input"] if has_input else PROMPT_DICT["prompt_no_input"]
    prompt = template.format_map(example)
    return {"output": f"{prompt} {example['output']}"}
|
||
|
||
# Convert the Alpaca-format JSON file at `src` into the single-column
# format expected by SFTTrainer and write it back out as JSON at `dest`.
print(f"Converting {src} from Alpaca format to SFTTrainer ...")
ds = datasets.load_dataset('json', data_files=str(src))

# load_dataset places the whole file under the "train" split.  The
# "instruction" and "input" columns are folded into the generated prompt
# by format_alpaca_fn, so they are dropped; only "output" remains.
alpaca_ds = ds['train'].map(format_alpaca_fn, remove_columns=['instruction', 'input'])

print(f"Saving into {dest} ...")

alpaca_ds.to_json(dest)
Oops, something went wrong.