-
Notifications
You must be signed in to change notification settings - Fork 16
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[fine_tuning] toolbox: fine_tuning_ray_fine_tuning_job: new toolbox c…
…ommand
- Loading branch information
Showing
32 changed files
with
3,245 additions
and
0 deletions.
There are no files selected for viewing
147 changes: 147 additions & 0 deletions
147
docs/toolbox.generated/Fine_Tuning.ray_fine_tuning_job.rst
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,147 @@ | ||
:orphan: | ||
|
||
.. | ||
_Auto-generated file, do not edit manually ... | ||
_Toolbox generate command: repo generate_toolbox_rst_documentation | ||
_ Source component: Fine_Tuning.ray_fine_tuning_job | ||
|
||
fine_tuning ray_fine_tuning_job | ||
=============================== | ||
|
||
Run a simple Ray fine-tuning Job. | ||
|
||
|
||
|
||
|
||
Parameters | ||
---------- | ||
|
||
|
||
``name`` | ||
|
||
* The name of the fine-tuning job to create | ||
|
||
|
||
``namespace`` | ||
|
||
* The name of the namespace where the scheduler load will be generated | ||
|
||
|
||
``pvc_name`` | ||
|
||
* The name of the PVC where the model and dataset are stored | ||
|
||
|
||
``model_name`` | ||
|
||
* The name of the model to use inside the /model directory of the PVC | ||
|
||
|
||
``ft_scripts_dir`` | ||
|
||
* Directory where the fine-tuning scripts are stored | ||
|
||
|
||
``dataset_name`` | ||
|
||
* The name of the dataset to use inside the /dataset directory of the PVC | ||
|
||
|
||
``dataset_replication`` | ||
|
||
* Number of replications of the dataset to use, to artificially extend or reduce the fine-tuning effort | ||
|
||
* default value: ``1`` | ||
|
||
|
||
``dataset_transform`` | ||
|
||
* Name of the transformation to apply to the dataset | ||
|
||
|
||
``dataset_prefer_cache`` | ||
|
||
* If True, and the dataset has to be transformed/duplicated, save and/or load it from the PVC | ||
|
||
* default value: ``True`` | ||
|
||
|
||
``dataset_prepare_cache_only`` | ||
|
||
* If True, only prepare the dataset cache file and do not run the fine-tuning. | ||
|
||
|
||
``dataset_response_template`` | ||
|
||
* The delimiter marking the beginning of the response in the dataset samples | ||
|
||
* default value: ``\n### Label:`` | ||
|
||
|
||
``container_image`` | ||
|
||
* The image to use for the fine-tuning container | ||
|
||
* default value: ``quay.io/rhoai/ray:2.35.0-py39-cu121-torch24-fa26`` | ||
|
||
|
||
``ray_version`` | ||
|
||
* The version identifier passed to the RayCluster object | ||
|
||
* default value: ``2.35.0`` | ||
|
||
|
||
``gpu`` | ||
|
||
* The number of GPUs to request for the fine-tuning job | ||
|
||
* default value: ``1`` | ||
|
||
|
||
``memory`` | ||
|
||
* The number of RAM gigs to request for the fine-tuning job (in Gigs) | ||
|
||
* default value: ``10`` | ||
|
||
|
||
``cpu`` | ||
|
||
* The number of CPU cores to request for the fine-tuning job (in cores) | ||
|
||
* default value: ``1`` | ||
|
||
|
||
``request_equals_limits`` | ||
|
||
* If True, sets the 'limits' of the job with the same value as the request. | ||
|
||
|
||
``prepare_only`` | ||
|
||
* If True, only prepare the environment but do not run the fine-tuning job. | ||
|
||
|
||
``delete_other`` | ||
|
||
* If True, delete the other PyTorchJobs before running | ||
|
||
|
||
``worker_replicas`` | ||
|
||
* Number of worker replicas to deploy | ||
|
||
* default value: ``2`` | ||
|
||
|
||
``hyper_parameters`` | ||
|
||
* Dictionary of hyper-parameters to pass to sft-trainer | ||
|
||
|
||
``sleep_forever`` | ||
|
||
* If true, sleeps forever instead of running the fine-tuning command. | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
78 changes: 78 additions & 0 deletions
78
projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/defaults/main/config.yml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
# Auto-generated file, do not edit manually ... | ||
# Toolbox generate command: repo generate_ansible_default_settings | ||
# Source component: Fine_Tuning.ray_fine_tuning_job | ||
|
||
# Parameters | ||
# the name of the fine-tuning job to create | ||
# Mandatory value | ||
fine_tuning_ray_fine_tuning_job_name: | ||
|
||
# the name of the namespace where the scheduler load will be generated | ||
# Mandatory value | ||
fine_tuning_ray_fine_tuning_job_namespace: | ||
|
||
# the name of the PVC where the model and dataset are stored | ||
# Mandatory value | ||
fine_tuning_ray_fine_tuning_job_pvc_name: | ||
|
||
# the name of the model to use inside the /model directory of the PVC | ||
# Mandatory value | ||
fine_tuning_ray_fine_tuning_job_model_name: | ||
|
||
# directory where the fine-tuning scripts are stored | ||
# Mandatory value | ||
fine_tuning_ray_fine_tuning_job_ft_scripts_dir: | ||
|
||
# the name of the dataset to use inside the /dataset directory of the PVC | ||
# Mandatory value | ||
fine_tuning_ray_fine_tuning_job_dataset_name: | ||
|
||
# number of replications of the dataset to use, to artificially extend or reduce the fine-tuning effort | ||
fine_tuning_ray_fine_tuning_job_dataset_replication: 1 | ||
|
||
# name of the transformation to apply to the dataset | ||
fine_tuning_ray_fine_tuning_job_dataset_transform: null | ||
|
||
# if True, and the dataset has to be transformed/duplicated, save and/or load it from the PVC | ||
fine_tuning_ray_fine_tuning_job_dataset_prefer_cache: true | ||
|
||
# if True, only prepare the dataset cache file and do not run the fine-tuning. | ||
fine_tuning_ray_fine_tuning_job_dataset_prepare_cache_only: false | ||
|
||
# the delimiter marking the beginning of the response in the dataset samples | ||
fine_tuning_ray_fine_tuning_job_dataset_response_template: ' | ||
### Label:' | ||
|
||
# the image to use for the fine-tuning container | ||
fine_tuning_ray_fine_tuning_job_container_image: quay.io/rhoai/ray:2.35.0-py39-cu121-torch24-fa26 | ||
|
||
# the version identifier passed to the RayCluster object | ||
fine_tuning_ray_fine_tuning_job_ray_version: 2.35.0 | ||
|
||
# the number of GPUs to request for the fine-tuning job | ||
fine_tuning_ray_fine_tuning_job_gpu: 1 | ||
|
||
# the number of RAM gigs to request for the fine-tuning job (in Gigs) | ||
fine_tuning_ray_fine_tuning_job_memory: 10 | ||
|
||
# the number of CPU cores to request for the fine-tuning job (in cores) | ||
fine_tuning_ray_fine_tuning_job_cpu: 1 | ||
|
||
# if True, sets the 'limits' of the job with the same value as the request. | ||
fine_tuning_ray_fine_tuning_job_request_equals_limits: false | ||
|
||
# if True, only prepare the environment but do not run the fine-tuning job. | ||
fine_tuning_ray_fine_tuning_job_prepare_only: false | ||
|
||
# if True, delete the other PyTorchJobs before running | ||
fine_tuning_ray_fine_tuning_job_delete_other: false | ||
|
||
# number of worker replicas to deploy | ||
fine_tuning_ray_fine_tuning_job_worker_replicas: 2 | ||
|
||
# dictionary of hyper-parameters to pass to sft-trainer | ||
fine_tuning_ray_fine_tuning_job_hyper_parameters: {} | ||
|
||
# if true, sleeps forever instead of running the fine-tuning command. | ||
fine_tuning_ray_fine_tuning_job_sleep_forever: false |
36 changes: 36 additions & 0 deletions
36
...ts/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/files/entrypoint/convert_alpaca.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
import sys
import pathlib

import datasets

# Prompt templates wrapping each Alpaca sample before fine-tuning.
# "prompt_input" is used when the sample carries a non-empty "input"
# field, "prompt_no_input" otherwise.  The trailing "### Label:" marker
# matches the default dataset_response_template delimiter used by the
# fine-tuning job, so the trainer can locate the expected response.
PROMPT_DICT = {
    "prompt_input": (
        "Below is an instruction that describes a task, paired with an input that provides further context. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Label:"
    ),
    "prompt_no_input": (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n### Label:"
    ),
}

# Command-line arguments: source Alpaca-format JSON file and destination
# path for the converted dataset.  No validation — raises IndexError if
# either argument is missing.
src = pathlib.Path(sys.argv[1])
dest = pathlib.Path(sys.argv[2])
|
||
def format_alpaca_fn(example):
    """Render one Alpaca sample into a single SFTTrainer "output" string.

    Selects the with-input or no-input template from PROMPT_DICT depending
    on whether the sample has a non-empty "input" field, fills it from the
    sample's fields, and appends the sample's "output" (the expected
    answer) after the "### Label:" marker.
    """
    has_input = example.get("input", "") != ""
    template = PROMPT_DICT["prompt_input"] if has_input else PROMPT_DICT["prompt_no_input"]
    prompt = template.format_map(example)
    return {"output": f"{prompt} {example['output']}"}
|
||
|
||
# Convert the Alpaca-format JSON file at `src` into the single-column
# format expected by SFTTrainer and write it back out as JSON at `dest`.
print(f"Converting {src} from Alpaca format to SFTTrainer ...")
ds = datasets.load_dataset('json', data_files=str(src))

# load_dataset places the whole file under the "train" split.  The
# "instruction" and "input" columns are folded into the generated prompt
# by format_alpaca_fn, so they are dropped; only "output" remains.
alpaca_ds = ds['train'].map(format_alpaca_fn, remove_columns=['instruction', 'input'])

print(f"Saving into {dest} ...")

alpaca_ds.to_json(dest)
Oops, something went wrong.