diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index f916d931b5f..fab05877386 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -3144,6 +3144,31 @@ def _exec_code_on_head( code = job_lib.JobLibCodeGen.queue_job(job_id, job_submit_cmd) job_submit_cmd = ' && '.join([mkdir_code, create_script_code, code]) + if len(job_submit_cmd) > 120 * 1024: + # The maximum size of a command line argument is 128 KB. We use + # 120KB to be safe for other arguments. + # https://github.com/torvalds/linux/blob/master/include/uapi/linux/binfmts.h + # If the command is too long, write it to a file, rsync and execute + # it. + ssh_credentials = backend_utils.ssh_credential_from_yaml( + handle.cluster_yaml, handle.docker_user, handle.ssh_user) + head_ssh_port = handle.head_ssh_port + runner = command_runner.SSHCommandRunner(handle.head_ip, + port=head_ssh_port, + **ssh_credentials) + with tempfile.NamedTemporaryFile('w', prefix='sky_app_') as fp: + fp.write(codegen) + fp.flush() + script_path = os.path.join(SKY_REMOTE_APP_DIR, f'sky_job_{job_id}') + # We choose to sync code + exec, because the alternative of 'ray + # submit' may not work as it may use system python (python2) to + # execute the script. Happens for AWS. + runner.rsync(source=fp.name, + target=script_path, + up=True, + stream_logs=False) + job_submit_cmd = f'{mkdir_code} && {code}' + if managed_job_dag is not None: # Add the managed job to job queue database.