Skip to content

Commit

Permalink
Allow a very long user program
Browse files Browse the repository at this point in the history
  • Loading branch information
Michaelvll committed May 6, 2024
1 parent 2ff95bb commit 6afd204
Showing 1 changed file with 25 additions and 0 deletions.
25 changes: 25 additions & 0 deletions sky/backends/cloud_vm_ray_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -3144,6 +3144,31 @@ def _exec_code_on_head(

code = job_lib.JobLibCodeGen.queue_job(job_id, job_submit_cmd)
job_submit_cmd = ' && '.join([mkdir_code, create_script_code, code])
if len(job_submit_cmd) > 120 * 1024:
# Linux limits a single command-line argument to 128 KB
# (MAX_ARG_STRLEN). We use 120 KB as the threshold to leave headroom
# for the other arguments.
# https://github.com/torvalds/linux/blob/master/include/uapi/linux/binfmts.h
# If the command is too long, write the generated code to a local
# file, rsync it to the head node, and execute it from there.
ssh_credentials = backend_utils.ssh_credential_from_yaml(
handle.cluster_yaml, handle.docker_user, handle.ssh_user)
head_ssh_port = handle.head_ssh_port
runner = command_runner.SSHCommandRunner(handle.head_ip,
port=head_ssh_port,
**ssh_credentials)
with tempfile.NamedTemporaryFile('w', prefix='sky_app_') as fp:
fp.write(codegen)
fp.flush()
script_path = os.path.join(SKY_REMOTE_APP_DIR, f'sky_job_{job_id}')
# We sync the code and then exec it, because the alternative,
# 'ray submit', may not work: it may use the system python (python2)
# to execute the script. This has been observed on AWS.
runner.rsync(source=fp.name,
target=script_path,
up=True,
stream_logs=False)
job_submit_cmd = f'{mkdir_code} && {code}'


if managed_job_dag is not None:
# Add the managed job to job queue database.
Expand Down

0 comments on commit 6afd204

Please sign in to comment.