htcondorcern: fix Singularity and Kerberos tokens
Fixes the CERN HTCondor compute backend for the Singularity unpacked-images
execution mode, where jobs could not access restricted EOS directories
because the Kerberos credentials were not accessible inside the container.
tiborsimko committed May 4, 2022
1 parent 168006d commit dce0804
Showing 3 changed files with 22 additions and 11 deletions.
2 changes: 1 addition & 1 deletion Dockerfile
@@ -1,5 +1,5 @@
# This file is part of REANA.
# Copyright (C) 2017, 2018, 2019, 2020, 2021 CERN.
# Copyright (C) 2017, 2018, 2019, 2020, 2021, 2022 CERN.
#
# REANA is free software; you can redistribute it and/or modify it
# under the terms of the MIT License; see LICENSE file for more details.
29 changes: 20 additions & 9 deletions reana_job_controller/htcondorcern_job_manager.py
@@ -146,6 +146,7 @@ def execute(self):
job_ad["MaxRunTime"] = 3600
if self.htcondor_accounting_group:
job_ad["AccountingGroup"] = self.htcondor_accounting_group
job_ad["MY.SendCredential"] = True
future = current_app.htcondor_executor.submit(self._submit, job_ad)
clusterid = future.result()
return clusterid
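
For readers less familiar with HTCondor credential handling, here is a minimal standalone sketch, assuming the HTCondor Python bindings with credd support, of the submit-side flow this hunk enables: register the Kerberos credential with the credd and flag the job ad so the schedd ships that credential to the execute node with the job. The job-ad keys other than MY.SendCredential are placeholders, not REANA's actual ad.

import classad
import htcondor

# 1. Register the submitter's Kerberos credential with the local credd
#    (mirrors the new _get_schedd() below; None lets HTCondor pick up the
#    credential provided by the configured credmon).
credd = htcondor.Credd()
credd.add_user_cred(htcondor.CredTypes.Kerberos, None)

# 2. Flag the job ad so the schedd sends the stored credential along with
#    the job. A real ad needs many more attributes; these are placeholders.
job_ad = classad.ClassAd()
job_ad["Cmd"] = "/usr/bin/id"
job_ad["Out"] = "job.out"
job_ad["Err"] = "job.err"
job_ad["MY.SendCredential"] = True

# 3. Submit and spool the input sandbox, as _submit() does below.
ads = []
schedd = htcondor.Schedd()
clusterid = schedd.submit(job_ad, 1, True, ads)
schedd.spool(ads)
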
@@ -229,12 +230,15 @@ def _copy_wrapper_file(self):
)
else:
template = (
"#!/bin/bash \n"
"#!/bin/bash\n"
'SINGULARITY_KRB5CCNAME="FILE:/srv/$(basename $KRB5CCNAME)"\n'
"singularity exec "
"--contain "
"--ipc "
"--pid "
"--home $PWD:/srv "
"--bind $PWD:/srv "
"--bind /cvmfs "
"--bind /eos "
"--env KRB5CCNAME=$SINGULARITY_KRB5CCNAME "
"{DOCKER_IMG} {CMD}".format(
DOCKER_IMG=self.docker_img,
CMD=self._format_arguments() + " | bash",
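
A note on the wrapper template above: the spooled Kerberos credential cache lives in the HTCondor job scratch directory ($PWD on the execute node), which the singularity invocation makes available as /srv inside the container, so KRB5CCNAME must be rewritten to the container-side path. The shell line SINGULARITY_KRB5CCNAME="FILE:/srv/$(basename $KRB5CCNAME)" performs that rewrite; the hypothetical Python helper below shows the same computation purely as an illustration.

import os

def container_krb5ccname(host_krb5ccname, container_home="/srv"):
    """Illustrative only (not part of this commit): map a host-side KRB5CCNAME
    such as 'FILE:/pool/condor/dir_1234/krb5cc_reana' to the path it will have
    once the job scratch directory is mounted at container_home."""
    path = host_krb5ccname.split(":", 1)[-1]  # drop an optional 'FILE:' prefix
    return "FILE:{}".format(os.path.join(container_home, os.path.basename(path)))

print(container_krb5ccname("FILE:/pool/condor/dir_1234/krb5cc_reana"))
# -> FILE:/srv/krb5cc_reana
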
@@ -253,15 +257,15 @@ def _copy_wrapper_file(self):
def _submit(self, job_ad):
"""Execute submission transaction."""
ads = []
schedd = HTCondorJobManagerCERN._get_schedd()
schedd, credd = HTCondorJobManagerCERN._get_schedd()
logging.info("Submiting job - {}".format(job_ad))
clusterid = schedd.submit(job_ad, 1, True, ads)
HTCondorJobManagerCERN._spool_input(ads)
return clusterid

@retry(stop_max_attempt_number=MAX_NUM_RETRIES, wait_fixed=RETRY_WAIT_TIME)
def _spool_input(ads):
schedd = HTCondorJobManagerCERN._get_schedd()
schedd, credd = HTCondorJobManagerCERN._get_schedd()
logging.info("Spooling job inputs - {}".format(ads))
schedd.spool(ads)

@@ -274,12 +278,19 @@ def _get_schedd():
thread_local, "MONITOR_THREAD_SCHEDD", htcondor.Schedd() # noqa: F821
)
logging.info("Getting schedd: {}".format(thread_local.MONITOR_THREAD_SCHEDD))
return thread_local.MONITOR_THREAD_SCHEDD
credd = getattr(thread_local, "MONITOR_THREAD_CREDD", None)
if credd is None:
setattr(
thread_local, "MONITOR_THREAD_CREDD", htcondor.Credd() # noqa: F821
)
thread_local.MONITOR_THREAD_CREDD.add_user_cred(htcondor.CredTypes.Kerberos, None)
logging.info("Getting credd: {}".format(thread_local.MONITOR_THREAD_CREDD))
return thread_local.MONITOR_THREAD_SCHEDD, thread_local.MONITOR_THREAD_CREDD

def stop(backend_job_id):
"""Stop HTCondor job execution."""
try:
schedd = HTCondorJobManagerCERN._get_schedd()
schedd, credd = HTCondorJobManagerCERN._get_schedd()
schedd.act(
htcondor.JobAction.Remove, # noqa: F821
"ClusterId=={}".format(backend_job_id),
@@ -290,7 +301,7 @@ def stop(backend_job_id):
@retry(stop_max_attempt_number=MAX_NUM_RETRIES, wait_fixed=RETRY_WAIT_TIME)
def spool_output(backend_job_id):
"""Transfer job output."""
schedd = HTCondorJobManagerCERN._get_schedd()
schedd, credd = HTCondorJobManagerCERN._get_schedd()
logging.info("Spooling jobs {} output.".format(backend_job_id))
schedd.retrieve("ClusterId == {}".format(backend_job_id))

@@ -316,7 +327,7 @@ def get_logs(backend_job_id, workspace):

def find_job_in_history(backend_job_id):
"""Return job if present in condor history."""
schedd = HTCondorJobManagerCERN._get_schedd()
schedd, credd = HTCondorJobManagerCERN._get_schedd()
ads = ["ClusterId", "JobStatus", "ExitCode", "RemoveReason"]
condor_it = schedd.history(
"ClusterId == {0}".format(backend_job_id), ads, match=1
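
find_job_in_history() above looks a finished job up in the schedd history. As a self-contained illustration of the same call pattern, assuming the HTCondor Python bindings (this is not REANA's code):

import htcondor

def find_job_in_history_sketch(backend_job_id):
    """Illustrative sketch: return the history ad of a finished job, or None."""
    projection = ["ClusterId", "JobStatus", "ExitCode", "RemoveReason"]
    schedd = htcondor.Schedd()
    for ad in schedd.history(
        "ClusterId == {}".format(int(backend_job_id)), projection, match=1
    ):
        return ad
    return None
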
2 changes: 1 addition & 1 deletion reana_job_controller/job_monitor.py
@@ -528,7 +528,7 @@ def query_condor_jobs(app, backend_job_ids):
ads = ["ClusterId", "JobStatus", "ExitCode", "ExitStatus", "HoldReasonCode"]
query = format_condor_job_que_query(backend_job_ids)
htcondorcern_job_manager_cls = COMPUTE_BACKENDS["htcondorcern"]()
schedd = htcondorcern_job_manager_cls._get_schedd()
schedd, credd = htcondorcern_job_manager_cls._get_schedd()
logging.info("Querying jobs {}".format(backend_job_ids))
condor_jobs = schedd.xquery(requirements=query, projection=ads)
return condor_jobs
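
The monitor hunk above restricts the query with a requirements expression built by format_condor_job_que_query(), which is not shown in this diff; presumably it ORs together ClusterId constraints. A hedged sketch of the whole query, with that assumption made explicit and assuming the HTCondor Python bindings:

import htcondor

def query_condor_jobs_sketch(backend_job_ids):
    """Illustrative sketch (not REANA's code): fetch status ads for a set of
    HTCondor cluster ids."""
    ads = ["ClusterId", "JobStatus", "ExitCode", "ExitStatus", "HoldReasonCode"]
    # Assumption: format_condor_job_que_query() builds an expression like this.
    requirements = " || ".join(
        "ClusterId == {}".format(int(job_id)) for job_id in backend_job_ids
    )
    schedd = htcondor.Schedd()
    return list(schedd.xquery(requirements=requirements, projection=ads))
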
